91 Finds and converts troublesome characters into aliases in a string
92 ignore: list of characters to not extract such as ["|",","]
93 Returns new string. Starts with notok: if error occurs
101 for k,v
in char_aliases_reverse.items():
102 if k
not in ignore
and k
in result:
103 result=result.replace(k, v)
104 except (RuntimeError,ValueError,OSError)
as err:
105 result=
"notok:" + str(err)
249 Decompose quoted record line.
250 line_in: string data record
251 delim: name of delimiter (comma, pipe, tab, colon)
252 Returns list of parsed values. If error, 0th entry starts with notok:
270 delim_char= delim_get_char(delim)
273 if len(delim_char)==0
or delim_char.startswith(
"notok:"):
274 raise ValueError(
"incorrect delim:" + delim)
275 dq_delim= dq + delim_char
277 while len(cur_line)>0:
282 nqf= cur_line.find(dq)
284 if -1 < nqf < len(cur_line)-1:
285 nqb= cur_line.find(dq,nqf+1)
286 if delim_char
in cur_line:
287 ndelim= cur_line.find(delim_char)
288 nqdelim= cur_line.find(dq_delim)
290 if cur_line==delim_char:
293 elif len(cur_line)> 0:
294 cur_line= cur_line[1:]
301 elif ndelim==len(cur_line)-1:
302 cur_str=cur_line[:ndelim]
306 cur_str=cur_line[:ndelim]
307 cur_line=cur_line[ndelim+1:]
310 cur_str=cur_line[:ndelim]
311 cur_line=cur_line[ndelim+1:]
315 cur_str=cur_line[nqf:nqdelim]
316 cur_line=cur_line[nqdelim+2:]
321 cur_str=cur_line[:nqdelim]
322 cur_line=cur_line[nqdelim+2:]
324 cur_str=cur_line[nqf:nqb]
325 cur_line=cur_line[nqb+1:]
328 cur_str=cur_line[:ndelim]
329 cur_line=cur_line[ndelim+1:]
331 cur_str=cur_line[nqf+1:]
334 cur_str=cur_str.replace(dq,
'')
339 cur_str=cur_str.replace(dq,
'')
341 out_rec.append(cur_str)
345 except (RuntimeError,ValueError)
as err:
347 out_rec.append(
"notok:" + str(err))
352 Detect Value Datatype
354 Detect a value's data type by looking for known patterns of
355 characters and evaluating likely datatype.
356 Returns datatype or starts with notok: if error
365 raise ValueError(
"no value supplied")
366 strin=strin.lower().strip()
369 if strin
in [
"true",
"false"]:
371 elif strin
in [
"0",
"-0"]:
373 elif strin.startswith(
"00"):
375 elif any(x
in strin
for x
in [
"e+",
"e-",
"e"]):
376 txt= numfuncs.get_value_from_suspect_exp(strin)
377 if len(txt)>0
and txt!=strin
and not txt.startswith(
"notok:"):
384 elif 8<= len(strin) <=30
and (
"/" in strin
or "-" in strin):
385 if strin.startswith(
"/")
or strin.startswith(
"-"):
391 txt1=txt[txt.find(
"t")+1:].strip()
392 txt=txt[:txt.find(
"t")].strip()
396 txt2= datefuncs.convert_date_to_iso(txt,
"",
True)
397 if txt2.startswith(
"notok:")
or len(txt2)<8:
399 elif txt2.startswith(
"00"):
401 elif txt2[4:6]==
"00" or txt2[6:8]==
"00":
403 elif txt2.endswith(
")")
and "(" in txt2:
404 txt2=txt2[txt2.find(
"(")+1:-1]
408 txt1=txt1.replace(
":",
"")
410 txt1=txt1[:txt1.find(
"+")].strip()
412 txt1=txt1[:txt1.find(
"-")].strip()
414 txt1=txt1[:txt1.find(
" ")].strip()
415 if len(txt1)>0
and numfuncs.is_real(txt1):
425 elif "." in strin
and len(strin)>1:
426 if strin.startswith(
"."):
428 if "." not in txt
and numfuncs.is_real(txt):
432 elif numfuncs.is_real(strin):
436 elif numfuncs.is_int(strin):
441 except (RuntimeError,OSError, ValueError)
as err:
442 dtype=
"notok:" + str(err)
447 Uses list of distribution of detected datatypes for each fld to determine the most likely
448 datatype appropriate to assign to it. This uses threshhold settings and knowledge from
449 curated data sets across multiple domains and data systems.
451 datatype_dist_fields: list of dict objects with keys [string, int, real, date, bool, empty]
452 and for each values = number of instances for each fld.
453 This should come from results of analyzequality.do_qualityinspect()
454 settings: dict with keys for various settings including
455 - include_empty: bool whether to include number of empty values in statistical calculation. Default is True
456 - minfrac: real number minimum threshhold in either percentage (any value great than 1) or fraction (0-1). Default is 0.75
458 returns: string list with datatypes (string, int, real, date, bool) per fld (or empty if cannot be determined).
459 0th entry will start with notok: if an error occurs
464 if datatype_dist_fields
is None or not isinstance(datatype_dist_fields, list)
or len(datatype_dist_fields)==0:
465 raise ValueError(
"datatype_dist_fields is not a list")
466 for i
in range(len(datatype_dist_fields)):
467 if datatype_dist_fields[i]
is None or not isinstance(datatype_dist_fields[i], dict):
468 raise ValueError(
"datatype_dist at index " + str(i) +
" is not a dict")
469 elif "int" not in datatype_dist_fields[i]
or not isinstance(datatype_dist_fields[i][
"int"], int):
470 raise ValueError(
"datatype_dist at index " + str(i) +
" missing int key")
471 elif "real" not in datatype_dist_fields[i]
or not isinstance(datatype_dist_fields[i][
"real"], int):
472 raise ValueError(
"datatype_dist at index " + str(i) +
" missing real key")
473 elif "bool" not in datatype_dist_fields[i]
or not isinstance(datatype_dist_fields[i][
"bool"], int):
474 raise ValueError(
"datatype_dist at index " + str(i) +
" missing bool key")
475 elif "date" not in datatype_dist_fields[i]
or not isinstance(datatype_dist_fields[i][
"date"], int):
476 raise ValueError(
"datatype_dist at index " + str(i) +
" missing date key")
477 elif "string" not in datatype_dist_fields[i]
or not isinstance(datatype_dist_fields[i][
"string"], int):
478 raise ValueError(
"datatype_dist at index " + str(i) +
" missing string key")
479 elif "empty" not in datatype_dist_fields[i]
or not isinstance(datatype_dist_fields[i][
"empty"], int):
480 raise ValueError(
"datatype_dist at index " + str(i) +
" missing empty key")
481 for i
in range(len(datatype_dist_fields)):
482 field_dtypes.append(assign_datatype(datatype_dist_fields[i],settings))
483 except (RuntimeError, OSError, ValueError)
as err:
484 field_dtypes.insert(0,
"notok:" + str(err))
489 Uses distribution of detected datatypes for a fld to determine the most likely
490 datatype appropriate to assign to it. This uses threshhold settings and knowledge from
491 curated data sets across multiple domains and data systems.
493 datatype_dist: dict object with keys [string, int, real, date, bool, empty]
494 and for each values = number of instances. This should come from results of analyzequality.do_qualityinspect()
495 settings: dict with keys for various settings including
496 - include_empty: bool whether to include number of empty values in statistical calculation. Default is True
497 - minfrac: real number minimum threshhold in either percentage (any value great than 1) or fraction (0-1). Default is 0.75
499 returns: string with datatype (string, int, real, date, bool) or empty if cannot be determined. will start with notok: if an error occurs
510 ntotal_not_empty:int=0
517 if datatype_dist
is None or not isinstance(datatype_dist, dict):
518 raise ValueError(
"datatype_dist is not a dict")
519 if not settings
is None and isinstance(settings, dict):
520 if "include_empty" in settings
and settings[
"include_empty"].lower() ==
"false":
522 if "minfrac" in settings:
523 minfrac= numfuncs.is_real_get(settings[
"minfrac"],
"number")
524 if 1< minfrac <= 100:
526 elif 0>= minfrac
or minfrac>100:
528 if "int" not in datatype_dist
or not isinstance(datatype_dist[
"int"], int):
529 raise ValueError(
"datatype_dist missing int key")
530 elif "real" not in datatype_dist
or not isinstance(datatype_dist[
"real"], int):
531 raise ValueError(
"datatype_dist missing real key")
532 elif "bool" not in datatype_dist
or not isinstance(datatype_dist[
"bool"], int):
533 raise ValueError(
"datatype_dist missing bool key")
534 elif "date" not in datatype_dist
or not isinstance(datatype_dist[
"date"], int):
535 raise ValueError(
"datatype_dist missing date key")
536 elif "string" not in datatype_dist
or not isinstance(datatype_dist[
"string"], int):
537 raise ValueError(
"datatype_dist missing string key")
538 elif "empty" not in datatype_dist
or not isinstance(datatype_dist[
"empty"], int):
539 raise ValueError(
"datatype_dist missing empty key")
540 dint=datatype_dist[
"int"]
if datatype_dist[
"int"]>0
else 0
541 dreal=datatype_dist[
"real"]
if datatype_dist[
"real"]>0
else 0
542 dbool=datatype_dist[
"bool"]
if datatype_dist[
"bool"]>0
else 0
543 ddate=datatype_dist[
"date"]
if datatype_dist[
"date"]>0
else 0
544 dstr=datatype_dist[
"string"]
if datatype_dist[
"string"]>0
else 0
545 dempty=datatype_dist[
"empty"]
if datatype_dist[
"empty"]>0
else 0
546 ntotal_not_empty= dint + dreal + dbool + ddate + dstr
547 ntotal= ntotal_not_empty + dempty
549 raise ValueError(
"no counts of any datatype")
552 dmin = math.floor(minfrac * ntotal)
553 if dempty>=dmax_count:
557 if ntotal_not_empty==0:
558 raise ValueError(
"no counts of any datatype excluding empty values")
559 dmin = math.floor(minfrac * ntotal_not_empty)
564 if dreal>=dmax_count:
567 if dbool>=dmax_count:
570 if ddate>=dmax_count:
578 if (dint + dreal)>= dmin:
580 dmax_count= dint + dreal
581 if dmax_count<dmin
and inc_empty:
583 if (ddate + dempty)>= dmin:
585 dmax_count= ddate + dempty
586 if dmax_count<dmin
and inc_empty:
588 if (dstr + dempty)>= dmin:
590 dmax_count= dstr + dempty
593 except (RuntimeError,OSError, ValueError)
as err:
594 dtype=
"notok:" + str(err)
599 """IsFieldItsDatatype
600 Determines if a field's value is in its specified datatype
602 dtype: field's defined datatype (int, real, bool, date, string)
603 fieldval: field value
604 datefmt: date format if checking for a date
610 if dtype
is None or fieldval
is None:
611 raise ValueError(
"missing dtype or fieldval since = None")
616 if numfuncs.is_real(fieldval):
619 if numfuncs.is_int(fieldval):
621 elif dtype.startswith(
"date"):
624 elif dtype==
"datetime":
625 result= datefuncs.is_iso_date_str(fieldval,
True).startswith(
"true")
627 result= datefuncs.is_iso_date_str(fieldval,
False).startswith(
"true")
629 result= datefuncs.is_date_format(fieldval, datefmt)
631 result = fieldval.lower()
in (
"true",
"false")
632 elif dtype==
"string":
634 except (OSError, RuntimeError, ValueError)
as err:
636 print(
"ERROR:" + str(err) +
"\n")
642 Determines if field value conforms to its defined format (if set)
644 fieldVal: field value to check
646 allow_empty: bool whether empty values (e.g null) are allowed
647 returns: string as bool:message with bool =(true,false) and message= reason. If error, starts with notok:message
659 raise ValueError(
"missing fieldval since = None")
661 raise ValueError(
"missing field since = None")
663 raise ValueError(
"field is not Field object")
664 fld.datatype= fld.datatype.lower()
665 fld.fmt_strcase= fld.fmt_strcase.lower()
666 if len(fieldval.strip())==0:
669 elif fld.datatype.startswith(
"date"):
670 if len(fld.fmt_date)>0:
676 msg=
"missing date format"
677 elif fld.datatype==
"bool":
682 elif fld.datatype==
"string":
685 if fld.fmt_strlen>0
and len(fieldval)!=fld.fmt_strlen:
687 msg=
"faillength(" + str(len(fieldval)) +
"/" + str(fld.fmt_strlen) +
")"
688 if fld.fmt_strcase
in (
"upper",
"lower"):
689 if fld.fmt_strcase==
"upper":
690 txt= fieldval.upper()
691 elif fld.fmt_strcase==
"lower":
692 txt= fieldval.lower()
697 msg +=
"failcase(" + fld.fmt_strcase +
")"
698 if not flag
and not flag1:
700 elif fld.datatype
in (
"int",
"real"):
701 if fld.datatype==
"int":
702 flag= numfuncs.is_int(fieldval)
704 txt= numfuncs.is_real_get(fieldval,
"string",
False)
705 flag=
not txt.startswith(
"false")
708 elif fld.datatype==
"real" and fld.fmt_decimal>0:
711 n1= len(txt)-1 - txt.index(
".")
712 if n1 != fld.fmt_decimal:
713 msg=
"faildecimal(" + str(n1) +
"/" + str(fld.fmt_decimal) +
")"
718 outmsg= str(result).lower()
721 except (OSError, RuntimeError, ValueError)
as err:
722 outmsg=
"notok:" + str(err)