VerityPy 1.1
Python library for Verity data profiling, quality control, remediation
analyzequality.py
Go to the documentation of this file.
1#!/usr/bin/env python
2"""
3Analyze Quality
4
5Performs deep inspection of data records supplied as List of strings
6which must be delimited. The delimiter should be specified in a dict supplied
7as settings in call to function. Various results are returned in a
8QualityAnalysis object.
9"""
10
11__all__ = ['do_qualityinspect','detect_parsing_error','qc_fields']
12__version__ = '1.0'
13__author__ = 'Geoffrey Malafsky'
14__email__ = 'gmalafsky@technikinterlytics.com'
15__date__ = '20240723'
16
17import copy
18from . import recfuncs, datefuncs, numfuncs, qualityanalysis, field
19
20
# module-level character constants used when scanning records
DQ:str= "\""  # double-quote; marks enclosed field values when 'is_quoted' is set
LF:str="\n"   # line feed
23
def _add_covalues(covalues:list, report:qualityanalysis.QualityAnalysis, hash_temp:dict) -> None:
    """
    Parse covalue specifications and register them on the report.

    Each entry is 'field1,field2' or 'field1,field2,field3' (case-insensitive).
    Entries without a comma are ignored; duplicate specifications (same normalized
    name) are registered only once via the hash_temp lookup which maps the
    normalized covalue name to its index in report.covalues.

    covalues: list of covalue spec strings
    report: QualityAnalysis being built; covalues and covalue_uniqvals are appended
    hash_temp: dict of already-registered covalue names -> index (mutated here)

    Raises ValueError if a referenced field title is not in report.hash_fields.
    """
    for cov in covalues:
        if "," not in cov:
            continue
        # reset per entry so a prior 3-field covalue cannot leak its third index
        # into a following 2-field covalue (bug in original)
        nfld3:int=-1
        n1:int= cov.find(",")
        txt:str= cov[:n1].lower().strip()
        txt1:str= cov[(n1+1):].lower().strip()
        txt2:str=""
        n1= txt1.find(",")
        if n1>0:
            txt2= txt1[(n1+1):].strip()
            txt1= txt1[:n1].strip()
            nflds:int=3
        else:
            nflds=2

        fldname:str= txt + "," + txt1
        if nflds==3:
            fldname += "," + txt2

        if fldname in hash_temp:
            continue
        if txt not in report.hash_fields:
            raise ValueError("covalue field " + txt + " is not in record fields")
        nfld1:int= report.hash_fields[txt]
        if txt1 not in report.hash_fields:
            raise ValueError("covalue field " + txt1 + " is not in record fields")
        nfld2:int= report.hash_fields[txt1]
        if nflds==3:
            if txt2 not in report.hash_fields:
                raise ValueError("covalue field " + txt2 + " is not in record fields")
            nfld3= report.hash_fields[txt2]

        report.covalues.append(field.CoValue(fldname))
        n1= len(report.covalues)-1
        report.covalues[n1].field1=txt
        report.covalues[n1].field2=txt1
        report.covalues[n1].field3=txt2
        report.covalues[n1].field1_index= nfld1
        report.covalues[n1].field2_index= nfld2
        report.covalues[n1].field3_index= nfld3
        report.covalues[n1].numfields= nflds

        report.covalue_uniqvals.append({})
        hash_temp[fldname]= n1


def _apply_fmt_penalty(f1:float, nerr:int, ntotal:int) -> float:
    """
    Reduce quality factor f1 based on the fraction of format errors.

    f1: current quality factor (0-100)
    nerr: number of instances with this format error
    ntotal: total number of value instances for the field (must be > 0)

    Returns the reduced quality factor. The reduction ladder is the original
    fixed schedule: the higher the error fraction, the larger the divisor.
    """
    f2:float= 100- round(100*nerr/ntotal,1)
    if f2<= 50:
        f1 /= 2
    elif f2<= 75:
        f1 /= 1.5
    elif f2<= 85:
        f1 /= 1.3
    elif f2<= 95:
        f1 /= 1.1
    elif f2< 99.9:
        f1 /= 1.05
    return f1


def do_qualityinspect(fields:list=None, covalues:list=None, recs:list=None, settings:dict=None) -> qualityanalysis.QualityAnalysis:
    """
    Do Quality Inspect. Performs deep inspection of data records to discover and assess
    a variety of structure, syntax, and semantic problems and inconsistencies.
    Field information can either be supplied with the 'fields' parameter or extracted from a header line in
    records. If using 'fields', there can also be specified datatypes and formats per field which will be
    used to detect errors when values do not meet these rules.
    fields: list of field objects with attributes-
        title: field name
        datatype: int, real, bool, date, string. For date,
            there should be an entry in field.fmt_date specifying the date format otherwise it is set to ISO yyyyMMdd
        fmt_strlen: integer number of characters (>0) if a fixed size is required. Ignored if < 0
        fmt_strcase: (upper, lower, empty)
        fmt_strcut: (front, back, empty). Used in Refining records. Side to cut characters from if it is larger than specified fmt_strlen. Default is back.
        fmt_strpad: (front, back, empty). Used in Refining records. Side to add characters to if it is smaller than specified fmt_strlen. Default is back.
        fmt_strpadchar: single character or character alias (-space-, -fslash-, -bslash-, -tab-). Used in Refining records. Character to add if needed to make too small string meet specified fmt_strlen. Default is _
        fmt_decimal: number of decimal digits (0-N). Ignored if < 0
        fmt_date: without time part- yyyymmdd, yymmdd, yyyyddmm, yyddmm, mmddyyyy, mmddyy, ddmmyyyy, ddmmyy
            (mmm= month abbreviation like Jan) yyyymmmdd, yyyyddmmm, ddmmmyyyy, ddmmmyy
            (month= full month name like January) yyyymonthdd, yyyyddmonth, ddmonthyyyy, ddmonthyy
            with time part: suffix to above date formats as (T=letter T, S=space)- Thhmmss, Thhmm, Thh,
            Shhmmss, Shhmm, Shh like mmddyyyyThhmm for 11282024T1407 or 11/28/2024T14:07 or 11-28-2024 14:07
            with time zone: if time zone is required at end of time part add suffix Z like mmddyyyyThhmmZ 11282024T1407
    covalues: optional list of field titles (2-3) for joint value analysis with each list entry as field1,field2 and optionally with ,field3
    recs: list of records. The delimiter should be specified in the settings object ('delim'= comma,pipe,tab,colon)
    settings: dictionary object with entries for options to use in inspection. Includes:
        delim: record delimiter (comma,pipe,tab,colon). Default is comma.
        is_case_sens: is case sensitive (true,false). Default is false.
        is_quoted: field values may be enclosed (allows delimiter within) by double quotes (true, false). Default is false.
        maxuv: optional. string of integer value that is maximum number of unique values per field
            to collect. Default is 50 and set to default if supplied value <1 or >1000
        extract_fields: bool whether to read in field titles from header line (first non-comment, non-empty line).
            Default is False. If True then has_header must also be True, and submitted 'fields' list will only be
            used to copy its datatype and formatting to the report field object. Thus, you can extract field
            titles from data set and still define characteristics if desired. If not, ensure 'fields' is empty.
        has_header: bool whether has header line in file. Default is True. Must be True if extract_fields is True
    Returns report as a QualityAnalysis class instance. On handled error, report.status starts with 'notok:'.
    """

    hash_temp:dict={}
    # FIX: report was referenced throughout but never instantiated
    report:qualityanalysis.QualityAnalysis= qualityanalysis.QualityAnalysis()
    linein:str=""
    txt:str=""
    txt1:str=""
    fldname:str=""
    fldval:str=""
    fldval2:str=""
    fldval3:str=""
    nrec:int=-1
    nfld1:int=-1
    nfld2:int=-1
    nfld3:int=-1
    n1:int=-1
    f1:float=0
    f2:float=0
    rec_has_err:bool=False
    rec_has_dt_err:bool=False
    rec_has_fmt_err:bool=False
    field_values:list=[]
    tempint:dict={}
    tempbool:dict={}
    hash_srcfields:dict={}
    try:
        if recs is None or len(recs)==0:
            raise ValueError("no records")

        # normalize optional params so later len()/iteration is safe when None supplied
        if fields is None:
            fields=[]
        if covalues is None:
            covalues=[]

        if settings is not None and isinstance(settings, dict):
            for k,v in settings.items():
                txt= k.lower()
                txt1= str(v).lower()
                if txt in ("is_case_sens","iscasesens"):
                    report.is_case_sens= txt1=="true"
                elif txt in ("isquoted","is_quoted"):
                    report.is_quoted= txt1=="true"
                elif txt in ("hasheader","has_header"):
                    report.has_header= txt1 != "false"
                elif txt in ("extract_fields","extractfields"):
                    report.extract_fields= txt1=="true"
                elif txt=="delim":
                    if len(txt1)==0:
                        raise ValueError("empty delim specified")
                    report.delim= txt1
                    report.delim_char= recfuncs.delim_get_char(report.delim)
                elif txt=="maxuv":
                    report.maxuv= numfuncs.is_int_get(txt1,"number")
                    # FIX: original '1> report.maxuv > 1000' chained comparison was never True
                    if report.maxuv<1 or report.maxuv>1000:
                        report.maxuv=50

        if report.extract_fields and not report.has_header:
            raise ValueError("extract_fields is True which requires a header line but has_header is False")

        if len(fields)==0 and not report.extract_fields:
            raise ValueError("no fields")

        if report.delim_char.startswith("false") or len(report.delim_char)==0:
            raise ValueError("no delim in settings")

        # validate supplied field objects and normalize their datatypes
        for i in range(len(fields)):
            if isinstance(fields[i], field.Field):
                txt= fields[i].title.lower().strip()
                if len(txt)==0:
                    raise ValueError("empty field title at index " + str(i))
                if txt in hash_srcfields:
                    raise ValueError("duplicate field:" + txt)
                hash_srcfields[txt]=i
                txt= fields[i].datatype.strip()
                if txt.startswith("int"):
                    txt1="int"
                elif txt in ["real","float"]:
                    txt1="real"
                elif txt.startswith("date"):
                    txt1="date"
                elif txt.startswith("bool"):
                    txt1="bool"
                elif len(txt)>0:
                    txt1="string"
                else:
                    # FIX: reset so an empty datatype does not inherit the prior field's txt1
                    txt1=""
                fields[i].datatype=txt1 # normalize
            else:
                raise ValueError("source field is not a Field object at index=" + str(i))

        if not report.extract_fields:
            # copy supplied fields into the report and initialize per-field collectors
            for i in range(len(fields)):
                if isinstance(fields[i], field.Field):
                    txt=fields[i].title.lower().strip()
                    if txt in report.hash_fields:
                        raise ValueError("duplicate field:" + txt)
                    report.fields.append(field.Field(fields[i].title.strip()))
                    n1= len(report.fields)-1
                    report.hash_fields[txt]= n1
                    report.field_names_lower.append(txt)
                    report.fields[n1].datatype= fields[i].datatype
                    report.fields[n1].fmt_strcase= fields[i].fmt_strcase
                    report.fields[n1].fmt_strlen= fields[i].fmt_strlen
                    report.fields[n1].fmt_strcut= fields[i].fmt_strcut
                    report.fields[n1].fmt_strpad= fields[i].fmt_strpad
                    report.fields[n1].fmt_strpadchar= fields[i].fmt_strpadchar
                    report.fields[n1].fmt_decimal= fields[i].fmt_decimal
                    report.fields[n1].fmt_date= fields[i].fmt_date
                    report.fields[n1].mapto= fields[i].mapto

                    report.field_datatype_dist.append({'int':0,'real':0,'date':0,'bool':0,'string':0,'empty':0})
                    report.field_uniqvals.append({})
                    report.spec_char_dist_field.append({})
                    report.field_quality.append("")
                else:
                    raise ValueError("field entry is not a Field object at index=" + str(i))

            _add_covalues(covalues, report, hash_temp)

        nrec=0
        for iloop in range(len(recs)):
            rec_has_err=False
            rec_has_dt_err=False
            rec_has_fmt_err=False

            linein=recs[iloop]
            txt=linein.strip()
            # skip empty lines and comment lines (# or //)
            if len(txt)>0 and not txt.startswith("#") and not txt.startswith("//"):
                nrec +=1
                if report.is_quoted and DQ in linein:
                    field_values=recfuncs.split_quoted_line(linein, report.delim_char)
                    if len(field_values)>0 and field_values[0].startswith("notok:"):
                        raise RuntimeError("error splitting quoted string: " + field_values[0][6:])
                else:
                    field_values=linein.split(report.delim_char)

                if nrec==1 and report.extract_fields:
                    # build report fields from the header line, copying formatting
                    # from any same-titled field supplied in 'fields'
                    for i in range(len(field_values)):
                        txt=field_values[i].lower().strip()
                        if len(txt)==0:
                            raise ValueError("empty extracted field title at index " + str(i))
                        elif txt in report.hash_fields:
                            # FIX: original used report.fields[i] which does not exist yet (IndexError)
                            raise ValueError("duplicate extracted field:" + field_values[i].strip())
                        report.field_names_lower.append(txt)
                        report.fields.append(field.Field(field_values[i].strip()))
                        report.hash_fields[txt]=i
                        if txt in hash_srcfields:
                            n1=hash_srcfields[txt]
                            report.fields[i].datatype= fields[n1].datatype
                            report.fields[i].fmt_strcase= fields[n1].fmt_strcase
                            report.fields[i].fmt_strlen= fields[n1].fmt_strlen
                            report.fields[i].fmt_strcut= fields[n1].fmt_strcut
                            report.fields[i].fmt_strpad= fields[n1].fmt_strpad
                            report.fields[i].fmt_strpadchar= fields[n1].fmt_strpadchar
                            report.fields[i].fmt_decimal= fields[n1].fmt_decimal
                            report.fields[i].fmt_date= fields[n1].fmt_date
                            report.fields[i].mapto= fields[n1].mapto

                        report.field_datatype_dist.append({'int':0,'real':0,'date':0,'bool':0,'string':0,'empty':0})
                        report.field_uniqvals.append({})
                        report.spec_char_dist_field.append({})
                        report.field_quality.append("")

                    if len(report.fields)==0:
                        raise ValueError("no fields from extracted header (delim=" + report.delim_char + ") line:" + linein)

                    _add_covalues(covalues, report, hash_temp)
                elif nrec>1 or not report.has_header:
                    # data record: collect record-size distribution (capped at 100 buckets)
                    txt=str(len(linein))
                    if txt in report.rec_size_dist:
                        report.rec_size_dist[txt] +=1
                    elif len(report.rec_size_dist)<100:
                        report.rec_size_dist[txt] =1
                    report.numrecs += 1

                    # check for parsing errors with number of parsed fields
                    txt= detect_parsing_error(linein, field_values, report, nrec)
                    if txt.startswith("notok:"):
                        raise ValueError("error detecting parsing: " + txt[6:])
                    if "rec_has_err=true" in txt:
                        rec_has_err=True

                    # go through fields for datatypes, formats, unique values
                    txt= qc_fields(linein, field_values, report, nrec)
                    if txt.startswith("notok:"):
                        raise ValueError("error doing qc: " + txt[6:])
                    if "rec_has_fmt_err=true" in txt:
                        rec_has_fmt_err=True
                    if "rec_has_dt_err=true" in txt:
                        rec_has_dt_err=True

                    if rec_has_dt_err:
                        rec_has_err=True
                        report.err_stats["numrecs_err_datatype"] += 1
                    if rec_has_fmt_err:
                        rec_has_err=True
                        report.err_stats["numrecs_err_fmt"] += 1
                    if rec_has_err:
                        report.err_stats["numrecs_err"] += 1

                    # get covalue data
                    for jloop in range(len(report.covalues)):
                        nfld1=report.covalues[jloop].field1_index
                        nfld2=report.covalues[jloop].field2_index
                        nfld3=report.covalues[jloop].field3_index
                        fldval=""
                        fldval2=""
                        fldval3=""
                        if nfld1<len(field_values):
                            fldval= field_values[nfld1]
                        if nfld2<len(field_values):
                            fldval2= field_values[nfld2]
                        if 0<=nfld3<len(field_values):
                            fldval3= field_values[nfld3]

                        txt= fldval + "_" + fldval2
                        if nfld3>-1:
                            txt += "_" + fldval3
                        if not report.is_case_sens:
                            txt=txt.lower()
                        # cap collected unique covalues at report.maxuv; overflow goes to -other-
                        if txt not in report.covalue_uniqvals[jloop] and len(report.covalue_uniqvals[jloop])>=report.maxuv:
                            txt="-other-"
                        if txt not in report.covalue_uniqvals[jloop]:
                            report.covalue_uniqvals[jloop][txt]=0
                        report.covalue_uniqvals[jloop][txt] +=1

        # sort unique values by descending count and change to list of (value,count) tuples
        # FIX: iterate report.fields (not the 'fields' parameter) so extracted fields are included
        for i in range(len(report.fields)):
            report.field_uniqvals[i]=sorted(report.field_uniqvals[i].items(), key=lambda x:x[1], reverse=True)

        for i in range(len(report.covalues)):
            report.covalue_uniqvals[i]=sorted(report.covalue_uniqvals[i].items(), key=lambda x:x[1], reverse=True)

        # compute field quality (string percent 0-100 per field)
        # FIX: use report.fields so quality is computed in extract_fields mode too
        for i in range(len(report.fields)):
            tempint={"totalinst":0,"dterr":0,"dtdist":0,"fmterr":0,"fmtstrcase":0,"fmtstrlen":0,"fmtdate":0,"fmtdec":0,"spchar":0}
            tempbool={"totalinst":False,"dterr":False,"dtdist":False,"fmterr":False,"fmtstrcase":False,"fmtstrlen":False,"fmtdate":False,"fmtdec":False,"spchar":False}
            fldname= report.fields[i].title.lower()
            if fldname in report.err_stats["fields_err_datatype"]:
                tempint["dterr"]= report.err_stats["fields_err_datatype"][fldname]["count"]
                tempbool["dterr"]= True
            if fldname in report.err_stats["fields_err_fmt"]:
                tempint["fmterr"]= report.err_stats["fields_err_fmt"][fldname]["count"]
                tempbool["fmterr"]= True
                for reason in report.err_stats["fields_err_fmt"][fldname]["reasons"]:
                    if report.fields[i].datatype=="string":
                        if "uppercase" in reason or "lowercase" in reason:
                            tempint["fmtstrcase"] += report.err_stats["fields_err_fmt"][fldname]["reasons"][reason]
                            tempbool["fmtstrcase"] = True
                        elif "length" in reason:
                            tempint["fmtstrlen"] += report.err_stats["fields_err_fmt"][fldname]["reasons"][reason]
                            tempbool["fmtstrlen"] = True
                    elif report.fields[i].datatype=="real":
                        if "decimal" in reason:
                            tempint["fmtdec"] += report.err_stats["fields_err_fmt"][fldname]["reasons"][reason]
                            tempbool["fmtdec"] = True
                    elif report.fields[i].datatype=="date":
                        if "format" in reason:
                            tempint["fmtdate"] += report.err_stats["fields_err_fmt"][fldname]["reasons"][reason]
                            tempbool["fmtdate"] = True
            # count values whose detected datatype differs from the declared one
            for dtyp in report.field_datatype_dist[i]:
                if dtyp != report.fields[i].datatype:
                    tempint["dtdist"] += report.field_datatype_dist[i][dtyp]
                    tempbool["dtdist"] = True
            for sc in report.spec_char_dist_field[i]:
                tempint["spchar"] += report.spec_char_dist_field[i][sc]
                tempbool["spchar"]= True
            for j in range(len(report.field_uniqvals[i])):
                tempint["totalinst"] += report.field_uniqvals[i][j][1]
                tempbool["totalinst"]= True
            if tempint["totalinst"]>0:
                f1=100
                if tempint["dterr"]>0 and tempint["dtdist"]>0:
                    # use the larger of explicit datatype errors vs distribution mismatch
                    if tempint["dterr"]> tempint["dtdist"]:
                        f1 = 100- round(100*tempint["dterr"]/tempint["totalinst"],1)
                    else:
                        f1 = 100- round(100*tempint["dtdist"]/tempint["totalinst"],1)
                elif tempint["dterr"]>0:
                    f1 = 100- round(100*tempint["dterr"]/tempint["totalinst"],1)
                elif tempint["dtdist"]>0:
                    f1 = 100- round(100*tempint["dtdist"]/tempint["totalinst"],1)
                if f1>100:
                    f1=100
                elif f1<0:
                    f1=0
                # f1 is base goodness factor. next we reduce it when other issues exist based on datatype
                if report.fields[i].datatype=="real":
                    if tempbool["fmtdec"] and tempint["fmtdec"]>0:
                        f1= _apply_fmt_penalty(f1, tempint["fmtdec"], tempint["totalinst"])
                elif report.fields[i].datatype=="date":
                    if tempbool["fmtdate"] and tempint["fmtdate"]>0:
                        f1= _apply_fmt_penalty(f1, tempint["fmtdate"], tempint["totalinst"])
                elif report.fields[i].datatype=="string":
                    f1=100 # reset since strings can have any datatype per value
                    if tempbool["fmtstrcase"] and tempint["fmtstrcase"]>0:
                        f1= _apply_fmt_penalty(f1, tempint["fmtstrcase"], tempint["totalinst"])
                    if tempbool["fmtstrlen"] and tempint["fmtstrlen"]>0:
                        f1= _apply_fmt_penalty(f1, tempint["fmtstrlen"], tempint["totalinst"])

                # special characters reduce quality with a milder multiplicative ladder
                if tempbool["spchar"] and tempint["spchar"]>0:
                    f2= 100- round(100*tempint["spchar"]/tempint["totalinst"],1)
                    if f2<= 75:
                        f1 *= .85
                    elif f2<= 90:
                        f1 *= .90
                    elif f2< 99:
                        f1 *= .95
                    else:
                        f1 *= .99

                f1 = round(f1,1)
                report.field_quality[i]= str(f1)

    except (RuntimeError, OSError, ValueError, copy.Error) as err:
        report.status="notok:" + str(err)
    return report
494
495
def detect_parsing_error(linein:str, field_values:list, report:qualityanalysis.QualityAnalysis, nrec:int) -> str:
    """
    Compare the number of parsed values against the number of defined fields,
    updating the report's parse-count distribution and parse-error statistics.

    linein: original record line before parsing (stored as an example on error)
    field_values: parsed field values in list
    report: QualityAnalysis updated in place. Uses fields, rec_parse_dist, and
        rec_parse_errs with keys 'small1' (one field short), 'small2' (two or
        more short), 'big' (too many), plus matching '*_recs' example lists
        capped at 50 entries each, stored as (nrec)linein
    nrec: integer current record number

    Returns: empty string if the counts match, 'notok:message' on error,
        '(rec_has_err=true)' when a parse-count mismatch was recorded
    """

    result:str=""
    try:
        nvals:int= len(field_values)
        nexpected:int= len(report.fields)
        # tally the distribution of parsed-field counts
        bucket:str= str(nvals)
        report.rec_parse_dist[bucket]= report.rec_parse_dist.get(bucket, 0) + 1
        # classify any mismatch into one of the three error buckets
        errkey:str=""
        if nvals < nexpected:
            errkey= "small1" if nvals + 1 == nexpected else "small2"
        elif nvals > nexpected:
            errkey= "big"
        if len(errkey)>0:
            report.rec_parse_errs[errkey] += 1
            examples= report.rec_parse_errs[errkey + "_recs"]
            if len(examples)<50:
                examples.append(f"({nrec}){linein}")
            result="(rec_has_err=true)"
    except (RuntimeError, OSError, ValueError, copy.Error) as err:
        result="notok:" + str(err)
    return result
543
544
def qc_fields(linein:str, field_values:list, report:qualityanalysis.QualityAnalysis, nrec:int) -> str:
    """
    Do Quality Control analysis of field values. Report is modified with assessments
    for datatypes, formats, unique values.

    linein: original record line before parsing
    field_values: parsed field values in list
    report: QualityAnalysis object passed by reference so is changed in this function. Inbound
        must have fields property as list of field titles. Results are added to this object's
        field_uniqvals[], fields[], err_stats{}, field_datatype_dist[],
        spec_char_dist[], field_quality[], spec_char_examples[] with latter stored as
        (nline)[comma delimited field:spchar pairs found in this record]linein with nline being the line number read
        (excluding empty and comments lines) and is therefore 1 larger than the line's index
        in the Python list (i.e. nline is 1 based while lists are 0-based).
    nrec: integer current record number

    Returns: string empty if no problems, notok:message if error,
        possibly (rec_has_fmt_err=true) and/or (rec_has_dt_err=true)
    """

    txt:str=""
    msg:str=""
    fldval:str=""
    fldname:str=""
    fld_has_fmt_err:bool=False
    rec_has_dt_err:bool=False
    rec_has_fmt_err:bool=False
    is_fld_empty:bool=False
    spec_chars:list=[]      # 'fieldtitle:charname' pairs found in this record
    err_dt_flds:list=[]     # per-field datatype error examples for this record
    err_fmt_flds:list=[]    # per-field format error examples for this record
    try:
        # iterate defined fields; a field missing from this record is treated as empty
        for jloop in range(len(report.fields)):
            fld_has_fmt_err=False
            is_fld_empty=False
            fldval=""
            fldname= report.field_names_lower[jloop]
            if jloop< len(field_values):
                fldval= field_values[jloop]
            if len(fldval)==0:
                txt="-empty-"
                is_fld_empty=True
            elif not report.is_case_sens:
                txt=fldval.lower()
            else:
                txt=fldval

            # unique value counts, capped at report.maxuv distinct values; overflow -> '-other-'
            if txt not in report.field_uniqvals[jloop] and len(report.field_uniqvals[jloop])>= report.maxuv:
                txt="-other-"
            if txt not in report.field_uniqvals[jloop]:
                report.field_uniqvals[jloop][txt]=0
            report.field_uniqvals[jloop][txt] += 1

            # special chars: scan each character and record notable ones by name
            if len(fldval)>0:
                for kloop in range(len(fldval)):
                    txt=""
                    n1=ord(fldval[kloop:kloop+1])
                    if n1==9:
                        txt="tab"
                    elif n1==33:
                        txt="!"
                    elif n1==34:
                        txt="doublequote"
                    elif n1==35:
                        txt="#"
                    elif n1==60:
                        txt="<"
                    elif n1==62:
                        txt=">"
                    elif n1==91:
                        txt="["
                    elif n1==92:
                        txt="backslash"
                    elif n1==93:
                        txt="]"
                    elif n1==94:
                        txt="^"
                    elif n1==123:
                        txt="{"
                    elif n1==125:
                        txt="}"
                    elif n1==126:
                        txt="~"
                    elif n1<=31 or 127 <= n1 <=255:
                        # control chars and upper ascii reported by code point
                        txt= "ascii_" + str(n1)
                    elif 256 <= n1 <= 65535:
                        txt= "unicode_" + str(n1)

                    if len(txt)>0:
                        spec_chars.append(report.fields[jloop].title +":"+txt)
                        # overall distribution across all fields
                        if txt not in report.spec_char_dist:
                            report.spec_char_dist[txt]=0
                        report.spec_char_dist[txt] += 1
                        # per-field distribution
                        if txt not in report.spec_char_dist_field[jloop]:
                            report.spec_char_dist_field[jloop][txt]=0
                        report.spec_char_dist_field[jloop][txt] += 1

            if is_fld_empty:
                report.field_datatype_dist[jloop]["empty"] += 1

            # check for datatype errors against the field's declared datatype (if any)
            if len(report.fields[jloop].datatype)>0:
                txt=""
                if report.fields[jloop].datatype=="int":
                    if is_fld_empty:
                        txt="false:empty"
                    else:
                        txt=numfuncs.is_int_get(fldval,"string")
                elif report.fields[jloop].datatype=="real":
                    if is_fld_empty:
                        txt="false:empty"
                    else:
                        txt=numfuncs.is_real_get(fldval,"string")
                elif report.fields[jloop].datatype=="bool":
                    if is_fld_empty:
                        txt="false:empty"
                    elif fldval.lower() not in ["true","false"]:
                        txt="false:not true/false"
                if txt.startswith("false"):
                    # value failed the declared datatype; record count, reason, and example
                    reason=""
                    if txt.startswith("false:"):
                        reason= txt[txt.find(":")+1:]
                    rec_has_dt_err=True
                    if fldname not in report.err_stats["fields_err_datatype"]:
                        report.err_stats["fields_err_datatype"][fldname]={'count':0,'reasons':{}}
                    report.err_stats["fields_err_datatype"][fldname]["count"] += 1
                    if len(reason)>0:
                        if reason not in report.err_stats["fields_err_datatype"][fldname]["reasons"]:
                            report.err_stats["fields_err_datatype"][fldname]["reasons"][reason]=0
                        report.err_stats["fields_err_datatype"][fldname]["reasons"][reason] += 1
                    txt= fldval if not is_fld_empty else "-empty-"
                    err_dt_flds.append(f"[{fldname}:{reason}:{txt}]")
                # detect datatype of the actual value for the distribution
                if not is_fld_empty:
                    txt= recfuncs.detect_datatype(fldval)
                    if not txt.startswith("notok:") and len(txt)>0 and txt in report.field_datatype_dist[jloop]:
                        report.field_datatype_dist[jloop][txt] +=1
                    else:
                        # fall back to the declared datatype when detection fails
                        if report.fields[jloop].datatype=="int":
                            report.field_datatype_dist[jloop]["int"] += 1
                        elif report.fields[jloop].datatype=="real":
                            report.field_datatype_dist[jloop]["real"] += 1
                        elif report.fields[jloop].datatype=="date":
                            report.field_datatype_dist[jloop]["date"] += 1
                        elif report.fields[jloop].datatype=="bool":
                            report.field_datatype_dist[jloop]["bool"] += 1
                        elif report.fields[jloop].datatype=="string":
                            report.field_datatype_dist[jloop]["string"] += 1
            elif not is_fld_empty:
                # no declared datatype: detection failure here is a hard error
                txt= recfuncs.detect_datatype(fldval)
                if txt.startswith("notok:"):
                    raise ValueError(txt[6:])
                if len(txt)==0:
                    raise ValueError("no datatype returned for fldval=" + fldval)
                if txt in report.field_datatype_dist[jloop]:
                    report.field_datatype_dist[jloop][txt] +=1
                else:
                    raise ValueError("datatype returned not known: " + txt + ", fldval=" + fldval)

            # check format rules for the declared datatype
            if len(report.fields[jloop].datatype)>0:
                reason=""
                if report.fields[jloop].datatype=="string":
                    if report.fields[jloop].fmt_strcase=="upper" and not fldval.isupper():
                        fld_has_fmt_err=True
                        reason="string not uppercase"
                    elif report.fields[jloop].fmt_strcase=="lower" and not fldval.islower():
                        fld_has_fmt_err=True
                        reason="string not lowercase"

                    if report.fields[jloop].fmt_strlen>0 and len(fldval) != report.fields[jloop].fmt_strlen:
                        fld_has_fmt_err=True
                        reason="string incorrect length"
                elif report.fields[jloop].datatype=="real":
                    if report.fields[jloop].fmt_decimal>=0:
                        # count digits after the decimal point (0 if no point)
                        txt=""
                        if "." in fldval:
                            txt= fldval[(fldval.find(".")+1):]
                        if len(txt) != report.fields[jloop].fmt_decimal:
                            fld_has_fmt_err=True
                            reason="real incorrect decimals"
                elif report.fields[jloop].datatype=="date":
                    if len(report.fields[jloop].fmt_date)>0:
                        if len(fldval)==0:
                            fld_has_fmt_err=True
                            reason="date empty"
                        elif not datefuncs.is_date_format(fldval, report.fields[jloop].fmt_date):
                            fld_has_fmt_err=True
                            reason="date incorrect format"
                if fld_has_fmt_err:
                    rec_has_fmt_err=True
                    if fldname not in report.err_stats["fields_err_fmt"]:
                        report.err_stats["fields_err_fmt"][fldname]={'count':0,'reasons':{}}
                    report.err_stats["fields_err_fmt"][fldname]["count"] += 1
                    if len(reason)>0:
                        if reason not in report.err_stats["fields_err_fmt"][fldname]["reasons"]:
                            report.err_stats["fields_err_fmt"][fldname]["reasons"][reason]=0
                        report.err_stats["fields_err_fmt"][fldname]["reasons"][reason] += 1
                    txt= fldval if not is_fld_empty else "-empty-"
                    err_fmt_flds.append(f"[{fldname}:{reason}:{txt}]")

        # record example lines containing special characters (capped at 50)
        if len(spec_chars)>0 and len(report.spec_char_examples)<50:
            txt="["
            for i in range(len(spec_chars)):
                if i>0:
                    txt += ","
                txt += spec_chars[i]
            txt += "]" + linein
            report.spec_char_examples.append(f"({nrec}){txt}")

        msg=""
        if rec_has_fmt_err:
            msg += "(rec_has_fmt_err=true)"
            if len(report.err_fmt_examples)<50 and len(err_fmt_flds)>0:
                report.err_fmt_examples.append(f"({nrec}){'|'.join(err_fmt_flds)}")
        if rec_has_dt_err:
            msg += "(rec_has_dt_err=true)"
            if len(report.err_datatype_examples)<50 and len(err_dt_flds)>0:
                report.err_datatype_examples.append(f"({nrec}){'|'.join(err_dt_flds)}")

    except (RuntimeError, OSError, ValueError, copy.Error) as err:
        msg="notok:" + str(err)
    return msg
str qc_fields(str linein, list field_values, qualityanalysis.QualityAnalysis report, int nrec)
str detect_parsing_error(str linein, list field_values, qualityanalysis.QualityAnalysis report, int nrec)
qualityanalysis.QualityAnalysis do_qualityinspect(list fields=None, list covalues=None, list recs=None, dict settings=None)