VerityPy 1.1
Python library for Verity data profiling, quality control, remediation
qualityanalysis.py
1"""
2Quality Analysis object and functions
3"""
4
5__all__ = ['QualityAnalysis']
6__version__ = '1.0'
7__author__ = 'Geoffrey Malafsky'
8__email__ = 'gmalafsky@technikinterlytics.com'
9__date__ = '20240810'
10
11from . import recfuncs, field
12
13
14DQ:str= "\""
15LF:str="\n"
16
17
19 """
20 QualityAnalysis
21
22 Object containing settings and results of quality inspection.
23
24 title: name for this object usually the name of the job run to make it
25 status: system assigned value which will be notok:reason if there is an error
26 debug: empty, info, trace
27 delim: name of delimiter to parse source records (comma, tab, pipe, colon, caret). Default comma
28 delim_char: character for delim used in code. Default ,
29 maxuv: maximum number of unique values to collect per field with remainder into category '-other-'. Default=50
30 is_case_sens: bool whether values are case sensitive. Default=False
31 is_quoted: bool whether source records can have quoted values. Default= False
32 has_header: bool whether source records have header line as first non-empty, non-comment line. Default= True
33 extract_fields: bool whether fields should be extracted from header line instead of supplied in 'fields' list. Default= False
34 fields: list of field objects which have attributes for title, datatype, and formatting
35 field_names_lower: list of field titles in lower case
36 hash_fields: dictionary key= field title lower case with value= list index
37 numrecs: integer count of records used
38
39 field_datatype_dist: list of field datatype distributions correlated to fields list.
40 Each field has counts for detected datatypes (int, real, bool, date, string, empty).
41 field_uniqvalues: list correlated to fields. Each entry is a descending sorted list of
42 uniquevalue tuples with each tuple (uv,count) where uv= string of unique value
43 and count= integer number of instances. A maximum number of values are kept (default=50) with additional
44 grouped into -other-
45 field_quality: list correlated to fields. String of an integer 0-100 as a quality metric computed from
46 discovered field characteristics
47 rec_size_dist: dictionary of record sizes (byte lengths) to counts. Max 100 sizes.
48 rec_parse_errs: dictionary of parsing errors (number fields after parsing
49 relative to defined fields) by type as small1 (1 too few fields), small2
50 (2 or more missing fields), big (1 or more too many fields). Also, has
51 keys for lists of example records small1_recs, small2_recs,
52 big_recs (each max 50).
53 rec_parse_dist: dictionary of number of parsed fields to count
54 spec_char_dist: dictionary of special characters and their counts. Special
55 characters are (some use aliases as dictionary keys):
56 tab, !, doublequote, #, <, >, [, ], backslash, ^, {, }, ~, ascii_[0-31, 127-255],
57 unicode_[256-65535]
58 spec_char_dist_field: list correlated to fields[] with each being a dictionary
59 of special characters to their counts for that specific field. Same organization
60 as in spec_char_dist
61 spec_char_examples: list of examples of discovered special characters. Each entry is
62 (nline)[sp char list]record with nline being the number line read from source data
63 (excluding empty and comments lines) and is therefore 1 larger than the line's index
64 in the Python list (i.e. nline is 1 based while lists are 0-based).
65 [sp char list] comma delimited string of each special character
66 found in the record such as [spchar1,spchar2]lineIn. A single field can have
67 more than 1 special character. For example, input line (pipe delimited) as record line
68 #5 (although actual file line number could be larger due to comments and empty lines) and data =
69 !dog|{House}|123^456 will be stored as an example as
70 (5)[!,{,},^]!dog|{House}|123^456
71 covalues: list of CoValue objects to collect unique value information.
72 covalue_uniqvalues: correlated to covalues array. Similar to field unique values.
73 err_stats: dictionary of properties and counts.
74 numrecs_err: number records with any kind of error
75 numrecs_err_datatype: number records with datatype error
76 numrecs_err_fmt: number records with format error
77 fields_err_datatype: dictionary of fields with datatype errors and counts
78 fields_err_fmt: dictionary of fields with format errors and counts
79 err_datatype_examples: list of delimited fields within records with datatype errors.
80 Syntax is: (nline)[fieldinfo]|[fieldinfo]..... where [fieldinfo] is
81 fieldTitle:reason:fieldValue . fieldValue will be set to -empty- if the actual value is
82 empty. nline is the number line read from source data
83 (excluding empty and comments lines) and is therefore 1 larger than the line's index
84 in the Python list (i.e. nline is 1 based while lists are 0-based).
85 err_fmt_examples: list of delimited fields within records with format errors.
86 Syntax is: (nline)[fieldinfo]|[fieldinfo]..... where [fieldinfo] is
87 fieldTitle:reason:fieldValue . fieldValue will be set to -empty- if the actual value is
88 empty. nline is the number line read from source data
89 (excluding empty and comments lines) and is therefore 1 larger than the line's index
90 in the Python list (i.e. nline is 1 based while lists are 0-based).
91 """
92
93 def __init__(self):
94 self.title=""
95 self.status=""
96 self.debug=""
97 self.delim="comma"
98 self.delim_char=","
99 self.maxuv=50
100 self.is_case_sens=False
101 self.is_quoted=False
102 self.has_header=True
103 self.extract_fields=False
104 self.fields=[]
112 'small1':0,
113 'small2':0,
114 'big':0,
115 'small1_recs':[],
116 'small2_recs':[],
117 'big_recs':[],
118 }
123 self.covalues=[]
125 self.numrecs=0
127 'numrecs_err':0,
128 'numrecs_err_datatype':0,
129 'numrecs_err_fmt':0,
130 'fields_err_datatype':{},
131 'fields_err_fmt':{},
132 }
135
136
    def get_json(self, add_lf:bool=False):
        """
        Constructs an array of JSON strings for the components of this object
        (see the usage sketch at the end of this file).

        add_lf: if True then a line feed is added at the end of each entry. This is
            unnecessary if the returned array is printed one line per entry.
        Returns a string list whose first entry starts with notok: if there is an error
        """

        outline:str=""
        outarray:list=[]
        c:str=","
        qcom:str= DQ + c
        qcq:str= DQ + ":" + DQ
        qc:str= DQ + ":"
        lb:str="{"
        lbq:str= lb + DQ
        clb:str= c + lb
        clbq:str= c + lb + DQ
        rb:str="}"
        qrb:str= DQ + rb
        ls:str="["
        rs:str="]"
        n1:int=-1
        n2:int=-1
        txt:str=""
        try:
            outarray.append(lbq+"report" + qc + ls)
            outarray.append(lbq+"title"+qcq+self.title+qrb)
            outarray.append(clbq+"status"+qcq+self.status+qrb)
            outarray.append(clbq+"debug"+qcq+self.debug+qrb)
            outarray.append(clbq+"numrecs"+qcq+str(self.numrecs)+qrb)
            outarray.append(clbq+"maxuv"+qcq+str(self.maxuv)+qrb)
            outarray.append(clbq+"delim"+qcq+self.delim+qrb)
            outarray.append(clbq+"delim_char"+qcq+self.delim_char+qrb)
            outarray.append(clbq+"is_case_sens"+qcq+str(self.is_case_sens).lower()+qrb)
            outarray.append(clbq+"is_quoted"+qcq+str(self.is_quoted).lower()+qrb)
            outarray.append(clbq+"has_header"+qcq+str(self.has_header).lower()+qrb)
            outarray.append(clbq+"extract_fields"+qcq+str(self.extract_fields).lower()+qrb)

            outarray.append(clbq+"fields"+qc+ls)
            for i in range(len(self.fields)):
                if isinstance(self.fields[i], field.Field):
                    outline=""
                    if i>0:
                        outline=c
                    outline += DQ + self.fields[i].title + DQ
                    outarray.append(outline)
            outarray.append(rs)
            outarray.append(rb)

            outarray.append(clbq+"field_datatypes"+qc+ls)
            for i in range(len(self.fields)):
                if isinstance(self.fields[i], field.Field):
                    outline=""
                    if i>0:
                        outline=c
                    outline += DQ + self.fields[i].datatype + DQ
                    outarray.append(outline)
            outarray.append(rs)
            outarray.append(rb)

            outarray.append(clbq+"field_formats"+qc+ls)
            for i in range(len(self.fields)):
                if isinstance(self.fields[i], field.Field):
                    outline=""
                    if i>0:
                        outline=c
                    outline += lb
                    outline+=DQ+"strcase"+qcq+ self.fields[i].fmt_strcase +DQ
                    outline+=c+DQ+"strlen"+qcq+str(self.fields[i].fmt_strlen)+DQ
                    outline+=c+DQ+"decimal"+qcq+str(self.fields[i].fmt_decimal)+DQ
                    outline+=c+DQ+"date"+qcq+self.fields[i].fmt_date+DQ
                    outline+=c+DQ+"strcut"+qcq+str(self.fields[i].fmt_strcut)+DQ
                    outline+=c+DQ+"strpad"+qcq+str(self.fields[i].fmt_strpad)+DQ
                    outline+=c+DQ+"strpadchar"+qcq+str(self.fields[i].fmt_strpadchar)+DQ
                    outline += rb
                    outarray.append(outline)
            outarray.append(rs)
            outarray.append(rb)

            outarray.append(clbq+"field_quality"+qc+ls)
            for i in range(len(self.fields)):
                outline=""
                if i>0:
                    outline=c
                txt=""
                if i< len(self.field_quality):
                    txt=self.field_quality[i]
                outline += DQ + txt + DQ
                outarray.append(outline)
            outarray.append(rs)
            outarray.append(rb)

            outarray.append(clbq+"field_datatype_dists"+qc+ls)
            for i in range(len(self.fields)):
                if i>0:
                    outline= clb
                else:
                    outline=lb
                if i< len(self.field_datatype_dist):
                    n1=0
                    for k,v in self.field_datatype_dist[i].items():
                        n1 += 1
                        if n1>1:
                            outline += c
                        outline += DQ + k + qc + str(v)
                outline += rb
                outarray.append(outline)
            outarray.append(rs)
            outarray.append(rb)

            outarray.append(clbq+"field_uniqvalues"+qc+ls)
            n1=0
            for i in range(len(self.fields)):
                if isinstance(self.fields[i], field.Field) and len(self.field_uniqvals[i])>0:
                    n1 += 1
                    outline=""
                    if n1>1:
                        outline += c
                    outline += lbq + "field" + qcq + self.fields[i].title +qcom
                    outarray.append(outline)
                    outline= DQ + "uniqvalues" + qc + ls
                    outarray.append(outline)
                    for j in range(len(self.field_uniqvals[i])):
                        outline=""
                        if j>0:
                            outline=c
                        txt= recfuncs.extract_char_aliases(self.field_uniqvals[i][j][0],["|","/"])
                        outline += lbq + "uniqvalue" + qcq + txt + qcom + DQ + "count" + qc + str(self.field_uniqvals[i][j][1]) + rb
                        outarray.append(outline)
                    outarray.append(rs)
                    outarray.append(rb)
            outarray.append(rs)
            outarray.append(rb)

            outarray.append(clbq+"covalues"+qc+ls)
            for i in range(len(self.covalues)):
                outline=""
                if i>0:
                    outline=c
                outline += self.covalues[i].get_json()
                outarray.append(outline)
            outarray.append(rs)
            outarray.append(rb)

            outarray.append(clbq+"covalue_uniqvalues"+qc+ls)
            n1=0
            for i in range(len(self.covalues)):
                if len(self.covalue_uniqvals[i])>0:
                    n1 += 1
                    outline=""
                    if n1>1:
                        outline += c
                    outline += lbq + "covalue" + qcq + self.covalues[i].title + qcom
                    outarray.append(outline)
                    outline= DQ + "uniqvalues" + qc+ls
                    outarray.append(outline)
                    for j in range(len(self.covalue_uniqvals[i])):
                        outline=""
                        if j>0:
                            outline=c
                        txt= recfuncs.extract_char_aliases(self.covalue_uniqvals[i][j][0],["|","/"])
                        outline += lbq + "uniqvalue" + qcq + txt + qcom
                        outline += DQ + "count" + qc + str(self.covalue_uniqvals[i][j][1]) + rb
                        outarray.append(outline)
                    outarray.append(rs)
                    outarray.append(rb)
            outarray.append(rs)
            outarray.append(rb)

            outarray.append(clbq+"field_spchar_dists"+qc+ls)
            for i in range(len(self.fields)):
                outline=""
                if i>0:
                    outline += c
                outline += lbq + "field" + qcq + self.fields[i].title +qc+ls
                outarray.append(outline)
                n1=0
                if i< len(self.spec_char_dist_field):
                    for k,v in self.spec_char_dist_field[i].items():
                        outline=""
                        n1 += 1
                        if n1>1:
                            outline += c
                        outline += lbq + k + qc + str(v) + rb
                        outarray.append(outline)
                outarray.append(rs)
                outarray.append(rb)
            outarray.append(rs)
            outarray.append(rb)

            outarray.append(clbq+"spec_char_dists"+qc+ls)
            n1=0
            for k,v in self.spec_char_dist.items():
                n1 += 1
                outline=""
                if n1>1:
                    outline=c
                outline += lbq + k + qc + str(v) + rb
                outarray.append(outline)
            outarray.append(rs)
            outarray.append(rb)

            outarray.append(clbq+"spec_char_examples"+qc+ls)
            for i in range(len(self.spec_char_examples)):
                outline=""
                if i>0:
                    outline=c
                txt=self.spec_char_examples[i]
                txt1=""
                if txt.find("[")>-1 and txt.find("]")>-1:
                    txt1=txt[:txt.find("]")+1]
                    txt=txt[txt.find("]")+1:]
                txt= recfuncs.extract_char_aliases(txt,[",","|","/","(",")"])
                if len(txt1)>0:
                    outline += lbq + "example" + qc + DQ + txt1 + DQ
                    outline += c+ DQ + "rec" + qcq + txt + qrb
                    outarray.append(outline)
            outarray.append(rs)
            outarray.append(rb)

            # err_stats
            outarray.append(clbq+"err_stats"+qc+ls)
            outarray.append(lbq+"numrecs_err"+qc+str(self.err_stats["numrecs_err"])+rb)
            outarray.append(clbq+"numrecs_err_datatype"+qc+str(self.err_stats["numrecs_err_datatype"])+rb)
            outarray.append(clbq+"numrecs_err_fmt"+qc+str(self.err_stats["numrecs_err_fmt"])+rb)
            outarray.append(clbq+"fields_err_datatype"+qc+ls)
            n1=0
            for f,b in self.err_stats["fields_err_datatype"].items():
                n1 += 1
                outline=""
                if n1>1:
                    outline=c
                outline += lbq + "field" + qcq + str(f) + qcom
                outline += DQ + "count" + qc + str(b["count"]) + c
                outarray.append(outline)
                outarray.append(DQ+"reasons"+qc + ls)
                n2=0
                for r,n in b["reasons"].items():
                    n2 += 1
                    outline=""
                    if n2>1:
                        outline=c
                    outline += lbq + "reason" + qcq + str(r) + qcom
                    outline += DQ + "count" + qc + str(n) + rb
                    outarray.append(outline)
                outarray.append(rs)
                outarray.append(rb)
            outarray.append(rs)
            outarray.append(rb)

            outarray.append(clbq+"fields_err_fmt"+qc+ls)
            n1=0
            for f,b in self.err_stats["fields_err_fmt"].items():
                n1 += 1
                outline=""
                if n1>1:
                    outline=c
                outline += lbq + "field" + qcq + str(f) + qcom
                outline += DQ + "count" + qc + str(b["count"]) + c
                outarray.append(outline)
                outarray.append(DQ+"reasons"+qc + ls)
                n2=0
                for r,n in b["reasons"].items():
                    n2 += 1
                    outline=""
                    if n2>1:
                        outline=c
                    outline += lbq + "reason" + qcq + str(r) + qcom
                    outline += DQ + "count" + qc + str(n) + rb
                    outarray.append(outline)
                outarray.append(rs)
                outarray.append(rb)
            outarray.append(rs)
            outarray.append(rb)

            outarray.append(rs)
            outarray.append(rb)
            # end err_stats

            outarray.append(clbq+"err_datatype_examples"+qc+ls)
            for i in range(len(self.err_datatype_examples)):
                outline=""
                if i>0:
                    outline=c
                txt=self.err_datatype_examples[i]
                txt1=""
                if txt.startswith("("):
                    txt1=txt[1:txt.find(")")]
                    txt=txt[txt.find(")")+1:]
                txt= recfuncs.extract_char_aliases(txt,[",","|","/","(",")","[","]"])
                if len(txt1)>0:
                    outline += lbq + "nline" + qc + txt1
                    outline += c+ DQ + "rec" + qcq + txt + qrb
                    outarray.append(outline)
            outarray.append(rs)
            outarray.append(rb)

            outarray.append(clbq+"err_fmt_examples"+qc+ls)
            for i in range(len(self.err_fmt_examples)):
                outline=""
                if i>0:
                    outline=c
                txt=self.err_fmt_examples[i]
                txt1=""
                if txt.startswith("("):
                    txt1=txt[1:txt.find(")")]
                    txt=txt[txt.find(")")+1:]
                txt= recfuncs.extract_char_aliases(txt,[",","|","/","(",")","[","]"])
                if len(txt1)>0:
                    outline += lbq + "nline" + qc + txt1
                    outline += c+ DQ + "rec" + qcq + txt + qrb
                    outarray.append(outline)
            outarray.append(rs)
            outarray.append(rb)

            outarray.append(clbq+"rec_size_dist"+qc+ls)
            n1=0
            for k,v in self.rec_size_dist.items():
                n1 += 1
                outline=""
                if n1>1:
                    outline=c
                outline += lbq + k + qc + str(v) + rb
                outarray.append(outline)
            outarray.append(rs)
            outarray.append(rb)

            outarray.append(clbq+"rec_parse_dist"+qc+ls)
            n1=0
            for k,v in self.rec_parse_dist.items():
                n1 += 1
                outline=""
                if n1>1:
                    outline=c
                outline += lbq + k + qc + str(v) + rb
                outarray.append(outline)
            outarray.append(rs)
            outarray.append(rb)

            outarray.append(clbq+"rec_parse_errs"+qc+ls)
            outarray.append(lbq + "small1" + qc + str(self.rec_parse_errs["small1"]) + rb)
            outarray.append(clbq + "small2" + qc + str(self.rec_parse_errs["small2"]) + rb)
            outarray.append(clbq + "big" + qc + str(self.rec_parse_errs["big"]) + rb)
            outarray.append(rs)
            outarray.append(rb)

            outarray.append(clbq+"rec_parse_errs_examples"+qc+ls)
            n1=0
            if len(self.rec_parse_errs["small1_recs"])>0:
                n1 += 1
                outarray.append(lbq + "small1" + qc + ls)
                for i in range(len(self.rec_parse_errs["small1_recs"])):
                    outline=""
                    if i>0:
                        outline=c
                    txt= recfuncs.extract_char_aliases(self.rec_parse_errs["small1_recs"][i],[","," ","|","/","(",")"])
                    outline += lbq + "record" + qcq + txt + qrb
                    outarray.append(outline)
                outarray.append(rs)
                outarray.append(rb)
            if len(self.rec_parse_errs["small2_recs"])>0:
                n1 += 1
                txt=""
                if n1>1:
                    txt=c
                outarray.append(txt+lbq + "small2" + qc + ls)
                for i in range(len(self.rec_parse_errs["small2_recs"])):
                    outline=""
                    if i>0:
                        outline=c
                    txt= recfuncs.extract_char_aliases(self.rec_parse_errs["small2_recs"][i],[","," ","|","/","(",")"])
                    outline += lbq + "record" + qcq + txt + qrb
                    outarray.append(outline)
                outarray.append(rs)
                outarray.append(rb)
            if len(self.rec_parse_errs["big_recs"])>0:
                n1 += 1
                txt=""
                if n1>1:
                    txt=c
                outarray.append(txt+lbq + "big" + qc + ls)
                for i in range(len(self.rec_parse_errs["big_recs"])):
                    outline=""
                    if i>0:
                        outline=c
                    txt= recfuncs.extract_char_aliases(self.rec_parse_errs["big_recs"][i],[","," ","|","/","(",")"])
                    outline += lbq + "record" + qcq + txt + qrb
                    outarray.append(outline)
                outarray.append(rs)
                outarray.append(rb)
            outarray.append(rs)
            outarray.append(rb)

            # final closing
            outarray.append(rs)
            outarray.append(rb)

            if add_lf:
                for i in range(len(outarray)):
                    outarray[i] += LF
        except (RuntimeError, OSError, ValueError) as err:
            outarray.insert(0,"notok:" + str(err))
        return outarray
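
# Usage sketch (not part of the library): a minimal, hand-populated example showing the
# report structure that get_json() emits. In normal use these attributes are filled in by
# VerityPy's analysis routines rather than set directly, and the values below (job title,
# field name "acctid", counts) are purely illustrative. Because this module uses relative
# imports, run it in its package context (e.g. python -m <package>.qualityanalysis).
if __name__ == "__main__":
    qa = QualityAnalysis()
    qa.title = "sample_job"
    qa.delim = "pipe"
    qa.delim_char = "|"
    qa.numrecs = 1250
    # record-level statistics: record sizes (bytes) to counts, parsed-field counts to counts
    qa.rec_size_dist = {"120": 1100, "134": 150}
    qa.rec_parse_dist = {"12": 1245, "11": 5}
    qa.rec_parse_errs["small1"] = 5
    # special characters discovered across all records (alias keys as described above)
    qa.spec_char_dist = {"caret": 3, "unicode_256": 1}
    # err_stats nests per-field dicts as {"count": n, "reasons": {reason: n}}
    qa.err_stats["numrecs_err"] = 2
    qa.err_stats["numrecs_err_datatype"] = 2
    qa.err_stats["fields_err_datatype"]["acctid"] = {"count": 2, "reasons": {"not int": 2}}
    # get_json() returns a list of JSON fragments; join them to assemble the report text
    print("".join(qa.get_json(add_lf=False)))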