VerityPy 1.1
Python library for Verity data profiling, quality control, remediation
refinedata.py
Go to the documentation of this file.
#!/usr/bin/env python
"""
Refinery to process analysis and transforms for full data set
"""


__all__ = ['do_refine']
__version__ = "1.0"
__author__ = "Geoffrey Malafsky"
__email__ = "gmalafsky@technikinterlytics.com"
__date__ = "20240715"


from ..transforms import transform, lookup
from . import recfuncs, field, exectransform, numfuncs, datefuncs

# Double-quote character: marks quoted field values that may embed the delimiter.
DQ:str= "\""
# Line feed character.
LF:str="\n"
def normalize_value(outfield, fldval:str) -> str:
    """
    Apply the datatype and format rules of one output Field to a string value.

    outfield- output Field object whose datatype/fmt_* attributes have already been
    normalized by do_refine (fmt_strcase in upper/lower/empty, fmt_strcut and
    fmt_strpad in front/back, fmt_strpadchar a literal character).
    fldval- current string value for the field.

    RETURNS the normalized string value:
    - string: cut or pad to fmt_strlen (cut/pad side per fmt_strcut/fmt_strpad),
      then apply fmt_strcase
    - int: strip any decimal part; non-numeric values become "0"
    - real: non-numeric values become "0.00"; otherwise fmt_decimal>=0 fixes the
      number of decimal digits
    - bool: anything other than "true" (case-insensitive) becomes "false"
    - date: value failing the fmt_date check becomes empty string
    """
    if outfield.datatype=="string":
        if outfield.fmt_strlen>0:
            n1= len(fldval) - outfield.fmt_strlen
            if n1>0:
                # too long: remove characters from the specified side
                if outfield.fmt_strcut=="front":
                    fldval= fldval[n1:]
                else:
                    fldval= fldval[:-n1]
            elif n1<0:
                # too short: add pad characters on the specified side.
                # "front" means characters are added at the front of the string,
                # which is str.rjust (the original code had ljust/rjust swapped)
                if outfield.fmt_strpad=="front":
                    fldval= fldval.rjust(outfield.fmt_strlen, outfield.fmt_strpadchar)
                else:
                    fldval= fldval.ljust(outfield.fmt_strlen, outfield.fmt_strpadchar)
        if outfield.fmt_strcase=="upper":
            fldval= fldval.upper()
        elif outfield.fmt_strcase=="lower":
            fldval= fldval.lower()
    elif outfield.datatype=="int":
        if "." in fldval:
            # drop any decimal part before the integer check
            fldval= fldval[:fldval.find(".")]
        if len(fldval)==0:
            fldval="0"
        else:
            fldval= numfuncs.is_int_get(fldval, "string", True)
            if fldval.startswith("false:"):
                fldval="0"
    elif outfield.datatype=="real":
        if len(fldval)==0:
            fldval="0.00"
        else:
            fldval= numfuncs.is_real_get(fldval, "string", True)
            if fldval.startswith("false:"):
                fldval="0.00"
            elif outfield.fmt_decimal>=0:
                # split into whole and fractional digit strings
                if "." in fldval:
                    txt1= fldval[fldval.find(".")+1:]
                    txt= fldval[:fldval.find(".")]
                else:
                    txt= fldval
                    txt1= ""
                n1= len(txt1) - outfield.fmt_decimal
                if outfield.fmt_decimal==0:
                    txt1= ""
                elif n1>0:
                    # too many decimal digits: truncate from the end
                    txt1= txt1[:-n1]
                elif n1<0:
                    # too few decimal digits: append zeros. ljust is required;
                    # rjust would prepend zeros and change the numeric value
                    txt1= txt1.ljust(outfield.fmt_decimal, "0")
                fldval= txt + "." + txt1
    elif outfield.datatype=="bool":
        fldval= "true" if fldval.lower()=="true" else "false"
    elif outfield.datatype=="date":
        if len(outfield.fmt_date)>0:
            if not datefuncs.is_date_format(fldval, outfield.fmt_date):
                fldval=""
    return fldval


def do_refine(outfields:list, transforms:list, settings:dict, lookup_dicts:list, srcfields:list, srcrecs:list) -> list:
    """
    Refines a set of data records to correct, normalize, and enrich

    outfields-list of Field objects for output fields that will be assigned values in each output record.
    As Field objects, each can specify rules for datatype and format that will be applied after transforms
    (if defined for the field) or passing through original value (if no transform defined).
    - title: field name
    - datatype: int, real, bool, date, string. For date, there should be an entry specifying the format otherwise it is set to ISO yyyyMMdd
    - fmt_strlen: integer number of characters (>0) if a fixed size is required. Ignored if < 0
    - fmt_strcase: (upper, lower, empty)
    - fmt_strcut: (front, back, empty). Side to cut characters from if it is larger than specified fmt_strlen. Default is back.
    - fmt_strpad: (front, back, empty). Side to add characters to if it is smaller than specified fmt_strlen. Default is back.
    - fmt_strpadchar: single character or character alias (-space-, -fslash-, -bslash-, -tab-). Character to add if needed to make too small string meet specified fmt_strlen. Default is _
    - fmt_decimal: number of decimal digits (0-N). Ignored if < 0
    - fmt_date:
      + without time part- yyyymmdd, yymmdd, yyyyddmm, yyddmm, mmddyyyy, mmddyy, ddmmyyyy, ddmmyy,
      + (mmm= month abbreviation like Jan) yyyymmmdd, yyyyddmmm, ddmmmyyyy, ddmmmyy
      + (month= full month name like January) yyyymonthdd, yyyyddmonth, ddmonthyyyy, ddmonthyy
      + with time part: suffix to above date formats as- Thhmmss, Thhmm, Thh like mmddyyyyThhmm for 11282024T1407 or 11/28/2024T14:07
      + (S is space) Shhmmss, Shhmm, Shh like 11-28-2024 14:07
      + with time zone: if time zone is required at end of time part add suffix Z like mmddyyyyThhmmZ 11282024T1407

    transforms-list of transform objects

    settings-dictionary with various settings. required:
    delim- (delimiter for parsing records) as comma, tab, pipe, caret, hyphen
    delim_out- optional specified delimiter for output records (default is to use delim) as comma, tab, pipe, caret, hyphen
    is_quoted - bool whether some field values are enclosed in double quotes to allow delimiter within the field value. Default is True.
    has_header - bool whether first used line is delimited field titles. Default is true
    use_comment - bool whether to use comment lines (start with # or //) or ignore them. Default is False so they are ignored.
    normalize- bool whether to apply datatype and format rules to output field value. Default is true.
    embed_delim- new character(s) for replacing delim when a field contains delim. Default is a space.

    lookup_dicts-list of LookUpDict objects. These should be made from files or arrays prior to invoking this method

    srcfields-list of field objects in order correlated to input records when parsed using delimiter specified in settings

    srcrecs-list of strings each of which is one input record. Default is to ignore empty lines and those
    beginning with # or // as comments. This can be overidden with the setting useComments.
    If the setting hasHeader is True (which is default) then the first used line must be a delimited line of field titles.

    RETURNS outrecs as list of Refined data records including header of delimited fields names. If error, 0th entry will start with notok:
    """

    delim:str=""
    delimchar:str=""
    delimout:str=""
    delimoutchar:str=""
    embed_delim:str=" "
    hashdr:bool=True
    usecomment:bool=False
    isquoted:bool=True
    normalize:bool=True
    lrec:int=0   # count of used (non-ignored) lines including header
    lcmt:int=0   # diagnostic count of comment lines
    lemp:int=0   # diagnostic count of empty lines
    outrecs:list=[]
    hash_lookups:dict={}            # lookup title (lower) -> index in lookup_dicts
    hash_srcfields:dict={}          # source title (lower) -> index in srcfields
    hash_outfields:dict={}          # output title (lower) -> index in outfields
    hash_out_to_src:dict={}         # output index -> source index, or -1 if unmapped
    hash_src_to_out:dict={}         # source index -> output index
    hash_out_to_transform:dict={}   # output index -> index in transforms
    try:
        if outfields is None or not isinstance(outfields, list):
            raise ValueError("no outfields")
        if srcfields is None or not isinstance(srcfields, list):
            raise ValueError("no srcfields")
        if srcrecs is None or not isinstance(srcrecs, list):
            raise ValueError("no srcrecs")
        if settings is None or not isinstance(settings, dict):
            settings={}
        if lookup_dicts is None or not isinstance(lookup_dicts, list):
            lookup_dicts=[]
        if transforms is None or not isinstance(transforms, list):
            transforms=[]
        # read settings: keys matched case-insensitively, values compared lowercased
        for key in settings:
            txt= key.lower()
            txt1= str(settings[key]).lower()
            if txt in ("isquoted","is_quoted"):
                isquoted = txt1=="true"
            elif txt in ("hasheader","has_header"):
                hashdr = txt1=="true"
            elif txt in ("use_comment","usecomment"):
                usecomment = txt1=="true"
            elif txt=="normalize":
                normalize = txt1=="true"
            elif txt=="delim":
                delim= txt1
                delimchar= recfuncs.delim_get_char(delim)
            elif txt in ("delim_out","delimout"):
                delimout= txt1
                delimoutchar= recfuncs.delim_get_char(delimout)
            elif txt in ("embed_delim","embeddelim"):
                embed_delim= txt1 if len(txt1)>0 else " "

        if len(delim)==0:
            raise ValueError("no delim specified")
        if len(delimchar)==0:
            raise ValueError("unrecognized delim: " + delim)
        if len(outfields)==0:
            raise ValueError("no output fields specified")
        if len(srcfields)==0:
            raise ValueError("no source fields specified")
        if len(srcrecs)==0:
            raise ValueError("no data records supplied")

        # output delimiter falls back to the input delimiter
        if len(delimout)==0 or len(delimoutchar)==0:
            delimoutchar= delimchar

        # validate lookup dictionaries and index them by lowercased title
        for i, lkp in enumerate(lookup_dicts):
            if not isinstance(lkp, lookup.LookUpDict):
                raise ValueError("lookup_dict " + str(i) + " is not a LookUpDict object")
            txt= lkp.title.lower().strip()
            if len(txt)==0:
                raise ValueError("lookup_dict " + str(i) + " has empty title")
            if txt in hash_lookups:
                raise ValueError("lookup_dict " + str(i) + " has duplicate title")
            hash_lookups[txt]=i

        # validate source fields and index them by lowercased title
        for i, sfld in enumerate(srcfields):
            if not isinstance(sfld, field.Field):
                raise ValueError("source field " + str(i) + " is not a Field object")
            txt= sfld.title.lower().strip()
            if len(txt)==0:
                raise ValueError("source field " + str(i) + " has empty title")
            if txt in hash_srcfields:
                raise ValueError("source field " + str(i) + " has duplicate title")
            hash_srcfields[txt]=i
            # normalize recognized parse_error_action aliases; any other string is
            # kept unchanged and used as a literal replacement value
            txt= sfld.parse_error_action.lower().strip()
            if txt in ["-use-","-ignore-",""]:
                sfld.parse_error_action=txt

        # validate output fields, map them to source fields, normalize format rules
        for i, ofld in enumerate(outfields):
            if not isinstance(ofld, field.Field):
                raise ValueError("output field " + str(i) + " is not a Field object")
            txt= ofld.title.lower().strip()
            if len(txt)==0:
                raise ValueError("output field " + str(i) + " has empty title")
            if txt in hash_outfields:
                raise ValueError("output field " + str(i) + " has duplicate title")
            hash_outfields[txt]=i
            # map output to source by same title first, then by explicit mapto
            txt1= ofld.mapto.lower().strip()
            if txt in hash_srcfields:
                hash_out_to_src[i]= hash_srcfields[txt]
                hash_src_to_out[hash_srcfields[txt]]=i
            elif txt1 in hash_srcfields:
                hash_out_to_src[i]= hash_srcfields[txt1]
                hash_src_to_out[hash_srcfields[txt1]]=i
            else:
                hash_out_to_src[i]=-1
            # normalize format settings so normalize_value can rely on them
            txt= ofld.fmt_strcase.lower().strip()
            ofld.fmt_strcase = txt if txt in ("upper","lower") else ""
            txt= ofld.fmt_strcut.lower().strip()
            ofld.fmt_strcut = "front" if txt in ("front","left","start") else "back"
            txt= ofld.fmt_strpad.lower().strip()
            ofld.fmt_strpad = "front" if txt in ("front","left","start") else "back"
            txt= ofld.fmt_strpadchar.lower().strip()
            if len(txt)==0:
                ofld.fmt_strpadchar="_"
            elif txt in ("space","-space-"):
                ofld.fmt_strpadchar=" "
            elif txt in ("fslash","-fslash-"):
                ofld.fmt_strpadchar="/"
            elif txt in ("bslash","-bslash-"):
                ofld.fmt_strpadchar="\\"
            elif txt in ("tab","-tab-"):
                ofld.fmt_strpadchar="\t"

        # validate transforms, link them to output fields (directly by title or via
        # a mapped source field), and check lookup ops reference known LookUpDicts
        for i, trf in enumerate(transforms):
            if not isinstance(trf, transform.Transform):
                raise ValueError("transforms " + str(i) + " is not a Transform object")
            txt= trf.title.lower().strip()
            if len(txt)==0:
                raise ValueError("transforms " + str(i) + " has empty title")
            if txt in hash_outfields:
                hash_out_to_transform[hash_outfields[txt]]=i
            elif txt in hash_srcfields:
                if hash_srcfields[txt] in hash_src_to_out:
                    hash_out_to_transform[hash_src_to_out[hash_srcfields[txt]]]=i
            for j, op in enumerate(trf.ops):
                if op.title.lower().startswith("lookup"):
                    if op.param1.lower() not in hash_lookups:
                        raise ValueError("unknown LookUpDict specified in transform " + str(i) + ", op " + str(j) + ": " + op.param1)

        # header record of output field titles
        outrecs.append(delimoutchar.join(f.title for f in outfields))

        for linein in srcrecs:
            if len(linein.strip())==0:
                lemp += 1
                continue
            txt= linein.lstrip()
            iscomment= txt.startswith("#") or txt.startswith("//")
            if iscomment:
                lcmt += 1
                if not usecomment:
                    continue
            lrec += 1
            if lrec==1 and hashdr:
                # first used line is the source header; nothing to output for it
                continue
            if isquoted and DQ in linein:
                src_values= recfuncs.split_quoted_line(linein, delimchar)
                if len(src_values)>0 and src_values[0].startswith("notok:"):
                    raise ValueError("error splitting quoted line #rec=" + str(lrec) + ": " + src_values[0][6:])
            else:
                src_values= linein.split(delimchar)

            outvals:list=[]
            for ifld in range(len(outfields)):
                fldval=""
                nf= hash_out_to_src[ifld]
                doparseerror=True
                if nf>=0:
                    if nf< len(src_values):
                        fldval= src_values[nf]
                    elif srcfields[nf].parse_error_action=="-ignore-":
                        doparseerror= False
                    elif srcfields[nf].parse_error_action=="-use-" or len(srcfields[nf].parse_error_action)==0:
                        fldval="" # there is a parse error so we keep value empty and proceed to transforms and normalize
                    else:
                        fldval= srcfields[nf].parse_error_action # string value to assign
                        doparseerror= False
                # else: no mapped source, fldval stays empty

                if ifld in hash_out_to_transform and (nf< len(src_values) or doparseerror):
                    nt= hash_out_to_transform[ifld]
                    fldval= exectransform.do_transform(transforms[nt], fldval, outfields[ifld].datatype,
                                                      hash_srcfields,
                                                      src_values,
                                                      lookup_dicts,
                                                      hash_lookups
                                                      )

                if fldval.startswith("notok:"):
                    raise ValueError(fldval[6:])
                # normalize if set to do so and either not a parsing error (no source field or has source value) or set to do parsing error
                if normalize and (nf<0 or nf< len(src_values) or doparseerror):
                    fldval= normalize_value(outfields[ifld], fldval)

                # check for embedded delimiter so the output record stays parseable
                if delimoutchar in fldval:
                    fldval= fldval.replace(delimoutchar, embed_delim)
                outvals.append(fldval)

            outrecs.append(delimoutchar.join(outvals))

    except (RuntimeError, OSError, ValueError) as err:
        outrecs.insert(0,"notok:" + str(err))
    return outrecs
list do_refine(list outfields, list transforms, dict settings, list lookup_dicts, list srcfields, list srcrecs)
Definition refinedata.py:21