def do_refine(outfields:list, transforms:list, settings:dict, lookup_dicts:list, srcfields:list, srcrecs:list) -> list:
    """
    Refines a set of data records to correct, normalize, and enrich their field values.

    outfields- list of Field objects for the output fields that will be assigned values in each output record.
        As Field objects, each can specify rules for datatype and format that are applied after the field's
        transform (if one is defined) or after passing the original value through (if no transform is defined).
        Per-field attributes:
        - datatype: int, real, bool, date, string. For date, there should be an entry specifying the format, otherwise it is set to ISO yyyyMMdd
        - fmt_strlen: integer number of characters (>0) if a fixed size is required. Ignored if < 0
        - fmt_strcase: (upper, lower, empty)
        - fmt_strcut: (front, back, empty). Side to cut characters from if the value is longer than fmt_strlen. Default is back.
        - fmt_strpad: (front, back, empty). Side to add characters to if the value is shorter than fmt_strlen. Default is back.
        - fmt_strpadchar: single character or character alias (-space-, -fslash-, -bslash-, -tab-). Character used to pad a value shorter than fmt_strlen. Default is _
        - fmt_decimal: number of decimal digits (0-N). Ignored if < 0
        - fmt_date: date format. Supported formats:
            + without time part- yyyymmdd, yymmdd, yyyyddmm, yyddmm, mmddyyyy, mmddyy, ddmmyyyy, ddmmyy
            + (mmm= month abbreviation like Jan) yyyymmmdd, yyyyddmmm, ddmmmyyyy, ddmmmyy
            + (month= full month name like January) yyyymonthdd, yyyyddmonth, ddmonthyyyy, ddmonthyy
            + with time part: suffix to the above date formats as- Thhmmss, Thhmm, Thh like mmddyyyyThhmm for 11282024T1407 or 11/28/2024T14:07
            + (S is space) Shhmmss, Shhmm, Shh like 11-28-2024 14:07
            + with time zone: if a time zone is required at the end of the time part, add suffix Z like mmddyyyyThhmmZ for 11282024T1407Z
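
        Example field format specification (illustrative only; fld is assumed to be an output Field object):
            fld.datatype= "string"
            fld.fmt_strlen= 10
            fld.fmt_strcase= "upper"
            fld.fmt_strpad= "back"
            fld.fmt_strpadchar= "-space-"
            fld.mapto= "cust_name"   # take the value of a source field with a different title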

    transforms- list of Transform objects. Each is matched by title to an output field (or to a source
        field mapped to an output field) and is applied to that field's value before format rules.

    settings- dictionary of settings. Only delim is required; the rest are optional:
        delim- delimiter for parsing records, as: comma, tab, pipe, caret, hyphen
        delim_out- delimiter for output records (default is to use delim), as: comma, tab, pipe, caret, hyphen
        is_quoted- bool whether some field values are enclosed in double quotes to allow the delimiter within a field value. Default is True.
        has_header- bool whether the first used line is a delimited line of field titles. Default is True.
        use_comment- bool whether to use comment lines (those starting with # or //) or ignore them. Default is False, so they are ignored.
        normalize- bool whether to apply datatype and format rules to output field values. Default is True.
            If datatype is int or real and the field value is not numeric, the value is set to 0 for int and 0.00 for real.
            If datatype is bool and the field value is neither true nor false, the value is set to false.
            If datatype is date and the field value is not in the date format, the value is set to an empty string.
        embed_delim- replacement character(s) used when a field value contains the output delimiter. Default is a space.
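
        Example settings (illustrative values; keys are matched case-insensitively):
            settings= {
                "delim": "comma",
                "delim_out": "pipe",
                "is_quoted": "true",
                "has_header": "true",
                "use_comment": "false",
                "normalize": "true",
                "embed_delim": " "
            }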

    lookup_dicts- list of LookUpDict objects. These should be made from files or arrays prior to invoking this method.

    srcfields- list of Field objects in the order correlated to input records when parsed using the delimiter specified in settings.

    srcrecs- list of strings, each of which is one input record. By default, empty lines and those
        beginning with # or // are ignored as comments. This can be overridden with the setting use_comment.
        If the setting has_header is True (the default), then the first used line must be a delimited line of field titles.

    RETURNS outrecs as a list of refined data records, including a header of delimited output field titles. If there is an error, the 0th entry will start with notok:
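
    Example usage (a minimal sketch; the Field constructor shown here is an assumption, not part of this docstring's documented API):
        srcflds= [field.Field("id"), field.Field("name"), field.Field("amount")]
        outflds= [field.Field("id"), field.Field("name"), field.Field("amount")]
        outflds[2].datatype= "real"
        outflds[2].fmt_decimal= 2
        recs= ["id,name,amount", "1,Alice,3.5", "2,Bob,4"]
        outrecs= do_refine(outflds, [], {"delim": "comma"}, [], srcflds, recs)
        if len(outrecs)>0 and outrecs[0].startswith("notok:"):
            raise RuntimeError(outrecs[0])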
    """

    doparseerror:bool=False
    isquoted:bool=True
    hashdr:bool=True
    usecomment:bool=False
    normalize:bool=True
    delim:str=""
    delimchar:str=""
    delimout:str=""
    delimoutchar:str=""
    embed_delim:str=" "
    lineout:str=""
    lrec:int=0
    DQ:str="\""
    outrecs:list=[]
    hash_lookups:dict={}
    hash_srcfields:dict={}
    hash_outfields:dict={}
    hash_out_to_src:dict={}
    hash_src_to_out:dict={}
    hash_out_to_transform:dict={}
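
    # validate arguments, build lowercased title -> index hash maps, and resolve
    # output-to-source field mappings and output-field transforms before processing records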
    try:
        if outfields is None or not isinstance(outfields, list):
            raise ValueError("no outfields")
        if srcfields is None or not isinstance(srcfields, list):
            raise ValueError("no srcfields")
        if srcrecs is None or not isinstance(srcrecs, list):
            raise ValueError("no srcrecs")
        if settings is None or not isinstance(settings, dict):
            raise ValueError("no settings")
        if lookup_dicts is None or not isinstance(lookup_dicts, list):
            raise ValueError("no lookup_dicts")
        if transforms is None or not isinstance(transforms, list):
            raise ValueError("no transforms")
        # read settings (keys compared case-insensitively)
        keys:list= list(settings.keys())
        for i in range(len(keys)):
            txt= str(keys[i]).lower().strip()
            txt1= str(settings[keys[i]]).lower()
            if txt in ("isquoted","is_quoted"):
                isquoted = txt1=="true"
            elif txt in ("hasheader","has_header"):
                hashdr = txt1=="true"
            elif txt in ("use_comment","usecomment"):
                usecomment = txt1=="true"
            elif txt=="normalize":
                normalize = txt1=="true"
            elif txt=="delim":
                delim= txt1
                delimchar= recfuncs.delim_get_char(delim)
            elif txt in ("delim_out","delimout"):
                delimout= txt1
                delimoutchar= recfuncs.delim_get_char(delimout)
            elif txt in ("embed_delim","embeddelim"):
                embed_delim= txt1 if len(txt1)>0 else " "
        if len(delim)==0:
            raise ValueError("no delim specified")
        if len(delimchar)==0:
            raise ValueError("unrecognized delim: " + delim)
        if len(outfields)==0:
            raise ValueError("no output fields specified")
        if len(srcfields)==0:
            raise ValueError("no source fields specified")
        if len(srcrecs)==0:
            raise ValueError("no data records supplied")
        if len(delimout)==0 or len(delimoutchar)==0:
            delimoutchar= delimchar
        # index lookup dictionaries by lowercased title
        for i in range(len(lookup_dicts)):
            if not isinstance(lookup_dicts[i], lookup.LookUpDict):
                raise ValueError("lookup_dict " + str(i) + " is not a LookUpDict object")
            txt= lookup_dicts[i].title.lower().strip()
            if len(txt)==0:
                raise ValueError("lookup_dict " + str(i) + " has empty title")
            elif txt in hash_lookups:
                raise ValueError("lookup_dict " + str(i) + " has duplicate title")
            hash_lookups[txt]= i
        # index source fields by lowercased title and clean parse_error_action
        for i in range(len(srcfields)):
            if not isinstance(srcfields[i], field.Field):  # Field class assumed to be in module 'field'
                raise ValueError("source field " + str(i) + " is not a Field object")
            txt= srcfields[i].title.lower().strip()
            if len(txt)==0:
                raise ValueError("source field " + str(i) + " has empty title")
            elif txt in hash_srcfields:
                raise ValueError("source field " + str(i) + " has duplicate title")
            hash_srcfields[txt]=i
            txt= srcfields[i].parse_error_action.lower().strip()
            if txt in ["-use-","-ignore-",""]:
                srcfields[i].parse_error_action=txt
        # index output fields by lowercased title, map them to source fields, and clean format attributes
        for i in range(len(outfields)):
            if not isinstance(outfields[i], field.Field):
                raise ValueError("output field " + str(i) + " is not a Field object")
            txt= outfields[i].title.lower().strip()
            if len(txt)==0:
                raise ValueError("output field " + str(i) + " has empty title")
            elif txt in hash_outfields:
                raise ValueError("output field " + str(i) + " has duplicate title")
            hash_outfields[txt]=i
            txt1= outfields[i].mapto.lower().strip()
            if txt in hash_srcfields:
                hash_out_to_src[i]=hash_srcfields[txt]
                hash_src_to_out[hash_srcfields[txt]]=i
            elif txt1 in hash_srcfields:
                hash_out_to_src[i]=hash_srcfields[txt1]
                hash_src_to_out[hash_srcfields[txt1]]=i
            else:
                hash_out_to_src[i]=-1
            txt= outfields[i].fmt_strcase.lower().strip()
            outfields[i].fmt_strcase = txt if txt in ("upper","lower") else ""
            txt= outfields[i].fmt_strcut.lower().strip()
            outfields[i].fmt_strcut = "front" if txt in ("front","left","start") else "back"
            txt= outfields[i].fmt_strpad.lower().strip()
            outfields[i].fmt_strpad = "front" if txt in ("front","left","start") else "back"
            txt= outfields[i].fmt_strpadchar.lower().strip()
            if len(txt)==0:
                outfields[i].fmt_strpadchar="_"
            elif txt in ("space","-space-"):
                outfields[i].fmt_strpadchar=" "
            elif txt in ("fslash","-fslash-"):
                outfields[i].fmt_strpadchar="/"
            elif txt in ("bslash","-bslash-"):
                outfields[i].fmt_strpadchar="\\"
            elif txt in ("tab","-tab-"):
                outfields[i].fmt_strpadchar="\t"
        for i in range(len(transforms)):
            if not isinstance(transforms[i], transform.Transform):
                raise ValueError("transforms " + str(i) + " is not a Transform object")
            txt= transforms[i].title.lower().strip()
            if len(txt)==0:
                raise ValueError("transforms " + str(i) + " has empty title")
            if txt in hash_outfields:
                hash_out_to_transform[hash_outfields[txt]]=i
            elif txt in hash_srcfields:
                if hash_srcfields[txt] in hash_src_to_out:
                    hash_out_to_transform[hash_src_to_out[hash_srcfields[txt]]]=i
            for j in range(len(transforms[i].ops)):
                if transforms[i].ops[j].title.lower().startswith("lookup"):
                    if transforms[i].ops[j].param1.lower() not in hash_lookups:
                        raise ValueError("unknown LookUpDict specified in transform " + str(i)
                                         + ", op " + str(j) + ": " + transforms[i].ops[j].param1)
        lineout=""
        for i in range(len(outfields)):
            if i>0:
                lineout += delimoutchar
            lineout += outfields[i].title
        outrecs.append(lineout)
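        # process each source record: skip empty and comment lines and the header line,
        # split into source values, map/transform into output field values, apply datatype
        # and format rules, then emit a delimited output record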
        for i in range(len(srcrecs)):
            linein= srcrecs[i]
            if len(linein.strip())>0:
                txt= linein.strip()
                iscomment = txt.startswith("#") or txt.startswith("//")
                if usecomment or not iscomment:
                    lrec += 1
                    if lrec>1 or not hashdr:
                        if isquoted and DQ in linein:
                            src_values= recfuncs.split_quoted_line(linein, delimchar)
                            if len(src_values)>0 and src_values[0].startswith("notok:"):
                                raise ValueError("error splitting quoted line #rec=" + str(lrec)
                                                 + ": " + src_values[0][6:])
                        else:
                            src_values= linein.split(delimchar)
                        lineout=""
                        for ifld in range(len(outfields)):
                            fldval=""
                            doparseerror=False
                            nf=hash_out_to_src[ifld]
                            if nf>=0:
                                if nf< len(src_values):
                                    fldval=src_values[nf]
                                elif srcfields[nf].parse_error_action=="-ignore-":
                                    fldval=""
                                elif srcfields[nf].parse_error_action=="-use-" or len(srcfields[nf].parse_error_action)==0:
                                    doparseerror=True
                                else:
                                    # use the literal replacement value set in parse_error_action
                                    fldval= srcfields[nf].parse_error_action
                                    doparseerror=True
                            if ifld in hash_out_to_transform:
                                if nf< len(src_values) or doparseerror:
                                    nt= hash_out_to_transform[ifld]
                                    fldval= exectransform.do_transform(transforms[nt], fldval, outfields[ifld].datatype,
                                        lookup_dicts)  # assumption: the LookUpDict list completes the call's elided arguments
                                    if fldval.startswith("notok:"):
                                        raise ValueError(fldval[6:])
                            if nf<0 or nf< len(src_values) or doparseerror:
                                if outfields[ifld].datatype=="string":
                                    if outfields[ifld].fmt_strlen>0:
                                        n1= len(fldval)- outfields[ifld].fmt_strlen
                                        if n1>0:
                                            # value too long: cut from the specified side
                                            if outfields[ifld].fmt_strcut=="front":
                                                fldval= fldval[n1:]
                                            else:
                                                fldval= fldval[:outfields[ifld].fmt_strlen]
                                        elif n1<0:
                                            # value too short: pad on the specified side
                                            if outfields[ifld].fmt_strpad=="front":
                                                fldval= fldval.rjust(outfields[ifld].fmt_strlen, outfields[ifld].fmt_strpadchar)
                                            else:
                                                fldval= fldval.ljust(outfields[ifld].fmt_strlen, outfields[ifld].fmt_strpadchar)
                                    if outfields[ifld].fmt_strcase=="upper":
                                        fldval= fldval.upper()
                                    elif outfields[ifld].fmt_strcase=="lower":
                                        fldval= fldval.lower()
                                elif outfields[ifld].datatype=="int":
                                    if "." in fldval:
                                        # drop any decimal part for int fields
                                        fldval= fldval[:fldval.find(".")]
                                    if normalize:
                                        fldval= numfuncs.is_int_get(fldval, "string", True)
                                        if fldval.startswith("false:"):
                                            fldval="0"
                                elif outfields[ifld].datatype=="real":
                                    if normalize:
                                        fldval= numfuncs.is_real_get(fldval, "string", True)
                                        if fldval.startswith("false:"):
                                            fldval="0.00"
                                        elif outfields[ifld].fmt_decimal>=0:
                                            if "." in fldval:
                                                txt1=fldval[fldval.find(".")+1:]
                                                txt=fldval[:fldval.find(".")]
                                            else:
                                                txt=fldval
                                                txt1=""
                                            n1= len(txt1)- outfields[ifld].fmt_decimal
                                            if outfields[ifld].fmt_decimal==0:
                                                fldval= txt
                                            elif n1>0:
                                                # too many decimal digits: truncate
                                                fldval= txt + "." + txt1[:outfields[ifld].fmt_decimal]
                                            else:
                                                # too few decimal digits: pad with zeros on the right
                                                txt1=txt1.ljust(outfields[ifld].fmt_decimal,"0")
                                                fldval= txt + "." + txt1
                                elif outfields[ifld].datatype=="bool":
                                    fldval= "true" if fldval.lower()=="true" else "false"
                                elif outfields[ifld].datatype=="date":
                                    if len(outfields[ifld].fmt_date)>0:
                                        if not datefuncs.is_date_format(fldval, outfields[ifld].fmt_date):
                                            if normalize:
                                                fldval=""
                            # replace any embedded output delimiter before adding the value to the record
                            if delimoutchar in fldval:
                                fldval= fldval.replace(delimoutchar, embed_delim)
                            if ifld>0:
                                lineout += delimoutchar
                            lineout += fldval
                        outrecs.append(lineout)
    except (RuntimeError, OSError, ValueError) as err:
        outrecs.insert(0, "notok:" + str(err))
    return outrecs