26 Shifts array of parsed field values to correct having more values than defined fields.
27 Uses algorithm assessing best field and field + 1 (index positions in record) to join
28 based on datatypes, formats and known patterns of root causes of this error.
30 * origvals: array of parsed field values
31 * recflds: list of Field object containing datatype and format specifications
32 * hashrecflds: Dictionary of field title lowercase to its array index
33 * pinflds: optional list of field titles that are pinned meaning their position cannot be changed
35 returns: new array of parsed field values. If error, 0th entry starts with notok:
64 pinshiftscores:list=[]
68 ndelta= len(origvals)- nflds
74 pinindex.append(hashrecflds[s])
75 pinshiftscores.append(0)
77 pinshiftscores.append(0)
78 pinshiftscores.append(0)
80 for i
in range(len(pinindex)):
81 hashpinindex[pinindex[i]]=i
84 for j
in range(nflds):
93 n1 = round(dval * nflds,0)
94 nmaxfthresh= n1
if n1>=1
else 1
97 for h
in range(nflds):
102 for i
in range(len(origvals)):
103 newvals.append(origvals[i])
105 for i
in range(ndelta):
106 txt += joinstr + newvals[i+1+h]
111 for i
in range(h+1,n2+1):
113 txt=
"" if n3>= len(newvals)
else newvals[n3]
119 for i
in range(nflds):
121 if len(elemval)>0
and len(recflds[i].datatype)>0:
122 if recflds[i].datatype.startswith(
"date"):
123 if len(recflds[i].fmt_date)>0
and recfuncs.is_field_its_datatype(recflds[i].datatype, elemval, recflds[i].fmt_date):
125 elif recfuncs.is_field_its_datatype(recflds[i].datatype, elemval):
127 if recflds[i].datatype==
"real":
128 if recflds[i].fmt_decimal>0:
129 if recfuncs.is_field_its_format(elemval, recflds[i],
False).startswith(
"true"):
131 elif recflds[i].datatype==
"string":
132 if recflds[i].fmt_strcase
in (
"upper",
"lower")
or recflds[i].fmt_strlen>0:
133 if recfuncs.is_field_its_format(elemval, recflds[i],
False).startswith(
"true"):
141 n2 = 10*ndt + round(10*nf/nflds)
142 if h
in hashpinindex:
151 for i
in range(len(origvals)):
152 newvals.append(origvals[i])
153 if len(pinindex)==0
or nmaxf>=nmaxfthresh:
155 n1= nindexf
if nindexf>=0
else nindexdt
157 for i
in range(ndelta):
158 txt += joinstr + newvals[i+1+n1]
163 for i
in range(n1+1,n2+1):
165 txt=
"" if n3>= len(newvals)
else newvals[n3]
168 elif len(pinindex)==1:
172 for i
in range(ndelta):
173 txt += joinstr + newvals[i+1+n1]
177 for i
in range(n1+1,n2+1):
179 txt=
"" if n3>= len(newvals)
else newvals[n3]
181 elif len(pinindex)>1:
190 for h
in range(len(pinindex)+2):
192 for i
in range(len(origvals)):
193 newvals.append(origvals[i])
199 for i
in range(ndelta):
200 txt += joinstr + newvals[i+1+n1]
203 for i
in range(n1+1,n2+1):
205 txt=
"" if n3>= len(newvals)
else newvals[n3]
210 for i
in range(nflds):
212 if len(recflds[i].datatype)>0:
213 if recfuncs.is_field_its_datatype(recflds[i].datatype, elemval, recflds[i].fmt_date):
215 if recflds[i].datatype==
"real":
216 if recflds[i].fmt_decimal>0:
217 if recfuncs.is_field_its_format(elemval, recflds[i],
False).startswith(
"true"):
219 elif recflds[i].datatype==
"string":
220 if recflds[i].fmt_strcase
in (
"upper",
"lower")
or recflds[i].fmt_strlen>0:
221 if recfuncs.is_field_its_format(elemval, recflds[i],
False).startswith(
"true"):
231 for i
in range(nflds):
232 pinrecs[0][i]=newvals[i]
234 elif h==len(pinindex):
236 if len(pinindex)>= ndelta:
237 ndist= math.floor(ndelta/len(pinindex))
238 for p
in range(len(pinindex)):
241 n4= ndist
if p<len(pinindex)-1
else ndelta-ntot
242 for i
in range(1, n4+1):
244 txt += joinstr + newvals[i+n1]
247 for i
in range(n1+1,n2+1):
249 txt=
"" if n3>= len(newvals)
else newvals[n3]
254 for i
in range(nflds):
256 if len(recflds[i].datatype)>0:
257 if recfuncs.is_field_its_datatype(recflds[i].datatype, elemval, recflds[i].fmt_date):
259 if recflds[i].datatype==
"real":
260 if recflds[i].fmt_decimal>0:
261 if recfuncs.is_field_its_format(elemval, recflds[i],
False).startswith(
"true"):
263 elif recflds[i].datatype==
"string":
264 if recflds[i].fmt_strcase
in (
"upper",
"lower")
or recflds[i].fmt_strlen>0:
265 if recfuncs.is_field_its_format(elemval, recflds[i],
False).startswith(
"true"):
276 for i
in range(nflds):
277 pinrecs[1][i]=newvals[i]
279 elif h==len(pinindex)+1:
281 for p
in range(len(pinindex)):
285 n4= (pinindex[p+1]-pinindex[p]-1)
if p<len(pinindex)-1
else ndelta-ntot
286 for i
in range(1, n4+1):
288 txt += joinstr + newvals[i+n1]
291 for i
in range(n1+1,n2+1):
293 txt=
"" if n3>= len(newvals)
else newvals[n3]
298 for i
in range(nflds):
300 if len(recflds[i].datatype)>0:
301 if recfuncs.is_field_its_datatype(recflds[i].datatype, elemval, recflds[i].fmt_date):
303 if recflds[i].datatype==
"real":
304 if recflds[i].fmt_decimal>0:
305 if recfuncs.is_field_its_format(elemval, recflds[i],
False).startswith(
"true"):
307 elif recflds[i].datatype==
"string":
308 if recflds[i].fmt_strcase
in (
"upper",
"lower")
or recflds[i].fmt_strlen>0:
309 if recfuncs.is_field_its_format(elemval, recflds[i],
False).startswith(
"true"):
320 for i
in range(nflds):
321 pinrecs[2][i]=newvals[i]
326 if nindexf<len(pinindex):
328 for i
in range(len(pinrecs[0])):
329 newvals.append(pinrecs[0][i])
330 elif nindexf==len(pinindex):
332 for i
in range(len(pinrecs[1])):
333 newvals.append(pinrecs[1][i])
334 elif nindexf==len(pinindex)+1:
336 for i
in range(len(pinrecs[2])):
337 newvals.append(pinrecs[2][i])
341 for i
in range(nflds):
342 outvals.append(newvals[i])
344 for i
in range(nflds):
345 outvals.append(origvals[i])
347 except (RuntimeError, OSError, ValueError)
as err:
348 outvals.insert(0,
"notok:" + str(err))
352 """Fix Record Field Splitting
354 Repairs records with parsing problems having too many or too few field values relative to defined
355 number of fields. This occurs due to embedded delimiters in some field values causing too many parsed values,
356 and records broken across multiple lines causing too few values. These situations are categorized into 3 types:
357 1) big (too many parsed values), 2) small1 (1 too few values), 3) small2 (2 or more too few values). small1 is a
358 special case since some data systems purposefully eliminate the fnal field value in a record if it is empty by
359 making it null thereby saving storage and memory space. In this case, the record is actually fine but is missing its final
360 field value. This can be accepted by having the setting 'allowLastEmpty' = TRUE leading to a default value assigned
361 to this last record field based on datatype: int and real assigned 0, bool assigned FALSE, others assigned empty string.
363 * settings: Dictionary of setting parameters. Both key and value are strings. Settings:
364 - allow_last_empty: bool whether to allow last field to be empty (i.e. small1 parsing) and assign it default value. Default is TRUE
365 - is_quoted: bool whether fields values may be enclosed by double quotes as is common when data exported from SpreadSheets and some databases. Default is FALSE.
366 - has_header: bool whether first non-empty record is a header row of delimited field titles. Default is FALSE.
367 - ignore_empty: bool whether to ignore empty records. Default is TRUE.
368 - pin_fields: field titles delimited by pipe (if more than 1) that are pinned meaning if record has too many fields (i.e. big) then these fields will not shifted as
369 the algorithm finds the best way to merge values to make corrected record
370 - ignore_start_str: string parts delimited by pipe (if more than 1) that will cause records starting with any one of them to be ignored. Always case insensitive.
371 A common use for this is to filter out comment lines such as those starting with # or // in which case set to #|//
372 - delim: name of delimiter to use to parse records: comma, pipe, tab, colon, caret. This is required.
373 - join_char: token (max 10 chars) or alias to insert when joining lines to remediate parsing. Default is to use nothing.
374 Aliases include: -comma-, -tab-, -pipe-, -space-, -bslash-, -fslash-, -lparen-, -rparen-,
375 -lcurly-, -rcurly-, -lsquare-, -rsquare-, -dblquote-, -mathpi-, -mathe-
376 * srcfields: list of Field objects comprising records
377 * srcrecs: list of string source records
379 returns: list of new records. If error, 0th entry starts with notok: otherwise 0th entry is string of stats,
380 entry[1] is header of field titles. For stats, examples may have prefix of (nline) with nline being the line number read
381 (excluding empty and comments lines) and is therefore 1 larger than the line's index
382 in the Python list (i.e. nline is 1 based while lists are 0-based).
408 doing_join:bool=
False
409 allow_last_empty:bool=
False
412 ignore_empty:bool=
False
419 ignore_start_str:list=[]
421 fld_is_datatype:list=[]
426 nflds_act=len(srcfields)
428 raise ValueError(
"no fields supplied")
430 raise ValueError(
"no records supplied")
431 for i
in range(nflds_act):
433 raise ValueError(
"source field is not Field object at index " + str(i))
434 elem=srcfields[i].title.lower().strip()
436 raise ValueError(
"source field title is empty at index " + str(i))
438 raise ValueError(
"duplicate source field title " + elem)
440 srcfields[i].title=srcfields[i].title.strip()
441 srcfields[i].datatype=srcfields[i].datatype.lower().strip()
443 for k,v
in settings.items():
444 if isinstance(k,str):
447 if txt
in (
"allow_last_empty",
"allowlastempty"):
448 allow_last_empty= txt1==
"true"
449 elif txt
in (
"isquoted",
"is_quoted"):
450 isquoted= txt1==
"true"
451 elif txt
in (
"hasheader",
"has_header"):
452 hasheader= txt1==
"true"
453 elif txt
in (
"ignore_empty",
"ignoreempty"):
454 ignore_empty= txt1==
"true"
455 elif txt
in (
"ignore_start_str",
"ignorestartstr"):
460 ignore_start_str.append(txt)
461 elif txt
in (
"pin_fields",
"pinfields"):
465 if len(txt)>0
and txt
not in hashpinflds:
467 hashpinflds[txt]= len(pinflds)-1
470 raise ValueError(
"empty delim specified")
472 delimchar= recfuncs.delim_get_char(txt1)
473 if delimchar.startswith(
"notok:"):
474 raise ValueError(delimchar[6:])
475 elif len(delimchar)==0:
476 raise ValueError(
"unknown delim specified:" + txt1)
477 elif txt
in (
"join_char",
"joinchar"):
479 if join_char.startswith(
"-")
and join_char.endswith(
"-"):
480 join_char= recfuncs.convert_char_aliases(join_char)
482 if len(join_char)>10:
483 join_char=join_char[:10]
485 if len(delimchar)==0:
486 raise ValueError(
"no delim specified:" + txt1)
489 for i
in range(nflds_act):
492 recout += srcfields[i].title
493 outrecs.append(recout)
495 for nidx
in range(len(srcrecs)):
498 if len(recin.strip())==0:
500 userec=
not ignore_empty
501 elif len(ignore_start_str)>0:
502 recin_lc= recin.lower()
503 for s
in ignore_start_str:
512 if lrecsin>1
or not hasheader:
513 if isquoted
and DQ
in recin:
514 fldvals= recfuncs.split_quoted_line(recin, delimchar)
515 if len(fldvals)>0
and fldvals[0].startswith(
"notok:"):
516 raise ValueError(
"error splitting line index " + str(nidx) +
":" + fldvals[6:])
518 fldvals= recin.split(delimchar)
519 nflds_new= len(fldvals)
520 ndelta= nflds_new-nflds_act
523 if ndelta>=0
or nflds_prior+nflds_new-1 > nflds_act:
527 for i
in range(nflds_act-nflds_prior):
528 fldvalsprior.append(
"")
529 recout= delimchar.join(fldvalsprior)
530 outrecs.append(recout)
542 for i
in range(nflds_new):
544 fldvalsprior[nflds_prior-1]+= join_char + fldvals[i]
546 fldvalsprior.append(fldvals[i])
549 ndelta= len(fldvalsprior)- nflds_act
551 recout= delimchar.join(fldvalsprior)
552 outrecs.append(recout)
558 if not doing_join
and not didsave:
560 outrecs.append(recin)
566 if len(temp)== nflds_act:
570 recout= delimchar.join(temp)
571 outrecs.append(recout)
574 elif ndelta== -1
and allow_last_empty:
575 recout = recin + delimchar
576 outrecs.append(recout)
584 if nidx==len(srcrecs)-1:
586 for i
in range(nflds_act-nflds_new):
587 fldvalsprior.append(
"")
588 recout= delimchar.join(fldvalsprior)
589 outrecs.append(recout)
594 nflds_prior=nflds_new
600 txt =
"ok:recin=" + str(lrecsin) +
",recout=" + str(lrecsout) +
",big=" + str(lbig) +
",small1=" + str(lsm1) +
",small2=" + str(lsm2) +
",join=" + str(ljoin) +
",fix=" + str(lfix) +
",notfix=" + str(lnotfix)
601 outrecs.insert(0,txt)
602 except (RuntimeError, OSError, ValueError)
as err:
603 outrecs.insert(0,
"notok:" + str(err))