VerityPy 1.1
Python library for Verity data profiling, quality control, remediation
lookup.py
Go to the documentation of this file.
1#!/usr/bin/env python
2"""
3Lookup dictionary functions
4
5Lookup dictionary object for transform processing. Transforms have an
6operation (Op) that allows assigning a value based on looking up
7the current value in a dictionary. 1, 2, 3 keys are allowed with
8the replacement value coming from the following column. This Op is in
9transform_types for category=assignment, function=lookup.
10
11The description of this function and how it uses the lookup is:
12
13 Assigns a value from a lookup list based on matching values to keys where keys can use wildcards.
14 The match can be made to one, two, or three source fields in the record with field 1 always
15 the current value while fields 2 and 3 are optional if set in Param2.
16 Leave Param2 empty to use only 1 field as the match value.
17 All selected fields must match their respective conditions for a lookup result to be assigned.
18 The conditions and the result to assign are in an object for each list entry
19 with properties: key1, key2, key3, value defined as example
20 {'key1':'top*','key2':'*blue*','key3':'*left','value':'Orange'}.
21 Conditions can use front and/or back wildcard (*) like top*, *night, *state* to allow token matching.
22 To use multiple conditions for the same replacement value ( OR condition ),
23 enter them as separate list entries.
24 To use AND and NOT conditions, use special notations as delimiters:
25 top*-and-*night-not-*blue* which means a match requires both top* and *night be true
26 as well as no instances of the token blue.
27
28 Param1: title of list that has been pre-loaded into array of lists as part of initialization.
29 Param2: Fields 2 and 3 both of which are optional and if both supplied use
30 pipe to delimit as with color|position.
31 For this example, current value must start with top (key1 condition), the field color
32 must contain blue, and the field position must end with left.
33 All of these must be true for a match in which case the value of Orange is assigned.
34"""
35
36
37__all__ = ['LookUpRec',
38 'LookUpDict',
39 'make_lookups',
40 'make_lookup_from_list',
41 ]
42__version__ = '1.0'
43__author__ = 'Geoffrey Malafsky'
44__email__ = 'gmalafsky@technikinterlytics.com'
45__date__ = '20240616'
46
47from ..processing import recfuncs
48
50 """
51 Record within a lookup dictionary
52
53 Each lookup record has 1-3 keys with matching conditions. Each key value can use
54 wildcards at front and/or back. In addition, each key can have a combined condition
55 using more than one token joined using special notation for boolean AND and NOT conditions.
56 For example, a key might be top*-and-*food*-not-*juice which means the field value being
57 checked muct statisfy starting with 'top', containing 'food', and not ending with
58 'juice'. Therefore, each key is parsed into lists that are aligned:
59 AND tokens and for each corresponding front_wild and back_wild lists. Similarly, NOT
60 tokens and wild lists. After parsing, key1_and is a list of tokens minus front and back
61 wildcards if they were supplied, and if so, they are in correlated lists key1_and_front_wild
62 and key1_and_back_wild.
63
64 key1: single or combined value(s) to check
65 key2: optional. single or combined value(s) to check
66 key3: optional. single or combined value(s) to check
67 key1_and: list of AND conditions for key1 (at least 1)
68 key1_not: list of NOT conditions for key1 (0 or more)
69 key2_and: optional. list of AND conditions for key2 (at least 1 if used)
70 key2_not: optional. list of NOT conditions for key2 (0 or more)
71 key3_and: optional. list of AND conditions for key3 (at least 1 if used)
72 key3_not: optional. list of NOT conditions for key3 (0 or more)
73 key1_and_front_wild: bool for key1_and entry if it had front wildcard *
74 key1_and_back_wild: bool for key1_and entry if it had back wildcard *
75 key1_not_front_wild: bool for key1_not entry if it had front wildcard *
76 key1_not_back_wild: bool for key1_not entry if it had back wildcard *
77 key2_and_front_wild: bool for key2_and entry if it had front wildcard *
78 key2_and_back_wild: bool for key2_and entry if it had back wildcard *
79 key2_not_front_wild: bool for key2_not entry if it had front wildcard *
80 key2_not_back_wild: bool for key2_not entry if it had back wildcard *
81 key3_and_front_wild: bool for key3_and entry if it had front wildcard *
82 key3_and_back_wild: bool for key3_and entry if it had back wildcard *
83 key3_not_front_wild: bool for key3_not entry if it had front wildcard *
84 key3_not_back_wild: bool for key3_not entry if it had back wildcard *
85 result: string final value
86 """
87
88 key1:str
89 key2:str
90 key3:str
91 key1_and: list
92 key1_not: list
93 key2_and: list
94 key2_not: list
95 key3_and: list
96 key3_not: list
97 key1_and_front_wild: list
98 key1_and_back_wild: list
99 key1_not_front_wild: list
100 key1_not_back_wild: list
101 key2_and_front_wild: list
102 key2_and_back_wild: list
103 key2_not_front_wild: list
104 key2_not_back_wild: list
105 key3_and_front_wild: list
106 key3_and_back_wild: list
107 key3_not_front_wild: list
108 key3_not_back_wild: list
109 result:str
110
137 """
138 Dictionary with keys (either 1,2,3 field values) mapped to replacement value. The
139 keys can use wild cards and also special notations for AND and NOT conditions.
140
141 title: used in function 'lookup' to select which LookUpDict to use
142 file_uri: URI to file. Empty if made from list object.
143 is_case_sens: if false then all text changed to lowercase
144 num_keys: integer number of keys used 1-3
145 delim: delimiter (comma, pipe, tab, colon)
146 fields: list of field titles which must correspond to columns in data set
147 recs:list of LookUpRec objects
148 """
149
150 title:str
151 file_uri:str
152 is_case_sens:bool
153 num_keys:int
154 delim:str
155 fields:list
156 recs:list
157
168def make_lookups(lookups:list) -> dict:
169 """
170 Makes lookup dicts from files supplied in list of LookUpDict objects.
171
172 lookups: list of LookUpDict objects each of which must have its title, file_uri, num_keys.
173 delim and is_case_sens are optional.
174 The text file must have a header line as the first non-empty and non-comment (comment lines begin with either // or #) line.
175 Field names delimited by parameter 'delim'.
176 There must be num_keys + 1 fields in the header and each record.
177 The num_keys + 1 field is the result when a match occurs.
178
179 returns object with keys:
180 dicts: list of LookUpDict objects made from supplied list with lists of fields and recs filled.
181 hash: dictionary as hash table of LookUpDicts with key = each LookUpDict title and value= index in 'dicts' array
182 """
183
184 lkdict:LookUpDict
185 lookup_dicts=[]
186 hash_lookup_dicts={}
187 try:
188 for i in range(len(lookups)):
189 try:
190 lkdict=LookUpDict()
191 if not isinstance(lookups[i],LookUpDict):
192 raise ValueError("not lookup dict index: " + str(i))
193 lkdict=lookups[i]
194 if len(lkdict.file_uri)==0:
195 raise ValueError('lookupdict missing fileuri at index: ' + str(i))
196 if len(lkdict.title)==0:
197 raise ValueError('lookupdict missing title at index: ' + str(i))
198 if len(lkdict.title)>80:
199 raise ValueError('lookupdict title too long: ' + str(lkdict.title))
200 if lkdict.num_keys<1:
201 raise ValueError('lookupdict too few number keys: ' + str(lkdict.num_keys))
202 if lkdict.num_keys>3:
203 raise ValueError('lookupdict too many number keys: ' + str(lkdict.num_keys))
204 if len(lkdict.delim)==0:
205 raise ValueError('lookupdict missing delimiter at index: ' + str(i))
206
207 lkdict= make_lookup_from_file(lkdict.title,
208 lkdict.file_uri,
209 lkdict.delim,
210 lkdict.is_case_sens,
211 lkdict.num_keys
212 )
213
214 except (RuntimeError, ValueError, OSError, TypeError) as err1:
215 lkdict.title= "notok:" + str(err1)
216
217 lookup_dicts.append(lkdict)
218 if not lkdict.title.startswith("notok:"):
219 hash_lookup_dicts[lkdict.title.lower()]=i
220
221 except (RuntimeError,OSError) as err:
222 print("ERROR:" + str(err))
223
224 return {'dicts':lookup_dicts,'hash':hash_lookup_dicts}
225
226
227def make_lookup_from_file(title:str,file_uri:str,delim:str,is_case_sens:bool,num_keys:int) -> LookUpDict:
228 """
229 Builds a LookUpDict from text file
230
231 File will be read and columns extracted by splitting lines with delimiter.
232 Empty and comment (begins with # or //) lines are ignored.
233 First data containing line must contain delimited field names.
234 num_keys specifies how many columns will be used as keys (1-3) and then the next column will be used as value.
235 This number of columns must be present after splitting.
236 If double quote is in line then more precise (but slower) column separation will be used.
237 LookupDict will have its fields and recs lists filled.
238
239 Returns LookUpDict object whose title will start with notok: if there is an error.
240 """
241
242 lkdict:LookUpDict= LookUpDict()
243 lkrec:LookUpRec
244 err_msg:str=""
245 nline:int=0
246 cols:list=[]
247 delim_char:str=""
248 dq:str= "\""
249 try:
250 if len(title)==0:
251 raise ValueError("title is empty")
252 if num_keys<=0 or num_keys>3:
253 raise ValueError("num_keys is not 1-3: " + str(num_keys))
254 if len(delim)==0:
255 raise ValueError("delim is empty")
256 delim_char= recfuncs.delim_get_char(delim)
257 lkdict.title=title
258 lkdict.num_keys=num_keys
259 lkdict.is_case_sens=is_case_sens
260 lkdict.file_uri=file_uri
261 lkdict.delim=delim
262
263 with open(file_uri, "r", encoding="utf-8") as f:
264 for line in f:
265 if line is None:
266 break
267 line= line.strip()
268 if line.endswith("\n"):
269 line=line[:len(line)-1]
270 if len(line)>0 and not line.startswith("#") and not line.startswith("//"):
271 nline += 1
272 if nline==1:
273 if line.find(delim_char)<1:
274 raise ValueError("first data line does not contain delimiter: " + delim_char)
275 if line.find(dq)>-1:
276 line=line.replace(dq,'')
277 lkdict.fields= line.split(delim_char)
278 if len(lkdict.fields)< (num_keys+1):
279 raise ValueError("too few fields found with delim=" + delim_char + ",#fields=" + len(lkdict.fields)\
280 + "/" + str(num_keys+1))
281 else:
282 if line.find(dq)>-1:
283 cols=recfuncs.split_quoted_line(line,delim_char)
284 if len(cols)>0 and cols[0].startswith("notok:"):
285 raise RuntimeError("error splitting quoted string: " + cols[0][6:])
286 else:
287 cols=line.split(delim_char)
288
289 if len(cols)< (num_keys+1):
290 raise ValueError("too few fields found at nline=" + str(nline) + ",#fields=" + len(cols)\
291 + "/" + str(num_keys+1))
292 lkrec=LookUpRec()
293 lkrec.key1= cols[0]
294 if num_keys>=2:
295 lkrec.key2=cols[1]
296 if num_keys>=3:
297 lkrec.key3=cols[2]
298 lkrec.result=cols[num_keys]
299
300 if not is_case_sens:
301 lkrec.key1= lkrec.key1.lower()
302 if num_keys>=2:
303 lkrec.key2= lkrec.key2.lower()
304 if num_keys>=3:
305 lkrec.key3= lkrec.key3.lower()
306
307 lkdict.recs.append(lkrec)
308 lkdict= extract_lookup_record_key_info(lkdict)
309 if lkdict.title.startswith("notok:"):
310 raise ValueError("Error extracting record keys: " + lkdict.title[6:])
311
312 except (RuntimeError, ValueError, OSError) as err:
313 err_msg= "notok:" + str(err)
314
315 if len(err_msg)>0:
316 lkdict.title=err_msg
317 return lkdict
318
319
320def make_lookup_from_list(title:str,lkup_list:list,delim:str,is_case_sens:bool,num_keys:int) -> LookUpDict:
321 """
322 Builds a LookUpDict from list that contains what would be read from a file.
323
324 List will be read and columns extracted by splitting lines with delimiter.
325 Empty and comment (begins with # or //) lines are ignored.
326 First data containing line must contain delimited field names.
327 num_keys specifies how many columns will be used as keys (1-3) and then the next column will be used as value.
328 This number of columns must be present after splitting.
329 If double quote is in line then more precise (but slower) column separation will be used.
330 LookupDict will have its fields and recs lists filled.
331
332 Returns LookUpDict object whose title will start with notok: if there is an error.
333 """
334
335 lkdict:LookUpDict
336 lkrec:LookUpRec
337 err_msg:str=""
338 nline:int=0
339 cols:list=[]
340 delim_char:str=""
341 dq:str= "\""
342 linein:str=""
343 try:
344 if len(title)==0:
345 raise ValueError("title is empty")
346 if num_keys<=0 or num_keys>3:
347 raise ValueError("num_keys is not 1-3: " + str(num_keys))
348 if len(delim)==0:
349 raise ValueError("delim is empty")
350 delim_char= recfuncs.delim_get_char(delim)
351 lkdict= LookUpDict()
352 lkdict.title=title
353 lkdict.num_keys=num_keys
354 lkdict.is_case_sens=is_case_sens
355 lkdict.file_uri=""
356 lkdict.delim=delim
357
358 for line in lkup_list:
359 linein=line
360 if linein is None:
361 break
362 linein= linein.strip()
363 if linein.endswith("\n"):
364 linein=linein[:-1]
365 if len(linein)>0 and not linein.startswith("#") and not linein.startswith("//"):
366 nline += 1
367 if nline==1:
368 if linein.find(delim_char)<1:
369 raise ValueError("first data line does not contain delimiter: " + delim_char)
370 if linein.find(dq)>-1:
371 linein=linein.replace(dq,'')
372 lkdict.fields= linein.split(delim_char)
373 if len(lkdict.fields)< (num_keys+1):
374 raise ValueError("too few fields found with delim=" + delim_char + ",#fields=" + len(lkdict.fields)\
375 + "/" + str(num_keys+1))
376 else:
377 if linein.find(dq)>-1:
378 cols=recfuncs.split_quoted_line(line,delim_char)
379 if len(cols)>0 and cols[0].startswith("notok:"):
380 raise RuntimeError("error splitting quoted string: " + cols[0][6:])
381 else:
382 cols=linein.split(delim_char)
383
384 if len(cols)< (num_keys+1):
385 raise ValueError("too few fields found at nline=" + str(nline) + ",#fields=" + len(cols)\
386 + "/" + str(num_keys+1))
387 lkrec=LookUpRec()
388 lkrec.key1= cols[0]
389 if num_keys>=2:
390 lkrec.key2=cols[1]
391 if num_keys>=3:
392 lkrec.key3=cols[2]
393 lkrec.result=cols[num_keys]
394 if not is_case_sens:
395 lkrec.key1= lkrec.key1.lower()
396 if num_keys>=2:
397 lkrec.key2= lkrec.key2.lower()
398 if num_keys>=3:
399 lkrec.key3= lkrec.key3.lower()
400
401 lkdict.recs.append(lkrec)
402
403 lkdict= extract_lookup_record_key_info(lkdict)
404 except (RuntimeError, ValueError, OSError) as err:
405 err_msg= "notok:" + str(err)
406
407 if len(err_msg)>0:
408 lkdict.title=err_msg
409 return lkdict
410
411
412def extract_lookup_record_key_info(lkdict:LookUpDict) -> LookUpDict:
413 """
414 Extracts lookup key information and parses into various arrays tokens, wildcards,
415 and boolean AND and NOT conditions to accelerate matching.
416
417 lkdict: LookUpDict object to process
418
419 Returns modified LookUpDict object with parsed record information. Title will start with notok: if there is an error.
420 """
421
422 err_msg:str=""
423 key_str:str=""
424 key_str_typ:str=""
425 str_temp:str=""
426 nand:int=-1
427 nnot:int=-1
428 front_wild:bool=False
429 back_wild:bool=False
430 temp_list:list=[]
431 nkeyand:int=-1
432 nkeynot:int=-1
433 new_recs:list=[]
434 nrec:int=-1
435 try:
436 for i in range(len(lkdict.recs)):
437 new_recs.append(LookUpRec())
438 nrec= len(new_recs)-1
439 new_recs[nrec].result= lkdict.recs[i].result
440 for j in range(lkdict.num_keys):
441 key_str=""
442 if j==0:
443 key_str= lkdict.recs[i].key1
444 new_recs[nrec].key1=key_str
445 elif j==1:
446 key_str= lkdict.recs[i].key2
447 new_recs[nrec].key2=key_str
448 elif j==2:
449 key_str= lkdict.recs[i].key3
450 new_recs[nrec].key3=key_str
451
452 while "-and-" in key_str or "-not-" in key_str:
453 if key_str.startswith("-and-") or key_str.startswith("-not-"):
454 if key_str.startswith("-and-"):
455 key_str_typ="and"
456 else:
457 key_str_typ="not"
458 key_str=key_str[5:]
459 else:
460 # no prefix for this fragment so assign as AND
461 key_str_typ="and"
462
463 nand= key_str.find("-and-")
464 nnot= key_str.find("-not-")
465 if nand>0 and (nnot<0 or nand<nnot):
466 str_temp= key_str[:nand]
467 key_str= key_str[nand:]
468 elif nnot>0 and (nand<0 or nnot<nand):
469 str_temp= key_str[:nnot]
470 key_str= key_str[nnot:]
471 else:
472 str_temp=key_str
473 key_str=""
474
475 if key_str_typ=="and":
476 if j==0:
477 new_recs[i].key1_and.append(str_temp)
478 nkeyand=len(new_recs[i].key1_and)-1
479 new_recs[i].key1_and_front_wild.append(False)
480 new_recs[i].key1_and_back_wild.append(False)
481 elif j==1:
482 new_recs[i].key2_and.append(str_temp)
483 nkeyand=len(new_recs[i].key2_and)-1
484 new_recs[i].key2_and_front_wild.append(False)
485 new_recs[i].key2_and_back_wild.append(False)
486 elif j==2:
487 new_recs[i].key3_and.append(str_temp)
488 nkeyand=len(new_recs[i].key3_and)-1
489 new_recs[i].key3_and_front_wild.append(False)
490 new_recs[i].key3_and_back_wild.append(False)
491 elif key_str_typ=="not":
492 if j==0:
493 new_recs[i].key1_not.append(str_temp)
494 nkeynot=len(new_recs[i].key1_not)-1
495 new_recs[i].key1_not_front_wild.append(False)
496 new_recs[i].key1_not_back_wild.append(False)
497 elif j==1:
498 new_recs[i].key2_not.append(str_temp)
499 nkeynot=len(new_recs[i].key2_not)-1
500 new_recs[i].key2_not_front_wild.append(False)
501 new_recs[i].key2_not_back_wild.append(False)
502 elif j==2:
503 new_recs[i].key3_not.append(str_temp)
504 nkeynot=len(new_recs[i].key3_not)-1
505 new_recs[i].key3_not_front_wild.append(False)
506 new_recs[i].key3_not_back_wild.append(False)
507
508 if len(key_str)>0:
509 if j==0:
510 new_recs[i].key1_and.append(key_str)
511 nkeyand=len(new_recs[i].key1_and)-1
512 new_recs[i].key1_and_front_wild.append(False)
513 new_recs[i].key1_and_back_wild.append(False)
514 elif j==1:
515 new_recs[i].key2_and.append(key_str)
516 nkeyand=len(new_recs[i].key2_and)-1
517 new_recs[i].key2_and_front_wild.append(False)
518 new_recs[i].key2_and_back_wild.append(False)
519 elif j==2:
520 new_recs[i].key3_and.append(key_str)
521 nkeyand=len(new_recs[i].key3_and)-1
522 new_recs[i].key3_and_front_wild.append(False)
523 new_recs[i].key3_and_back_wild.append(False)
524 key_str=""
525
526 temp_list=[]
527 if j==0:
528 temp_list=new_recs[i].key1_and
529 elif j==1:
530 temp_list=new_recs[i].key2_and
531 elif j==2:
532 temp_list=new_recs[i].key3_and
533
534 nkeyand=-1
535 for s in temp_list:
536 nkeyand += 1
537 front_wild=False
538 back_wild=False
539 if len(s)==0:
540 front_wild=True
541 back_wild=True
542 key_str=""
543 elif s.startswith("*"):
544 key_str=s[1:]
545 front_wild=True
546 if len(key_str)==0:
547 back_wild=True
548 elif key_str.endswith("*"):
549 key_str=key_str[:len(key_str)-1]
550 back_wild=True
551 elif s.endswith("*"):
552 key_str=s[:len(s)-1]
553 back_wild=True
554 else:
555 key_str=s
556
557 if j==0:
558 new_recs[i].key1_and[nkeyand]=key_str
559 new_recs[i].key1_and_front_wild[nkeyand]=front_wild
560 new_recs[i].key1_and_back_wild[nkeyand]=back_wild
561 elif j==1:
562 new_recs[i].key2_and[nkeyand]=key_str
563 new_recs[i].key2_and_front_wild[nkeyand]=front_wild
564 new_recs[i].key2_and_back_wild[nkeyand]=back_wild
565 elif j==2:
566 new_recs[i].key3_and[nkeyand]=key_str
567 new_recs[i].key3_and_front_wild[nkeyand]=front_wild
568 new_recs[i].key3_and_back_wild[nkeyand]=back_wild
569
570 temp_list=[]
571 if j==0:
572 temp_list=new_recs[i].key1_not
573 elif j==1:
574 temp_list=new_recs[i].key2_not
575 elif j==2:
576 temp_list=new_recs[i].key3_not
577
578 nkeynot=-1
579 for s in temp_list:
580 nkeynot += 1
581 front_wild=False
582 back_wild=False
583 if len(s)==0:
584 front_wild=True
585 back_wild=True
586 key_str=""
587 elif s.startswith("*"):
588 key_str=s[1:]
589 front_wild=True
590 if len(key_str)==0:
591 back_wild=True
592 elif key_str.endswith("*"):
593 key_str=key_str[:-1]
594 back_wild=True
595 elif s.endswith("*"):
596 key_str=s[:len(s)-1]
597 back_wild=True
598 else:
599 key_str=s
600
601 if j==0:
602 new_recs[i].key1_not[nkeynot]=key_str
603 new_recs[i].key1_not_front_wild[nkeynot]=front_wild
604 new_recs[i].key1_not_back_wild[nkeynot]=back_wild
605 elif j==1:
606 new_recs[i].key2_not[nkeynot]=key_str
607 new_recs[i].key2_not_front_wild[nkeynot]=front_wild
608 new_recs[i].key2_not_back_wild[nkeynot]=back_wild
609 elif j==2:
610 new_recs[i].key3_not[nkeynot]=key_str
611 new_recs[i].key3_not_front_wild[nkeynot]=front_wild
612 new_recs[i].key3_not_back_wild[nkeynot]=back_wild
613
614 lkdict.recs.clear()
615 for rec in new_recs:
616 lkdict.recs.append(rec)
617
618 except (RuntimeError, ValueError, OSError) as err:
619 err_msg= "notok:" + str(err)
620
621 if len(err_msg)>0:
622 lkdict.title=err_msg
623 return lkdict
LookUpDict make_lookup_from_file(str title, str file_uri, str delim, bool is_case_sens, int num_keys)
Definition lookup.py:227
dict make_lookups(list lookups)
Definition lookup.py:168
LookUpDict make_lookup_from_list(str title, list lkup_list, str delim, bool is_case_sens, int num_keys)
Definition lookup.py:320
LookUpDict extract_lookup_record_key_info(LookUpDict lkdict)
Definition lookup.py:412