VerityPy 1.1
Python library for Verity data profiling, quality control, remediation
transform.py
Go to the documentation of this file.
#!/usr/bin/env python
"""
Class definition of Transform and its child Op, with functions

Functions to read and write transforms in text file, pre-process
transforms to make lists of lookup dicts used,
as well as fields referenced.
"""

# Public names exported by `from <module> import *`.
__all__ = ['Op',
           'Transform',
           'read_transforms_from_file',
           'write_transforms_to_file',
           'extract_lookup_titles',
           'set_lookup_fields',
           'extract_refs',
           ]
__version__ = '1.0'
__author__ = 'Geoffrey Malafsky'
__email__ = 'gmalafsky@technikinterlytics.com'
__date__ = '20240721'
22
23
from ..processing import numfuncs
25
26
class Op:
    """
    Operation (Op) for transforms.

    Each Op has a title which is the name of the function.
    It specifies the function and its category along with several possible
    parameters (Param1, Param2, Param3) that depend on the specific
    function.

    Attributes:
        title: name of the function.
        category: function category; the constructor sets it to "".
        param1, param2, param3: function-specific parameter strings.
        order: integer order value of the Op; -1 when not supplied.
        p1list, p2list, p3list: per-instance lists, created empty.
    """

    title:str
    category:str
    param1:str
    param2:str
    param3:str
    order:int = -1
    p1list:list
    p2list:list
    p3list:list

    def __init__(self, title:str, param1:str="", param2:str="", param3:str="", order:int=-1):
        """
        Create an Op.

        title: name of the function
        param1, param2, param3: optional parameter strings (default "")
        order: optional order value (default -1)
        """
        self.title = title
        self.category = ""
        self.param1 = param1
        self.param2 = param2
        self.param3 = param3
        self.order = order
        # Fresh list objects per instance so that Ops never share state.
        self.p1list = []
        self.p2list = []
        self.p3list = []
56
class Transform:
    """
    Transform object for modifying source field value

    A transform contains a sequence of operations that may include
    conditional testing of values and referenced fields. It operates on the
    field whose name is the title of the transform.
    This field may be a source data
    field or an enrichment field added to the output record.

    Attributes:
        title: name of the field the transform operates on.
        ops: list of operation objects; created empty.
    """

    title:str
    ops:list

    def __init__(self, title:str):
        """Create a transform for the field named title with no ops."""
        self.title = title
        self.ops = []

    def get_json(self, add_lf:bool=False, add_quote:bool=False):
        """
        Get JSON string of transform properties

        Constructs a JSON string of the transform.
        add_lf: whether to add line feed at ends of each JSON property (default false)
        add_quote: whether to enclose keys and values in double quotes (default false)

        Returns the constructed string. Every value, including each op's
        order, is emitted as text and is enclosed in quotes when add_quote
        is true. If a RuntimeError occurs the result is "error:" followed
        by the error text.
        """

        lf:str = "\n" if add_lf else ""
        dq:str = "\"" if add_quote else ""

        def prop(key:str, value:str) -> str:
            # One key:value pair with optional quoting on both sides.
            return dq + key + dq + ":" + dq + value + dq

        try:
            op_parts:list = []
            for op in self.ops:
                fields = [
                    prop("title", op.title),
                    prop("order", str(op.order)),
                    prop("param1", op.param1),
                    prop("param2", op.param2),
                    prop("param3", op.param3),
                ]
                op_parts.append("{" + ",".join(fields) + "}" + lf)

            result = "{" + dq + "transform" + dq + ":["
            result += "{" + prop("title", self.title) + "}" + lf
            result += ",{" + dq + "ops" + dq + ":[" + lf
            # Ops after the first are preceded by a comma.
            result += ",".join(op_parts)
            result += "]}" + lf  # close ops
            result += "]" + lf  # close transform
            result += "}" + lf  # close object
        except RuntimeError as err:
            result = "error:" + str(err)
        return result
117
118
def read_transforms_from_file(file_uri:str) -> list:
    """
    Read transform file. Reads text file containing JSON formatted specification of transforms.

    The text is not parsed as general JSON: line breaks and all double
    quotes are removed, then the text is split on the literal markers
    "{transform:[", "{title:", "{ops:[", ",order:" and "paramN:".

    file_uri: file must exist and be accessible. Otherwise, returned title will begin with notok: and have error message
    Returns transforms list of Transform objects. If error occurs then transforms[0].title will start with notok:
    """
    dq:str = "\""
    transforms:list = []
    hash_transforms:dict = {}  # lower-cased transform title -> index in transforms
    try:
        file_uri = file_uri.strip()
        if len(file_uri) == 0:
            raise ValueError("missing file_uri")

        with open(file_uri, "r", encoding="utf-8") as f:
            text = f.read()
        text = text.replace("\r\n", "").replace("\r", "").replace("\n", "")
        text = text.replace(dq, "")
        if "{transform:[" not in text:
            raise ValueError("no transforms found in read file lines")

        for segment in text.split("{transform:["):
            body = segment
            if body.startswith(","):
                body = body[1:]
            # Peel at most one trailing ",", then "}", then "]".
            for tail in (",", "}", "]"):
                if body.endswith(tail):
                    body = body[:-1]
            if "{title:" not in body:
                # e.g. the empty text before the first transform marker
                continue

            body = body[body.find("{title:") + 7:]
            title = body[:body.find("}")].strip()
            body = body[body.find("}") + 1:]
            if body.startswith(","):
                body = body[1:]
            if len(title) == 0:
                raise ValueError("transform title is empty")
            if title.lower() in hash_transforms:
                raise ValueError("duplicate transform title: " + title.lower())
            tobj = Transform(title)

            if "{ops:[" in body:
                body = body[body.find("{ops:[") + 6:]
                for tail in (",", "}", "]"):
                    if body.endswith(tail):
                        body = body[:-1]
                op_texts:list = []
                if "{title:" in body:
                    op_texts = body.split("{title:")
                for op_text in op_texts:
                    if ",order:" not in op_text:
                        continue
                    op_title = op_text[:op_text.find(",order:")]
                    props = op_text[op_text.find(",order:") + 1:]
                    if props.endswith(","):
                        props = props[:-1]
                    if props.endswith("}"):
                        props = props[:-1]
                    if props.startswith(","):
                        props = props[1:]
                    if len(op_title) == 0:
                        # No Op is created, so there is nothing to attach
                        # order/params to.
                        continue
                    opobj = Op(op_title)
                    tobj.ops.append(opobj)
                    if "order:" in props:
                        value = props[props.find("order:") + 6:]
                        # Take the text up to the next comma, or all of it
                        # when order is the last property. Slicing with
                        # find(",") == -1 would drop the final character.
                        value = value.partition(",")[0]
                        if numfuncs.is_int(value):
                            opobj.order = int(value)
                    if "param1:" in props:
                        value = props[props.find("param1:") + 7:]
                        if ",param" in value:
                            value = value[:value.find(",param")]
                        opobj.param1 = value
                    if "param2:" in props:
                        value = props[props.find("param2:") + 7:]
                        if ",param" in value:
                            value = value[:value.find(",param")]
                        opobj.param2 = value
                    if "param3:" in props:
                        # param3 takes everything that remains
                        opobj.param3 = props[props.find("param3:") + 7:]

            hash_transforms[tobj.title.lower()] = len(transforms)
            transforms.append(tobj)
    except (RuntimeError, ValueError, OSError) as err:
        # Report the failure through the title of a single placeholder transform.
        err_msg = "notok:" + str(err)
        transforms.clear()
        transforms.append(Transform(err_msg))
    return transforms
218
219
def write_transforms_to_file(file_uri:str, transforms:list) -> str:
    """
    Write transform file

    Writes text file containing JSON formatted specification of transforms.
    Each transform is written with get_json(True,True) and consecutive
    transforms are separated by a comma.

    file_uri: path of the file to write. It is opened in write mode, so
        existing contents are replaced.
    transforms: list of Transform object to write to file in JSON
    Returns message that starts with notok: if error occurs, otherwise an
    empty string.
    """
    err_msg:str = ""
    try:
        file_uri = file_uri.strip()
        if len(file_uri) == 0:
            raise ValueError("missing file_uri")
        if len(transforms) == 0:
            raise ValueError("missing transforms")
        # Validate every element before opening the file: opening in "w"
        # mode truncates it, so a late type error would leave it damaged.
        for t in transforms:
            if not isinstance(t, Transform):
                raise ValueError("object in transforms list is not type=Transform")

        with open(file_uri, "w", encoding="utf-8") as f:
            f.write(",".join(t.get_json(True, True) for t in transforms))
    except (RuntimeError, ValueError, OSError) as err:
        err_msg = "notok:" + str(err)
    return err_msg
250
251
def extract_lookup_titles(transforms:list) -> list:
    """
    Extract list of lookup dict titles used in transforms

    An Op contributes when its title, lower-cased, starts with "lookup";
    the title taken is the Op's param1 in lower case. Each title appears
    once, in order of first use.

    transforms: list of Transform objects
    Returns list of lookup dict titles. If error occurs, 0th entry will have
    title starting with notok:
    """

    lkupdicts:list = []
    seen:set = set()
    if not transforms:
        # Empty (or missing) input has nothing to extract.
        return lkupdicts
    try:
        for t in transforms:
            if not isinstance(t, Transform):
                raise ValueError("object in transforms list is not type=Transform")
            for op in t.ops:
                if not isinstance(op, Op):
                    raise ValueError("object in transform " + t.title + " Ops is not type=Op")
                if not op.title.lower().startswith("lookup"):
                    continue
                name = op.param1.lower()
                if len(name) > 0 and name not in seen:
                    seen.add(name)
                    lkupdicts.append(name)
    except (RuntimeError, ValueError, OSError) as err:
        # Titles collected before the error are kept after the notok entry.
        lkupdicts.insert(0, "notok:" + str(err))
    return lkupdicts
280
281
def set_lookup_fields(transforms:list) -> list:
    """
    Finds fields set in param2,param3 of transform Ops for lookup

    For every Op whose title, lower-cased, starts with "lookup", param2 and
    param3 are lower-cased, split on "|", and the non-empty trimmed parts
    are stored in p2list and p3list. The lists are rebuilt on every call,
    so calling this function again does not duplicate entries.

    transforms: list of Transform objects
    Returns the same transforms list that was passed in, with lookup Ops
    having fields in param2,param3 extracted into p2list,p3list. If an
    object of the wrong type is found, processing stops there and the list
    is returned as is; no error is reported.
    """

    def split_fields(text:str) -> list:
        # Lower-case, split on "|", trim each part and drop empty ones.
        return [part.strip() for part in text.lower().split("|") if len(part.strip()) > 0]

    if not transforms:
        return transforms
    try:
        for t in transforms:
            if not isinstance(t, Transform):
                raise ValueError("object in transforms list is not type=Transform")
            for op in t.ops:
                if not isinstance(op, Op):
                    raise ValueError("object in transform " + t.title + " Ops is not type=Op")
                if op.title.lower().startswith("lookup"):
                    op.p2list = split_fields(op.param2)
                    op.p3list = split_fields(op.param3)
    except (RuntimeError, ValueError, OSError):
        # Best effort: the return value has no channel for an error
        # message, so the list is returned with whatever was set so far.
        pass
    return transforms
320
321
def extract_refs(transforms:list) -> dict:
    """
    Extract referenced fields used in transforms

    Which parameters are read depends on the Op title in lower case:
      - ends with "ref": param1
      - ends with "refs": param1, param2 and param3
      - equals "settofreqlist": param3
      - equals "lookup": param2, split at the first "|" into at most two
        names (the text after the first "|" is counted as one name)
    All names are lower-cased before counting.

    transforms: list of Transform objects
    Returns dictionary with keys= field titles and their values=number instances used. If error occurs, there will be a
    key starting with notok:<error reason>
    """

    hash_fields:dict = {}

    def count(name:str) -> None:
        # Count one use of a field name; empty names are ignored.
        if len(name) > 0:
            hash_fields[name] = hash_fields.get(name, 0) + 1

    if not transforms:
        return hash_fields
    try:
        for t in transforms:
            if not isinstance(t, Transform):
                raise ValueError("object in transforms list is not type=Transform")
            for op in t.ops:
                # Check the type before touching op.title so that a wrong
                # object gives the notok: result, not an AttributeError.
                if not isinstance(op, Op):
                    raise ValueError("object in transform " + t.title + " Ops is not type=Op")
                optitle = op.title.lower()
                if optitle.endswith(("ref", "refs")):
                    count(op.param1.lower())
                    if optitle.endswith("refs"):
                        count(op.param2.lower())
                        count(op.param3.lower())
                elif optitle == "settofreqlist":
                    count(op.param3.lower())
                elif optitle == "lookup":
                    parm_str = op.param2.lower()
                    if len(parm_str) > 0:
                        first, _, rest = parm_str.partition("|")
                        # The part before "|" is always counted, even when
                        # it is empty (e.g. param2 "|b").
                        hash_fields[first] = hash_fields.get(first, 0) + 1
                        count(rest)
    except (RuntimeError, ValueError, OSError) as err:
        hash_fields["notok:" + str(err)] = 0
    return hash_fields
__init__(self, str title, str param1="", str param2="", str param3="", int order=-1)
Definition transform.py:46
get_json(self, bool add_lf, bool add_quote)
Definition transform.py:75
str write_transforms_to_file(str file_uri, list transforms)
Definition transform.py:220
list extract_lookup_titles(list transforms)
Definition transform.py:252
dict extract_refs(list transforms)
Definition transform.py:322
list read_transforms_from_file(str file_uri)
Definition transform.py:119
list set_lookup_fields(list transforms)
Definition transform.py:282