-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcsv_analyzer.py
executable file
·491 lines (418 loc) · 19.5 KB
/
csv_analyzer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
#! /usr/bin/env python3
import os
import sys
import argparse
import configparser
import signal
import time
from datetime import datetime, timedelta
import json
import csv
import glob
#----------------------------------------
def pause(question='PRESS ENTER TO CONTINUE ...'):
    """Prompt and wait for the user (debug aid).

    Returns the text entered, or None if the user hit Ctrl-C, in which
    case the global shutDown flag is also raised.
    """
    global shutDown
    try:
        answer = input(question)
    except KeyboardInterrupt:
        shutDown = True
        answer = None
    return answer
#----------------------------------------
def signal_handler(signum, frame):
    """Handle SIGINT: request a graceful shutdown via the shutDown flag.

    Registered with signal.signal() in __main__; the processing loops
    poll shutDown so files can be closed cleanly instead of dying mid-row.
    Parameters renamed from (signal, frame) so the first no longer shadows
    the imported signal module; the runtime invokes handlers positionally.
    """
    global shutDown
    print('USER INTERRUPT! Shutting down ... (please wait)')  #--typo fix: was "INTERUPT"
    shutDown = True
    return
#----------------------------------------
def getNextRow(fileInfo):
    """Return (fileInfo, row) for the next non-blank, parsable input row.

    fileInfo is a mutable state dict carrying the open 'reader', running
    'rowCnt'/'skipCnt' counters, the 'fieldDelimiter', and (optionally)
    the 'header' list.  Rows come back as a dict when a header is known,
    as a list of values otherwise (i.e. the header row itself), or as
    None at end of file.  After 10 consecutive bad rows fileInfo['ERROR']
    is set and the scan stops.
    """
    errCnt = 0
    csv_data = None
    while not csv_data:
        #--quit for consecutive errors
        if errCnt >= 10:
            fileInfo['ERROR'] = 'YES'
            print()
            print('Shutdown due to too many errors')
            break
        try:
            line = next(fileInfo['reader'])
        except StopIteration:
            break
        except Exception:  #--bug fix: was a bare except that also swallowed SystemExit/KeyboardInterrupt
            print(' row %s: %s' % (fileInfo['rowCnt'], sys.exc_info()[0]))
            fileInfo['skipCnt'] += 1
            errCnt += 1
            continue
        fileInfo['rowCnt'] += 1
        if not line:  #--skip empty lines
            print(' row %s is blank' % fileInfo['rowCnt'])
            fileInfo['skipCnt'] += 1
            continue
        if fileInfo['fieldDelimiter'] == 'JSON':
            csv_data = json.loads(line)
            #--bug fix: ensure the header list exists before growing it
            #--(first call arrives before the caller assigns 'header')
            fileInfo.setdefault('header', [])
            for attr in csv_data:
                if type(csv_data[attr]) not in (list, dict) and attr not in fileInfo['header']:
                    fileInfo['header'].append(attr)
            return fileInfo, csv_data
        #--csv reader will return a list (mult-char delimiter must be manually split)
        if type(line) == list:
            row = line
        else:
            row = [removeQuoteChar(x.strip()) for x in line.split(fileInfo['fieldDelimiter'])]
        #--turn into a dictionary if there is a header
        if 'header' in fileInfo:
            #--column mismatch
            if len(row) != len(fileInfo['header']):
                print(' row %s has %s columns, expected %s' % (fileInfo['rowCnt'], len(row), len(fileInfo['header'])))
                fileInfo['skipCnt'] += 1
                errCnt += 1
                continue
            #--is it the header row (first and last cells match the known header)
            elif str(row[0]).upper() == fileInfo['header'][0].upper() and str(row[len(row)-1]).upper() == fileInfo['header'][len(fileInfo['header'])-1].upper():
                fileInfo['skipCnt'] += 1
                if fileInfo['rowCnt'] != 1:
                    print(' row %s contains the header' % fileInfo['rowCnt'])
                    errCnt += 1
                continue
            #--return a good row
            else:
                csv_data = dict(zip(fileInfo['header'], [str(x).strip() for x in row]))
        else:  #--if not just return what should be the header row
            fileInfo['skipCnt'] += 1
            csv_data = [str(x).strip() for x in row]
    return fileInfo, csv_data
#----------------------------------------
def removeQuoteChar(s):
    """Strip one matching pair of surrounding single or double quotes from s.

    Mixed or unmatched quote characters are left alone.
    """
    #--bug fix: was len(s) > 2, which left the empty quoted field ("" or '') intact
    if len(s) >= 2 and s[0] + s[-1] in ("''", '""'):
        return s[1:-1]
    return s
#----------------------------------------
def analyzeFile():
    """Analyze csv/json input file(s) and report per-column statistics.

    Reads the module level globals set in __main__ (inputFileName,
    fieldDelimiter, fileEncoding, outputFileName, mappingFileName,
    pythonModuleFile, pythonMapperClass, templateFile).  Tallies value
    counts for every column, writes a statistics report (csv file or
    stdout), and optionally generates/updates a mapping file and a
    python mapper module from a template.
    Returns 0/False on success, 1/True on error or user interrupt.
    """
    global shutDown
    statPack = {}
    totalRowCnt = 0

    #--get parameters from a mapping file
    mappingDoc = {'input': {}}
    if mappingFileName and os.path.exists(mappingFileName):
        try: mappingDoc = json.load(open(mappingFileName, 'r'))
        except ValueError as err:
            print('')
            print('mapping file error: %s in %s' % (err, mappingFileName))
            return 1

    #--override mapping document with parameters
    if inputFileName or 'inputFileName' not in mappingDoc['input']:
        mappingDoc['input']['inputFileName'] = inputFileName
    if fieldDelimiter or 'fieldDelimiter' not in mappingDoc['input']:
        mappingDoc['input']['fieldDelimiter'] = fieldDelimiter
    if fileEncoding or 'fileEncoding' not in mappingDoc['input']:
        mappingDoc['input']['fileEncoding'] = fileEncoding
    if 'columnHeaders' not in mappingDoc['input']:
        mappingDoc['input']['columnHeaders'] = []

    #--get the input file (may be a glob pattern matching several files)
    if not mappingDoc['input']['inputFileName']:
        print('')
        print('no input file supplied')
        return 1
    fileList = glob.glob(mappingDoc['input']['inputFileName'])
    if len(fileList) == 0:
        print('')
        #--bug fix: reported the (possibly None) inputFileName global, not the resolved pattern
        print('%s not found' % mappingDoc['input']['inputFileName'])
        return 1

    #--need a test record for python module
    testRecord = None

    #--for each input file
    for fileName in fileList:
        print('')
        print('Analyzing %s ...' % fileName)
        currentFile = {}
        currentFile['name'] = fileName
        currentFile['rowCnt'] = 0
        currentFile['skipCnt'] = 0

        #--open the file
        if mappingDoc['input']['fileEncoding']:
            currentFile['fileEncoding'] = mappingDoc['input']['fileEncoding']
            currentFile['handle'] = open(fileName, 'r', encoding=mappingDoc['input']['fileEncoding'])
        else:
            currentFile['handle'] = open(fileName, 'r')

        #--set the dialect, sniffing the delimiter from line 1 if not supplied
        currentFile['fieldDelimiter'] = mappingDoc['input']['fieldDelimiter']
        if not mappingDoc['input']['fieldDelimiter']:
            sniffer = csv.Sniffer().sniff(currentFile['handle'].readline(), delimiters='|,\t')
            currentFile['handle'].seek(0)
            currentFile['fieldDelimiter'] = sniffer.delimiter
            mappingDoc['input']['fieldDelimiter'] = sniffer.delimiter
        if mappingDoc['input']['fieldDelimiter'].lower() in ('csv', 'comma', ','):
            currentFile['csvDialect'] = 'excel'
        elif mappingDoc['input']['fieldDelimiter'].lower() in ('tab', 'tsv', '\t'):
            currentFile['csvDialect'] = 'excel-tab'
        elif mappingDoc['input']['fieldDelimiter'].lower() in ('pipe', '|'):
            csv.register_dialect('pipe', delimiter = '|', quotechar = '"')
            currentFile['csvDialect'] = 'pipe'
        elif len(mappingDoc['input']['fieldDelimiter']) == 1:
            #--bug fix: referenced an undefined "delimiter" variable (NameError)
            csv.register_dialect('other', delimiter = mappingDoc['input']['fieldDelimiter'], quotechar = '"')
            currentFile['csvDialect'] = 'other'
        elif len(mappingDoc['input']['fieldDelimiter']) > 1:
            currentFile['csvDialect'] = 'multi'
        else:
            currentFile['csvDialect'] = 'excel'
        mappingDoc['input']['csvDialect'] = currentFile['csvDialect']

        #--set the reader (csv cannot be used for multi-char delimiters)
        if currentFile['csvDialect'] != 'multi':
            currentFile['reader'] = csv.reader(currentFile['handle'], dialect=currentFile['csvDialect'])
        else:
            currentFile['reader'] = currentFile['handle']

        #--get the current file header row and use it if not one already
        currentFile, currentHeaders = getNextRow(currentFile)
        if not mappingDoc['input']['columnHeaders']:
            mappingDoc['input']['columnHeaders'] = [str(x).replace(' ', '_') for x in currentHeaders]
        currentFile['header'] = mappingDoc['input']['columnHeaders']

        #--initialize the statpack first time through
        if not statPack:
            colNum = 0
            errCnt = 0
            for columnName in mappingDoc['input']['columnHeaders']:
                colNum += 1
                if not columnName:
                    print(' column %s header is blank!' % colNum)
                    errCnt += 1
                if columnName not in statPack:
                    statPack[columnName] = {'null': 0}
            if errCnt:
                print()
                print('Row 1 does not contain a valid column header!')
                currentFile['handle'].close()  #--bug fix: was ".close" without parens, never closed
                return 1

        #--process the rows in the input file
        currentFile, rowData = getNextRow(currentFile)
        while rowData:
            totalRowCnt += 1
            if not testRecord:
                testRecord = rowData
            if pythonMapperClass:
                rowData = pythonMapperClass.process(rowData)
                print(rowData)

            #--for each column
            for columnName in currentFile['header']:
                if columnName not in rowData: #--may not be if json
                    statPack[columnName]['null'] += 1
                    continue
                if columnName not in statPack: #--may not be if json
                    statPack[columnName] = {'null': 0}
                if columnName not in mappingDoc['input']['columnHeaders']:
                    mappingDoc['input']['columnHeaders'].append(columnName) #--may not be if json
                columnValue = str(rowData[columnName]).strip()
                if not columnValue or columnValue.strip().upper() in ('NONE', 'NULL', '\\N'):
                    statPack[columnName]['null'] += 1
                    continue
                if columnValue in statPack[columnName]:
                    statPack[columnName][columnValue] += 1
                else:
                    statPack[columnName][columnValue] = 1
            currentFile, rowData = getNextRow(currentFile)

            #--break conditions
            if shutDown:
                break
            elif 'ERROR' in currentFile:
                break
            if currentFile['rowCnt'] % 10000 == 0:
                print(' %s records processed' % currentFile['rowCnt'])

        currentFile['handle'].close()
        if shutDown:
            break
        else:
            print(' %s records processed, complete!' % currentFile['rowCnt'])

    #--export the analysis
    if outputFileName:
        try: outputFileHandle = open(outputFileName, 'w', newline='')
        except IOError as err:
            print()
            print('Could not write to %s' % outputFileName)
            return 1
        outputFileWriter = csv.writer(outputFileHandle, dialect=csv.excel, quoting=csv.QUOTE_ALL)

    bestRecordID = '<remove-or-supply>'
    possibleMappings = []
    columnHeaders = "columnName,recordCount,percentPopulated,uniqueCount,uniquePercent,topValue1,topValue2,topValue3,topValue4,topValue5"
    if outputFileName:
        outputFileWriter.writerow(columnHeaders.split(','))
    else:
        print(columnHeaders)
    for columnName in mappingDoc['input']['columnHeaders']:
        recordCount = totalRowCnt - statPack[columnName]['null']
        #--guard: input may have contained a header but no data rows
        percentPopulated = round(recordCount / totalRowCnt * 100, 2) if totalRowCnt else 0
        uniqueCount = len(statPack[columnName]) - 1
        uniquePercent = round(uniqueCount / recordCount * 100, 2) if recordCount else 0

        #--first 100% unique field is recordID
        #--bug fix: placeholder test used '%', so the LAST unique column kept overwriting the first
        if totalRowCnt and uniqueCount == totalRowCnt and bestRecordID.startswith('<'):
            bestRecordID = columnName

        #--five most frequent non-null values as "value (count)" strings, padded to 5
        topValue = []
        for value in sorted(statPack[columnName].items(), key=lambda x: x[1], reverse=True):
            if value[0] != 'null':
                topValue.append('%s (%s)' % value)
                if len(topValue) == 5:
                    break
        while len(topValue) < 5:
            topValue.append('')

        #--create possible mapping
        if recordCount:
            columnMapping = {}
            columnMapping['attribute'] = '<ignore>'
            columnMapping['mapping'] = '%(' + columnName + ')s'
            columnMapping['statistics'] = {}
            columnMapping['statistics']['columnName'] = columnName
            columnMapping['statistics']['populated%'] = percentPopulated
            columnMapping['statistics']['unique%'] = uniquePercent
            columnMapping['statistics']['top5values'] = [item for item in topValue if item]
            possibleMappings.append(columnMapping)

        rowData = (columnName, recordCount, percentPopulated, uniqueCount, uniquePercent, topValue[0], topValue[1], topValue[2], topValue[3], topValue[4])
        if outputFileName:
            outputFileWriter.writerow(rowData)
        else:
            print('"%s", %s, %s, %s, %s, "%s", "%s", "%s", "%s", "%s"' % rowData)

    #--close the output file
    if outputFileName:
        outputFileHandle.close()
        print('\nStatistics written to %s' % outputFileName)

    #--create or update the mapping file if provided
    if mappingFileName:
        if not mappingDoc['input']['fieldDelimiter']:
            del mappingDoc['input']['fieldDelimiter']
        if not mappingDoc['input']['fileEncoding']:
            del mappingDoc['input']['fileEncoding']
        if 'calculations' not in mappingDoc:
            mappingDoc['calculations'] = []
        if 'outputs' not in mappingDoc:
            mappingDoc['outputs'] = []
        outputDoc = {}
        outputDoc['data_source'] = '<supply>'
        outputDoc['record_type'] = 'GENERIC'
        outputDoc['record_id'] = '<remove_or_supply>'
        outputDoc['attributes'] = possibleMappings
        mappingDoc['outputs'].append(outputDoc)
        try:
            with open(mappingFileName, 'w') as f:
                json.dump(mappingDoc, f, indent=4)
        except Exception as e:
            print()
            print('Could not write to %s' % mappingFileName)
            print(e)
        else:  #--bug fix: success was reported even when the write failed
            print('\nMapping file written to %s' % mappingFileName)

    #--create the python module from the template file if requested
    if pythonModuleFile:
        codeLines = []
        with open(templateFile, 'r') as f:
            for line in f:
                if line.strip() == "input_file = '<input_file_name>'":
                    codeLines.append(line.replace('<input_file_name>', inputFileName))
                elif line.strip() == "csv_dialect = '<dialect>'":
                    if mappingDoc['input']['csvDialect'] == 'pipe':
                        registerDialect = "csv.register_dialect('pipe', delimiter = '|')"
                    elif mappingDoc['input']['csvDialect'] == 'other':
                        registerDialect = "csv.register_dialect('other', delimiter = '" + mappingDoc['input']['fieldDelimiter'] + "')"
                    else:
                        registerDialect = None
                    if registerDialect:
                        codeLines.append((' ' * (len(line) - len(line.strip()) -1)) + registerDialect + '\n')
                    codeLines.append(line.replace('<dialect>', mappingDoc['input']['csvDialect']))
                elif line.strip() == "input_file_handle = open(args.input_file, 'r')" and mappingDoc['input']['fileEncoding']:
                    codeLines.append(line.replace(')', ", encoding='%s')" % mappingDoc['input']['fileEncoding']))
                elif line.strip() == "json_data['RECORD_ID'] = '<supply>'":
                    if not bestRecordID.startswith('<'):
                        line = line.replace("'<supply>'", "raw_data['%s']" % bestRecordID)
                    codeLines.append(line)
                elif line.strip() == '#--column mappings':
                    codeLines.append(line)
                    for columnMapping in possibleMappings:
                        codeLines.append('\n')
                        codeLines.append(" # columnName: %s\n" % columnMapping['statistics']['columnName'])
                        codeLines.append(" # %s populated, %s unique\n" % (columnMapping['statistics']['populated%'], columnMapping['statistics']['unique%']))
                        for item in columnMapping['statistics']['top5values']:
                            codeLines.append(" # %s\n" % item)
                        codeLines.append(" json_data['%s'] = raw_data['%s']\n" % (columnMapping['statistics']['columnName'], columnMapping['statistics']['columnName']))
                elif line.strip().startswith('raw_data = {'):
                    if not testRecord:
                        codeLines.append(line)
                    else:
                        codeLines.append(' raw_data = {}\n')
                        for key in testRecord:
                            val = testRecord[key]
                            if type(val) == str:
                                val = '"' + val + '"'
                            codeLines.append(' raw_data["%s"] = %s\n' % (key, val))
                else:
                    codeLines.append(line)
        try:
            with open(pythonModuleFile, 'w') as f:
                for line in codeLines:
                    f.write(line)
        except Exception as e:
            print()
            print('Could not write to %s' % pythonModuleFile)
            print(e)
        else:  #--bug fix: success was reported even when the write failed
            print('\nPython module written to %s' % pythonModuleFile)

    return shutDown
def fileBackup(thisFileName):
    """Rename thisFileName to the first free backup slot (.bk, .bk1 .. .bk9).

    Returns True once a rename succeeds; False if all ten slots are taken.
    """
    suffixes = [''] + list(range(1, 10))
    for suffix in suffixes:
        candidate = thisFileName + '.bk%s' % suffix
        if os.path.exists(candidate):
            print('\t%s already exists!' % candidate)
            continue
        os.rename(thisFileName, candidate)
        print('\t%s created' % candidate)
        return True
    return False
#----------------------------------------
if __name__ == "__main__":
procStartTime = time.time()
shutDown = False
signal.signal(signal.SIGINT, signal_handler)
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--inputFileName', dest='inputFileName', help='the name of a csv input file')
parser.add_argument('-d', '--fieldDelimiter', dest='fieldDelimiter', help='delimiter character')
parser.add_argument('-e', '--fileEncoding', dest='fileEncoding', help='file encoding')
parser.add_argument('-o', '--outputFileName', dest='outputFileName', help='the name of the output file')
parser.add_argument('-m', '--mappingFileName', dest='mappingFileName', help='optional name of a mapping file to generate')
parser.add_argument('-p', '--pythonModuleFile', dest='pythonModuleFile', help='optional name of a python module file to generate')
args = parser.parse_args()
inputFileName = args.inputFileName
fieldDelimiter = args.fieldDelimiter
fileEncoding = args.fileEncoding
outputFileName = args.outputFileName
mappingFileName = args.mappingFileName
pythonModuleFile = args.pythonModuleFile
if not inputFileName:
print('\nAn input file name is required\n')
sys.exit(1)
if mappingFileName and os.path.exists(mappingFileName):
response = input('\nMapping file already exists!!, overwrite it? (Y/N) ')
if response[0:1].upper() != 'Y':
mappingFileName = None
print('\nmapping file will be preserved!')
elif not fileBackup(mappingFileName):
print('\nAborted, backup failed!\n')
sys.exit(1)
pythonMapperClass = None
if pythonModuleFile and os.path.exists(pythonModuleFile):
response = input('\nPython module already exists!, overwrite it? (Y/N) ')
if response[0:1].upper() != 'Y':
pythonModuleFile = None
print('\nmapping file will be preserved!')
elif not fileBackup(pythonModuleFile):
print('\nAborted, backup failed!\n')
sys.exit(1)
#--make sure the template file exists
if pythonModuleFile:
templateFile = os.path.dirname(__file__) + os.path.sep + 'python_template.py'
if not os.path.exists(templateFile):
print(f'\nCannot find {templateFile}\n')
sys.exit(1)
result = analyzeFile()
print('')
if result != 0:
print('process aborted!')
else:
print('process completed!')
print('')
sys.exit(result)