-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcsv2json.py
157 lines (118 loc) · 4.04 KB
/
csv2json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
from collections import defaultdict
import configparser
import csv
import json
import re
import sys
import time

try:
    from distutils.util import strtobool
except ImportError:  # distutils was removed in Python 3.12 (PEP 632)
    strtobool = None
# Load the column names and escape markers that drive the conversion from the
# DEFAULT section of config.ini (looked up relative to the working directory).
config = configparser.ConfigParser()
try:
    # ConfigParser.read() silently ignores a file it cannot open, so a missing
    # config.ini shows up below as a KeyError on the first option lookup.
    config.read('config.ini')
    rootColumn = config['DEFAULT']['rootColumn']
    parentColumn = config['DEFAULT']['parentColumn']
    nodeColumn = config['DEFAULT']['nodeColumn']
    subItemEscapeChar = config['DEFAULT']['subItemEscapeChar']
    simpleListEscapeChar = config['DEFAULT']['simpleListEscapeChar']
    booleanEscapeChar = config['DEFAULT']['booleanEscapeChar']
except (KeyError, configparser.Error, UnicodeError):
    # Only configuration problems are handled here; a bare ``except`` would
    # also swallow KeyboardInterrupt and SystemExit.
    print("Cannot read config file")
    # Non-zero status so calling shells/pipelines can detect the failure.
    sys.exit(1)
# Exactly one command-line argument is expected: the path of the CSV file.
# Rejecting every other argument count (not just "too few") guarantees that
# ``file`` is always bound before it is used further down.
if len(sys.argv) != 2:
    print("Usage: csv2json.py <file.csv>")
    # Non-zero status: the script was invoked incorrectly.
    sys.exit(1)
file = sys.argv[1]
# Read the whole CSV into memory; the first row is the header.
# ``newline=''`` is what the csv module requires so that newlines embedded in
# quoted cells are parsed correctly, and ``with`` guarantees the handle is
# closed even if parsing fails.
with open(file, "r", newline="") as f:
    reader = csv.reader(f)
    csvData = list(reader)

# An empty file has no header row; treat it like a header without the
# required columns instead of crashing with an IndexError.
fields = csvData[0] if csvData else []
try:
    parentIndex = fields.index(parentColumn)
    nodeIndex = fields.index(nodeColumn)
except ValueError:
    # list.index() raises ValueError when the column name is absent.
    print("Cannot find parent column or the node column")
    sys.exit(1)
# Construct the lookup tables used to assemble the tree:
#   parents        : parent id -> list of rows whose parent cell holds that id
#   attributes     : node id   -> label taken from a cell that starts with
#                                 subItemEscapeChar, with all markers removed
#   attributeTypes : node id   -> "list" or "dict" when that cell also carries
#                                 the <esc>list<esc> / <esc>dict<esc> marker
parents = defaultdict(list)
attributes = {}
attributeTypes = {}
listMarker = subItemEscapeChar + "list" + subItemEscapeChar
dictMarker = subItemEscapeChar + "dict" + subItemEscapeChar

# Skip the header row: it names the columns and is not a node itself.
for row in csvData[1:]:
    if len(row) <= max(parentIndex, nodeIndex):
        # Blank or truncated line: there is no parent/node cell to read.
        continue
    parents[row[parentIndex]].append(row)
    nodeId = row[nodeIndex]
    # Walk the cells positionally (ignoring any beyond the header width)
    # instead of calling fields.index() per cell, which is quadratic and
    # always resolves duplicate column names to the first column.
    for cell in row[:len(fields)]:
        if not cell.startswith(subItemEscapeChar):
            continue
        # The type marker must appear after the leading escape character,
        # hence "> 0" rather than a plain containment test.
        if cell.find(listMarker) > 0:
            attributeTypes[nodeId] = "list"
        elif cell.find(dictMarker) > 0:
            attributeTypes[nodeId] = "dict"
        label = cell.replace(listMarker, "").replace(dictMarker, "")
        attributes[nodeId] = label.replace(subItemEscapeChar, "")
def _parse_bool(text):
    """Convert a truth-value string to ``bool``.

    Accepts the same case-insensitive vocabulary as the former
    ``distutils.util.strtobool`` (removed from the standard library in
    Python 3.12) and raises ``ValueError`` for anything else.
    """
    value = text.lower()
    if value in ('y', 'yes', 't', 'true', 'on', '1'):
        return True
    if value in ('n', 'no', 'f', 'false', 'off', '0'):
        return False
    raise ValueError("invalid truth value %r" % (value,))


def buildtree(t=None, parent=''):
    """Attach the rows whose parent id is *parent* beneath *t*, recursively.

    Reads the module-level tables ``parents``, ``attributes`` and
    ``attributeTypes`` plus ``fields``, ``nodeColumn`` and the escape markers.

    Every child row becomes a dict of its non-empty cells keyed by column
    name. Cells starting with ``subItemEscapeChar`` are left out, cells
    starting with ``simpleListEscapeChar`` are parsed as JSON once the marker
    is removed, and cells starting with ``booleanEscapeChar`` are converted to
    ``bool``.

    How each node is attached:
      * ``t`` is ``None``: the node becomes the tree root.
      * no label is registered for *parent* in ``attributes``: a warning is
        printed and the node is not attached.
      * the registered type is ``"dict"`` and *parent* has exactly one child:
        the node is stored directly under the label.
      * otherwise: the node is appended to a list under the label.

    Args:
        t: dict to attach the children to, or ``None`` to start a new tree.
        parent: id whose child rows should be processed.

    Returns:
        ``t`` (the first processed node when ``t`` was ``None``); ``t`` is
        returned unchanged when *parent* has no child rows.

    Raises:
        ValueError: a boolean cell holds an unrecognised truth value, or a
            list cell is not valid JSON (``json.JSONDecodeError``).
    """
    rows = parents.get(parent)
    if rows is None:
        return t

    # Missing entries simply mean "no label / no type"; no need for try/except.
    attributesLabel = attributes.get(parent, "")
    attrType = attributeTypes.get(parent, "")

    for row in rows:
        node = {}
        # zip() pairs each cell with its own column (correct for duplicate
        # headers) and tolerates rows shorter than the header.
        for field, cell in zip(fields, row):
            if cell == "" or cell.startswith(subItemEscapeChar):
                continue
            if cell.startswith(simpleListEscapeChar):
                node[field] = json.loads(cell.replace(simpleListEscapeChar, ""))
            elif cell.startswith(booleanEscapeChar):
                node[field] = _parse_bool(cell.replace(booleanEscapeChar, ""))
            else:
                node[field] = cell

        if t is None:
            t = node
        elif attributesLabel == "":
            print("there are sub elements that don't have a parent, this script will fail to generate the proper result")
        elif attrType == "dict" and len(rows) == 1:
            t[attributesLabel] = node
        else:
            t.setdefault(attributesLabel, []).append(node)

        # A row without a node id cannot be anyone's parent: treat it as a
        # leaf instead of failing with a KeyError.
        nodeId = node.get(nodeColumn)
        if nodeId is not None:
            buildtree(node, nodeId)
    return t
# Assemble the whole tree, starting from the rows whose parent cell is empty.
data = buildtree(t=None, parent='')
def stripParentNode(data):
    """Return a copy of *data* without the node/parent bookkeeping keys.

    Dicts lose the keys named by ``nodeColumn`` and ``parentColumn``; lists,
    tuples and sets are rebuilt without their ``None`` items. Containers are
    processed recursively and any other value is returned as-is.
    """
    if isinstance(data, dict):
        cleaned = {}
        for key, value in data.items():
            if key == nodeColumn or key == parentColumn:
                continue
            cleaned[key] = stripParentNode(value)
        return cleaned

    # The three sequence-like containers share the same filtering rule and
    # differ only in the type that is rebuilt.
    for container in (list, tuple, set):
        if isinstance(data, container):
            kept = (stripParentNode(item) for item in data if item is not None)
            return container(kept)

    return data
# Remove the node/parent bookkeeping columns from the finished tree.
data = stripParentNode(data)

# If the final JSON has the rootColumn key, the original JSON was an array:
# strip that wrapper and keep only the value stored under it.
# An explicit check replaces the former bare ``except: pass`` so that
# unrelated errors are no longer silently swallowed.
if isinstance(data, dict) and rootColumn in data:
    data = data[rootColumn]

# Output file name: <input path>_<unix timestamp in seconds>.json
outrFile = '{}_{}.json'.format(file, int(time.time()))
def redirect_to_file(text, outrFile):
    """Write *text* followed by a newline to the file at *outrFile*.

    The file is created or truncated. It is written through a context
    manager, so the handle is always flushed and closed, and ``sys.stdout``
    is never replaced (previously it stayed redirected if printing failed).

    Args:
        text: object to print into the file.
        outrFile: path of the file to write.
    """
    with open(outrFile, 'w') as handle:
        print(text, file=handle)
# Serialise the tree as 2-space indented JSON, keeping the keys in their
# existing order, and write it to the output file.
jsonText = json.dumps(
    data,
    sort_keys=False,
    indent=2,
    separators=(',', ': '),
)
redirect_to_file(jsonText, outrFile)