-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathjson_to_csv.py
113 lines (87 loc) · 4.28 KB
/
json_to_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# pylint: disable=invalid-name
"""
Checks for a single JSON file or a folder of JSON files, assuming HumanFirst JSON format.
Assembles them into a single CSV containing the correct "examples" information.
Optionally searches through a single column for a set of key values - i.e.
if trying to find certain call numbers.
python ./json_to_csv.py
-f <YOUR FILENAME or FOLDER>
"""
# *********************************************************************************************************************
# standard imports
import json
import os

# third party imports
import click
import pandas

# custom imports
@click.command()
@click.option('-f', '--input_filename', type=str, required=True, help='Input File or Folder')
@click.option('-m', '--max_files', type=int, required=False, default=0, help='Maximum number of files to run on')
@click.option('-s', '--search_col', type=str, required=False, default="", help='Optional column to build a separate CSV of')
@click.option('-v', '--search_values', type=str, required=False, default="", help='Comma delimited set of values to run on')
def main(input_filename: str, max_files: int,
         search_col: str, search_values: str):
    """In the case of a folder will run for all files in that folder.

    Collates every .json file into a single DataFrame and writes it as CSV.
    If search_col is given, additionally filters rows whose search_col value
    is in the comma-delimited search_values and writes those to a second CSV.
    """
    # check if file or folder and process loop round
    if os.path.isdir(input_filename):
        json_files = [os.path.join(input_filename, f)
                      for f in os.listdir(input_filename)
                      if f.endswith(".json")]
        if len(json_files) == 0:
            raise RuntimeError(f"No json files to process in {input_filename}")
        print(f"JSON number of files to process: {len(json_files)}")
        df = pandas.DataFrame()
        for i, jf in enumerate(json_files):
            # max_files == 0 means "no limit"
            if max_files > 0 and i >= max_files:
                print('Max files reached')
                break
            print(f'{i:03} Beginning work on {jf}')
            df = pandas.concat([df, process_file(jf)])
        output_filename = os.path.join(input_filename, "collated_output.csv")
    elif os.path.isfile(input_filename):
        df = process_file(input_filename)
        output_filename = input_filename.replace(".json", "_output.csv")
    else:
        raise RuntimeError(f"This string does not appear to be a file or folder: {input_filename}")

    # print
    print(df)

    # Do any searches
    if search_col != "":
        if search_values == "":
            raise RuntimeError('If search_col is provided at least one search value must be passed')
        list_cols = df.columns.to_list()
        if search_col not in list_cols:
            # the original had an unreachable print after this raise - removed
            raise RuntimeError(f"Can't find {search_col} in: {','.join(list_cols)}")
        # get search values - split of a non-empty string always yields >= 1 item
        search_values = search_values.split(",")
        print(f'Search values contains this many values to search for: {len(search_values)}')
        # do the search
        df_values = search_for_values(df, search_col, search_values)
        # check the results
        # NOTE(review): assumes the frame always has a "context.context_id" column
        # and that the search values are context ids - confirm against input data
        if df_values["context.context_id"].nunique() == len(search_values):
            print("Found all values")
        else:
            print(f"Found: {df_values['context.context_id'].unique()} out of: {search_values}")
        print(df_values)
        write_output(df_values, input_filename, output_filename.replace(".csv", "_searched.csv"))

    write_output(df, input_filename, output_filename)
def search_for_values(df: pandas.DataFrame, search_col: str, search_values: list) -> pandas.DataFrame:
    """Return a deep copy of the rows of df whose search_col value is in search_values."""
    # build the membership mask first, then select and detach from the original frame
    mask = df[search_col].isin(search_values)
    return df.loc[mask].copy(deep=True)
def process_file(input_filename: str) -> pandas.DataFrame:
    """Load one workspace JSON file and flatten its "examples" list.

    Args:
        input_filename: path to a JSON file with a top-level "examples" key.

    Returns:
        DataFrame with one row per example; nested objects become dotted
        column names (e.g. "context.context_id") via pandas.json_normalize.

    Raises:
        KeyError: if the file has no "examples" key.
    """
    # context manager guarantees the handle is closed even if json.load raises
    # (the original left the file open on a parse error)
    with open(input_filename, mode="r", encoding="utf8") as file:
        workspace_json = json.load(file)
    return pandas.json_normalize(workspace_json["examples"])
def write_output(df: pandas.DataFrame, input_filename: str, output_filename: str):
    """Write df to CSV at output_filename, refusing to clobber the input file.

    Args:
        df: frame to serialise.
        input_filename: the file the data came from (guard only, not read).
        output_filename: destination CSV path.

    Raises:
        ValueError: if output_filename would overwrite input_filename.
    """
    # explicit raise instead of assert - asserts are stripped under python -O,
    # which would silently allow overwriting the source file
    if input_filename == output_filename:
        raise ValueError(f"Refusing to overwrite input file: {input_filename}")
    df.to_csv(output_filename, index=False, header=True)
    print(f'Wrote to: {output_filename}')
# script entry point; pylint check disabled because click supplies the
# CLI arguments at runtime, not the call site
if __name__ == '__main__':
    main() # pylint: disable=no-value-for-parameter