-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmerge.py
56 lines (44 loc) · 1.85 KB
/
merge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import json
# import requests
# from pathlib import Path
# import re
# Load the two JSON datasets that are merged below. Both are iterated as
# lists of dicts that carry a 'name' key.
# The files are read explicitly as UTF-8 (the JSON interchange encoding) so
# that non-ASCII characters in names and club labels decode the same way on
# every platform instead of depending on the locale's default encoding.
with open('fetch_poslowie_from_sejm_gov_pl.json', encoding='utf-8') as fp:
    sejm_content = json.load(fp)
with open('fetch.json', encoding='utf-8') as fp:
    target_content = json.load(fp)
def normalize_name(v):
    """Reduce a full name to ``"<first token> <last token>"``.

    Middle tokens are dropped so the same person can be matched across
    sources that differ in how many given names they list. A single-token
    name yields that token twice (``"Jan"`` -> ``"Jan Jan"``).

    Args:
        v: The full name as a string.

    Returns:
        The first and last whitespace-separated tokens joined by one space.
        An empty or whitespace-only input yields ``" "``.
    """
    # split() without an argument ignores leading/trailing whitespace and
    # treats tabs, newlines and non-breaking spaces as separators, so stray
    # whitespace no longer produces an empty first or last token.
    # The fallback keeps the historical result for empty input.
    parts = v.split() or ['']
    return f"{parts[0]} {parts[-1]}"
# Sets of normalized names present in each dataset.
sejm_name = {normalize_name(x['name']) for x in sejm_content}
target_name = {normalize_name(x['name']) for x in target_content}

# Normalization must not collapse two different entries onto the same key,
# otherwise the name-based matching below would be ambiguous. Each set is
# therefore compared with the length of the list it was built from.
assert len(sejm_name) == len(sejm_content), \
    'normalized names in sejm data are not unique'
assert len(target_name) == len(target_content), \
    'normalized names in target data are not unique'

# Report names that appear in only one of the two datasets.
print('Extra sejm:', sejm_name - target_name)
print('Missing sejm:', target_name - sejm_name)
def find_person(name):
    """Return the first entry of ``sejm_content`` matching *name*.

    Names are compared after ``normalize_name`` has been applied to both
    sides.

    Args:
        name: Full name of the person to look up.

    Returns:
        The matching entry from ``sejm_content``.

    Raises:
        LookupError: If no entry has the same normalized name.
    """
    # Normalize the query once instead of once per candidate.
    wanted = normalize_name(name)
    for person in sejm_content:
        if normalize_name(person['name']) == wanted:
            return person
    # An explicit error names the missing person; a bare next() would leak
    # an uninformative StopIteration instead.
    raise LookupError(f'no sejm entry found for {name!r}')
# Maps a long club name to the abbreviation stored as 'club_short'.
# Only the names listed here can be abbreviated; the merge loop indexes this
# dict directly, so any other club name raises KeyError there.
short_map = {
    'Koalicyjny Klub Parlamentarny Lewicy (Nowa Lewica, PPS, Razem, Wiosna Roberta Biedronia)': 'Lewica',
    'Poseł niezrzeszony': 'niez.',
}
# def safe_name(x):
# return re.sub('[^A-Za-z]', '_', x)
def gender_by_name(name):
    """Guess a gender code from the first token of *name*.

    This is a heuristic: a first token ending in ``'a'`` gives ``'K'``,
    anything else gives ``'M'`` (presumably Polish "kobieta" /
    "mężczyzna" - confirm with the consumers of the 'gender' field).

    Args:
        name: Full name; only its first whitespace-separated token is used.

    Returns:
        ``'K'`` or ``'M'``. An empty or whitespace-only name yields ``'M'``.
    """
    # split() tolerates leading whitespace and empty input, and endswith()
    # never indexes into an empty string, so this cannot raise IndexError.
    parts = name.split()
    if parts and parts[0].endswith('a'):
        return 'K'
    return 'M'
# Enrich every target entry in place with data from its matching sejm entry.
# NOTE(review): the indentation of this loop was lost in the source this was
# recovered from. The nesting below (club updates inside the `if`, gender
# assignment outside it) is reconstructed from the fact that short_map only
# covers two club names - confirm against the original file.
for target_person in target_content:
    name = target_person['name']
    sejm_person = find_person(name)
    # When the two sources disagree about the club, report it and take the
    # value from the sejm entry.
    if sejm_person['partia']['Klub/koło:'] != target_person['club_long']:
        print('missmatch club', name, sejm_person['partia']['Klub/koło:'], '!=', target_person['club_long'])
        target_person['club_long'] = sejm_person['partia']['Klub/koło:']
        # Raises KeyError if the corrected club name is not in short_map.
        target_person['club_short'] = short_map[target_person['club_long']]
    target_person['gender'] = gender_by_name(target_person['name'])
    # Disabled: image download and 'img' field. Re-enabling requires the
    # commented-out imports (requests, Path, re) and the safe_name helper.
    # p = Path(f'img/{safe_name(name)}.jpg')
    # if not p.exists():
    #     print('Download img for', p)
    #     p.write_bytes(requests.get(sejm_person['picute']).content)
    # target_person['img'] = str(p)
# Write the enriched target entries to a new file, pretty-printed with
# sorted keys.
with open('fetch_merged.json', 'w') as out_file:
    serialized = json.dumps(target_content, indent=4, sort_keys=True)
    out_file.write(serialized)