-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsa_3_1.py
44 lines (37 loc) · 1.72 KB
/
sa_3_1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from databaker.framework import *
import sa_walk
import feather
import re
import output
table = "3.1"
files = [x for x in sa_walk.SA_files() if x.table==table]
for year, _, filename in files:
YEAR = int(year)
# path=f"data/raw/South Africa Staff Tables_raw/{YEAR} Staff tables for Universities/{YEAR} Table 3.3 for Universities.xls"
for i, tab in enumerate(loadxlstabs(filename, "*", verbose = False)):
institution = tab.filter_one(re.compile("INSTITUTION: .*")).value
print (institution)
# institution = tab.filter_one(lambda cell: cell.x==0 and cell.y==1).value
# TODO -- varies from A1 to A2 across 2001/2017
if not institution: # Further tidying required -- INSTITUTION: H09 (UNIVERSITY OF LIMPOPO)
institution = "National"
else:
institution = institution.split(" ")[-1].strip()
print (institution)
anchor = tab.filter_one("PERSONNEL CATEGORY")
programme = anchor.fill(DOWN).is_not_blank()
profcat = anchor.fill(RIGHT).is_not_blank()
observations = programme.fill(RIGHT).is_not_blank()
dimensions = [
HDimConst(DATAMARKER, 0),
HDimConst("Institution", institution),
HDim(programme, "Programme", DIRECTLY, LEFT),
HDim(profcat, "Personnel Category", DIRECTLY, ABOVE),
HDimConst(TIME, YEAR)
]
c1 = ConversionSegment(observations, dimensions)
df = c1.topandas()
ins = institution.replace(":", "_").replace("/", "_").replace("\\", "_")
feather.write_dataframe(df, f"feather/{YEAR}-{ins}-{table}.feather") # TODO won't get all!
df.to_csv(f"csv/{YEAR}-{ins}-{table}.csv", header=True)
print(c1.topandas())