forked from aeturrell/occupationcoder
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_occupationcoder.py
204 lines (178 loc) · 7.24 KB
/
test_occupationcoder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#!/usr/bin/env python
"""Tests for `occupationcoder` package."""
import unittest
import time
import sys
import os
import subprocess
import pandas as pd
from occupationcoder import coder, cleaner
# Number of rows drawn (with replacement) from the example vacancies when the
# manual load tests build their large timing dataset.
SAMPLE_SIZE = 100000
class TestOccupationcoder(unittest.TestCase):
    """Tests for `occupationcoder` package.

    File paths are relative to the current working directory, so the suite
    has to be launched from the directory that contains ``tests`` and
    ``occupationcoder``.
    """

    def setUp(self) -> None:
        """Load the example vacancies and build the objects under test."""
        # The expected cleaned titles to our test data
        self.expected_titles = ["physicist", "economist", "ground worker"]

        # Every SOC code that the TF-IDF matcher is allowed to suggest for
        # the example records (each record yields several candidates).
        self.expected_codes = [
            "242",
            "311",
            "212",
            "215",
            "211",
            "353",
            "412",
            "215",
            "242",
            "211",
            "242",
            "533",
            "243",
            "912",
            "323",
        ]

        # Load the example records
        self.test_df = pd.read_csv(os.path.join("tests", "test_vacancies.csv"))

        # Instantiate matching classes: one per coding scheme, plus a cleaner
        self.matcher = coder.Coder(scheme="soc", output="single")
        self.isco_matcher = coder.Coder(scheme="isco")
        self.cl = cleaner.Cleaner()

    def tearDown(self) -> None:
        """Tear down test fixtures, if any."""

    def _clean_titles(self) -> pd.Series:
        """Return the example job titles passed through ``simple_clean``."""
        return self.test_df["job_title"].apply(self.cl.simple_clean)

    def _run_load_test(self, code_data_frame) -> None:
        """Time ``code_data_frame`` on ``SAMPLE_SIZE`` resampled vacancies.

        Args:
            code_data_frame: Bound coding method to time, called with the
                big data frame and the three column-name keyword arguments.
        """
        # Multiply up that dataset to many, many rows so we can test time taken
        big_df = self.test_df.sample(SAMPLE_SIZE, replace=True, ignore_index=True)
        print(f"Size of test dataset: {big_df.shape[0]}")

        # Time only the actual code assignment process: the clock is stopped
        # before any result is printed so output formatting is not measured.
        proc_tic = time.perf_counter()
        coded = code_data_frame(
            big_df,
            title_column="job_title",
            sector_column="job_sector",
            description_column="job_description",
        )
        proc_toc = time.perf_counter()

        print(coded.shape)
        print(coded[["job_title", "SOC_code"]].head(5))
        print(f"Coding process ran in: {proc_toc - proc_tic}")

    def test_clean_titles(self) -> None:
        """Checks that results of title cleaning are as expected"""
        for title in self._clean_titles():
            self.assertIn(title, self.expected_titles)

    def test_code_exact_matcher(self) -> None:
        """Results of exact title matching"""
        matches = self._clean_titles().apply(self.matcher.get_exact_match)
        for match in matches:
            self.assertIn(match, ["211", "242", None])

    def test_isco_exact_match(self) -> None:
        """Tests the ability for coder to access ISCO dictionaries correctly"""
        matches = self._clean_titles().apply(self.isco_matcher.get_exact_match)
        for match in matches:
            self.assertIn(match, ["2111", "2631", None])

    def test_code_tfidf_matcher(self) -> None:
        """TF-IDF similarity suggestions fall within the expected codes."""
        df = self.test_df.copy()
        df["clean_title"] = df["job_title"].apply(self.cl.simple_clean)
        df["clean_sector"] = df["job_sector"].apply(
            lambda x: self.cl.simple_clean(x, known_only=False)
        )
        df["clean_desc"] = df["job_description"].apply(
            lambda x: self.cl.simple_clean(x, known_only=False)
        )

        for _, row in df.iterrows():
            # The matcher is fed title, sector and description as one string
            clean = " ".join(
                [row["clean_title"], row["clean_sector"], row["clean_desc"]]
            )
            for code in self.matcher.get_tfidf_match(clean):
                self.assertIn(code, self.expected_codes)

    def test_code_record(self) -> None:
        """Confirm it correctly runs on our example single record"""
        result = self.matcher.code_record(
            title="Physicist",
            sector="Professional scientific",
            description="Calculations of the universe",
        )
        self.assertEqual(result, "211")

    def test_code_data_frame(self) -> None:
        """Running the included examples from a file."""
        df = pd.read_csv(os.path.join("tests", "test_vacancies.csv"))
        df = self.matcher.code_data_frame(
            df,
            title_column="job_title",
            sector_column="job_sector",
            description_column="job_description",
        )
        self.assertEqual(df["SOC_code"].to_list(), ["211", "242", "912"])

    def test_multi_code_output(self) -> None:
        """Running samples from file and getting codes and scores out using ISCO"""
        df = pd.read_csv(os.path.join("tests", "test_vacancies.csv"))
        df = self.isco_matcher.code_data_frame(
            df,
            title_column="job_title",
            sector_column="job_sector",
            description_column="job_description",
        )
        self.assertEqual(df["prediction 1"].to_list(), ["2111", "2631", "3333"])

    @unittest.skip(
        "can't be run through testr: parallel testing interferes with "
        "parallelism in code"
    )
    def test_parallel_code_data_frame(self) -> None:
        """Running the included examples from a file, in parallel.

        Kept as a skipped test (rather than commented-out code) so that it
        stays visible in test reports.
        """
        df = pd.read_csv(os.path.join("tests", "test_vacancies.csv"))
        df = self.matcher.parallel_code_data_frame(
            df,
            title_column="job_title",
            sector_column="job_sector",
            description_column="job_description",
        )
        self.assertEqual(df["SOC_code"].to_list(), ["211", "242", "912"])

    def test_command_line(self) -> None:
        """Test code execution at command line"""
        # sys.executable returns current python executable, ensures code is run
        # in same environment from which tests are called.
        # check=True makes a non-zero exit status raise CalledProcessError, so
        # a failed run cannot be masked by a stale output file from an
        # earlier run.
        subprocess.run(
            [
                sys.executable,
                "-m",
                "occupationcoder.coder",
                "--in_file=tests/test_vacancies.csv",
                "--scheme=soc",
                "--output=single",
            ],
            check=True,
        )
        df = pd.read_csv(
            os.path.join("occupationcoder", "outputs", "processed_jobs.csv")
        )
        # Compared as integers: that is how the codes come back after the
        # CSV round trip through pd.read_csv.
        self.assertEqual(df["SOC_code"].to_list(), [211, 242, 912])

    def manual_load_test(self) -> None:
        """
        Look at execution speed.
        On test machine: 50k short records in ~308s.
        Does not execute as part of automated tests.
        """
        self._run_load_test(self.matcher.code_data_frame)

    def manual_parallel_load_test(self) -> None:
        """
        Look at execution speed of parallel implementation.
        On test machine: 100k short records in ~160s.
        Does not execute as part of automated tests.
        """
        self._run_load_test(self.matcher.parallel_code_data_frame)