Merge pull request #65 from openaddresses/remove-case-smashing
Remove case smashing
iandees authored Nov 17, 2024
2 parents 43a2786 + c224803 commit 20996e1
Showing 7 changed files with 45 additions and 136 deletions.
9 changes: 1 addition & 8 deletions openaddr/cache.py
@@ -340,7 +340,7 @@ def field_names_to_request(cls, source_config):

         fields = set()
         for k, v in conform.items():
-            if k.upper() in source_config.SCHEMA:
+            if k in source_config.SCHEMA:
                 if isinstance(v, dict):
                     # It's a function of some sort?
                     if 'function' in v:
@@ -384,8 +384,6 @@ def download(self, source_urls, workdir, source_config):
         if GEOM_FIELDNAME not in field_names:
             field_names.append(GEOM_FIELDNAME)

-        field_names = list(map(lambda x: x.upper(), field_names))
-
         # Get the count of rows in the layer
         try:
             row_count = downloader.get_feature_count()
@@ -410,11 +408,6 @@ def download(self, source_urls, workdir, source_config):
                     shp = shape(feature['geometry'])
                     row[GEOM_FIELDNAME] = shp.wkt

-                    r = dict()
-                    for k,v in row.items():
-                        r[k.upper()] = v
-                    row = r
-
                     writer.writerow({fn: row.get(fn) for fn in field_names})
                     size += 1
             except TypeError:
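The practical effect of the cache.py changes: the cached CSV now keeps field names exactly as the source supplies them. A minimal sketch of the writer lookup above, with hypothetical field names (AddrNum and StreetName are invented for illustration):

    import csv, io

    # Hypothetical source fields, kept in their original case (no .upper() pass).
    field_names = ['AddrNum', 'StreetName', 'oa:geom']
    row = {'AddrNum': '308', 'StreetName': 'PULLEN ST', 'oa:geom': 'POINT (-97.4 31.4)'}

    buf = io.StringIO()
    writer = csv.DictWriter(buf, fieldnames=field_names)
    writer.writeheader()
    writer.writerow({fn: row.get(fn) for fn in field_names})  # same lookup as the diff
    print(buf.getvalue())
    # AddrNum,StreetName,oa:geom
    # 308,PULLEN ST,POINT (-97.4 31.4)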
113 changes: 23 additions & 90 deletions openaddr/conform.py
@@ -42,14 +42,14 @@ def gdal_error_handler(err_class, err_num, err_msg):

 # Field names for use in cached CSV files.
 # We add columns to the extracted CSV with our own data with these names.
-GEOM_FIELDNAME = 'OA:GEOM'
+GEOM_FIELDNAME = 'oa:geom'

-ADDRESSES_SCHEMA = [ 'HASH', 'NUMBER', 'STREET', 'UNIT', 'CITY', 'DISTRICT', 'REGION', 'POSTCODE', 'ID' ]
-BUILDINGS_SCHEMA = [ 'HASH']
-PARCELS_SCHEMA = [ 'HASH', 'PID' ]
+ADDRESSES_SCHEMA = [ 'hash', 'number', 'street', 'unit', 'city', 'district', 'region', 'postcode', 'id' ]
+BUILDINGS_SCHEMA = [ 'hash']
+PARCELS_SCHEMA = [ 'hash', 'pid' ]
 RESERVED_SCHEMA = ADDRESSES_SCHEMA + BUILDINGS_SCHEMA + PARCELS_SCHEMA + [
-    "LAT",
-    "LON"
+    "lat",
+    "lon"
 ]

 UNZIPPED_DIRNAME = 'unzipped'
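The consequence of lower-casing the schema constants is that conform keys are now matched case-sensitively. A small sketch (the conform spec here is hypothetical):

    ADDRESSES_SCHEMA = ['hash', 'number', 'street', 'unit', 'city',
                        'district', 'region', 'postcode', 'id']
    conform = {'NUMBER': 'addr_num', 'street': 'st_name'}  # hypothetical spec
    print([k for k in conform if k in ADDRESSES_SCHEMA])
    # ['street'] -- 'NUMBER' would previously have matched via k.upper()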
@@ -612,7 +612,6 @@ def csv_source_to_csv(source_config, source_path, dest_path):
            source_config.data_source["conform"]["lat"]
        ]

-       old_ll.extend([s.upper() for s in old_ll])
        out_fieldnames = [fn for fn in reader.fieldnames if fn not in old_ll]
        out_fieldnames.append(GEOM_FIELDNAME)
@@ -722,19 +721,12 @@ def row_extract_and_reproject(source_config, source_row):
    lat_name = data_source["conform"]["lat"]
    lon_name = data_source["conform"]["lon"]

-   if lon_name in source_row:
-       source_x = source_row[lon_name]
-   else:
-       source_x = source_row[lon_name.upper()]
-
-   if lat_name in source_row:
-       source_y = source_row[lat_name]
-   else:
-       source_y = source_row[lat_name.upper()]
+   source_x = source_row[lon_name]
+   source_y = source_row[lat_name]

    # Remove lat/lng name from output row
-   for n in lon_name, lon_name.upper(), lat_name, lat_name.upper():
-       if n in out_row: del out_row[n]
+   for n in (lon_name, lat_name):
+       out_row.pop(n, None)

    # Convert commas to periods for decimal numbers. (Not using locale.)
    try:
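The replacement idiom above, out_row.pop(n, None), deletes a key without raising when it is absent, so the old "if n in out_row: del out_row[n]" guard is no longer needed. A quick sketch with invented values:

    out_row = {'lon': '-97.3961768', 'lat': '31.4432706', 'street': 'PULLEN ST'}
    for n in ('lon', 'lat'):
        out_row.pop(n, None)  # the None default suppresses KeyError
    print(out_row)  # {'street': 'PULLEN ST'}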
@@ -751,7 +743,6 @@ def row_extract_and_reproject(source_config, source_row):
        out_row[GEOM_FIELDNAME] = None
        return out_row

-
    # Reproject the coordinates if necessary
    if "srs" in data_source["conform"] and data_source["conform"]["srs"] != "EPSG:4326":
        try:
@@ -818,17 +809,14 @@ def row_function(sc, row, key, fxn):
 def row_transform_and_convert(source_config, row):
     "Apply the full conform transform and extract operations to a row"

-    # Some conform specs have fields named with a case different from the source
-    row = row_smash_case(source_config.data_source, row)
-
     c = source_config.data_source["conform"]

     "Attribute tags can utilize processing fxns"
     for k, v in c.items():
-        if k.upper() in source_config.SCHEMA and type(v) is list:
+        if k in source_config.SCHEMA and isinstance(v, list):
             "Lists are a concat shortcut to concat fields with spaces"
             row = row_merge(source_config, row, k)
-        if k.upper() in source_config.SCHEMA and type(v) is dict:
+        if k in source_config.SCHEMA and isinstance(v, dict):
             "Dicts are custom processing functions"
             row = row_function(source_config, row, k, v)
@@ -847,49 +835,6 @@ def row_transform_and_convert(source_config, row):

     return feat

-def fxn_smash_case(fxn):
-    if "field" in fxn:
-        fxn["field"] = fxn["field"].lower()
-    if "fields" in fxn:
-        fxn["fields"] = [s.lower() for s in fxn["fields"]]
-    if "field_to_remove" in fxn:
-        fxn["field_to_remove"] = fxn["field_to_remove"].lower()
-    if "functions" in fxn:
-        for sub_fxn in fxn["functions"]:
-            fxn_smash_case(sub_fxn)
-
-def conform_smash_case(data_source):
-    "Convert all named fields in data_source object to lowercase. Returns new object."
-    new_sd = copy.deepcopy(data_source)
-    conform = new_sd["conform"]
-
-    for k, v in conform.items():
-        if type(conform[k]) is str and k.upper() in RESERVED_SCHEMA:
-            conform[k] = v.lower()
-        if type(conform[k]) is list:
-            conform[k] = [s.lower() for s in conform[k]]
-        if type(conform[k]) is dict:
-            fxn_smash_case(conform[k])
-
-            if "functions" in conform[k] and type(conform[k]["functions"]) is list:
-                for function in conform[k]["functions"]:
-                    if type(function) is dict:
-                        if "field" in function:
-                            function["field"] = function["field"].lower()
-
-                        if "fields" in function:
-                            function["fields"] = [s.lower() for s in function["fields"]]
-
-                        if "field_to_remove" in function:
-                            function["field_to_remove"] = function["field_to_remove"].lower()
-
-    return new_sd
-
-def row_smash_case(sc, input):
-    "Convert all field names to lowercase. Slow, but necessary for imprecise conform specs."
-    output = { k.lower() : v for (k, v) in input.items() }
-    return output
-
 def row_merge(sc, row, key):
     "Merge multiple columns like 'Maple','St' to 'Maple St'"
     merge_data = [row[field] for field in sc.data_source["conform"][key]]
@@ -1016,7 +961,7 @@ def row_fxn_chain(sc, row, key, fxn):

     original_key = key

-    if var and var.upper().lstrip('OA:') not in sc.SCHEMA and var not in row:
+    if var and var.lstrip('oa:') not in sc.SCHEMA and var not in row:
         row['oa:' + var] = u''
         key = var
@@ -1026,7 +971,7 @@ def row_fxn_chain(sc, row, key, fxn):
     if row.get('oa:' + key):
         row[key] = row['oa:' + key]

-    row['oa:{}'.format(original_key.lower())] = row['oa:{}'.format(key)]
+    row['oa:{}'.format(original_key)] = row['oa:{}'.format(key)]

     return row

@@ -1082,7 +1027,7 @@ def row_calculate_hash(cache_fingerprint, row):
 def row_convert_to_out(source_config, row):
     "Convert a row from the source schema to OpenAddresses output schema"

-    geom = row.get(GEOM_FIELDNAME.lower(), None)
+    geom = row.get(GEOM_FIELDNAME, None)
     if geom == "POINT EMPTY" or geom == '':
         geom = None
@@ -1096,20 +1041,19 @@ def row_convert_to_out(source_config, row):
         wkt_parsed = wkt_loads(output["geometry"])
         output["geometry"] = mapping(wkt_parsed)

-
     for field in source_config.SCHEMA:
-        if row.get('oa:{}'.format(field.lower())) is not None:
+        if row.get('oa:{}'.format(field)) is not None:
             # If there is an OA prefix, it is not a native field and was compiled
-            # via an attrib funciton or concatentation
-            output["properties"][field.lower()] = row.get('oa:{}'.format(field.lower()))
+            # via an attrib function or concatenation
+            output["properties"][field] = row.get('oa:{}'.format(field))
         else:
             # Get a native field as specified in the conform object
-            cfield = source_config.data_source['conform'].get(field.lower())
+            cfield = source_config.data_source['conform'].get(field)

             if cfield:
-                output["properties"][field.lower()] = row.get(cfield.lower())
+                output["properties"][field] = row.get(cfield)
             else:
-                output["properties"][field.lower()] = ''
+                output["properties"][field] = ''

     return output
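Putting the new lookup rules together, a minimal sketch of how output properties are resolved without case folding (the schema subset, conform spec, and row are all invented):

    SCHEMA = ['number', 'street']
    conform = {'street': 'StreetName'}  # hypothetical conform spec
    row = {'StreetName': 'PULLEN ST', 'oa:number': '308'}

    properties = {}
    for field in SCHEMA:
        if row.get('oa:{}'.format(field)) is not None:
            properties[field] = row.get('oa:{}'.format(field))  # compiled field
        else:
            cfield = conform.get(field)
            properties[field] = row.get(cfield) if cfield else ''  # native field
    print(properties)  # {'number': '308', 'street': 'PULLEN ST'}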

@@ -1150,9 +1094,6 @@ def transform_to_out_geojson(source_config, extract_path, dest_path):
     extract_path: extracted CSV file to process
     dest_path: path for output file in OpenAddress CSV
     '''
-    # Convert all field names in the conform spec to lower case
-    source_config.data_source = conform_smash_case(source_config.data_source)
-
     # Read through the extract CSV
     with open(extract_path, 'r', encoding='utf-8') as extract_fp:
         reader = csv.DictReader(extract_fp)
@@ -1192,13 +1133,6 @@ def conform_cli(source_config, source_path, dest_path):
 def check_source_tests(source_config):
     ''' Return boolean status and a message if any tests failed.
     '''
-    try:
-        # Convert all field names in the conform spec to lower case
-        source_config.data_source = conform_smash_case(source_config.data_source)
-    except:
-        # There may be problems in the source spec - ignore them for now.
-        source_config.data_source = source_config.data_source
-
     source_test = source_config.data_source.get('test', {})
     tests_enabled = source_test.get('enabled', True)
     acceptance_tests = source_test.get('acceptance-tests')
@@ -1208,11 +1142,10 @@
         return None, None

     for (index, test) in enumerate(acceptance_tests):
-        input = row_smash_case(source_config.data_source, test['inputs'])
-        output = row_smash_case(source_config.data_source, row_transform_and_convert(source_config, input))
+        output = row_transform_and_convert(source_config, test['inputs'])

         actual = {k: v for (k, v) in output['properties'].items() if k in test['expected']}
-        expected = row_smash_case(source_config.data_source, test['expected'])
+        expected = test['expected']

         if actual != expected:
             expected_json = json.dumps(expected, ensure_ascii=False)
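With the smash-case calls gone, acceptance tests compare the transform output against the fixtures exactly as written. A toy illustration (all values invented):

    test = {'inputs': {'StreetName': 'PULLEN ST'},
            'expected': {'street': 'PULLEN ST'}}
    # Pretend row_transform_and_convert returned this feature:
    output = {'properties': {'street': 'PULLEN ST', 'number': ''}}
    actual = {k: v for (k, v) in output['properties'].items() if k in test['expected']}
    print(actual == test['expected'])  # True, with no case smashing on either side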
12 changes: 6 additions & 6 deletions openaddr/tests/__init__.py
@@ -528,7 +528,7 @@ def test_single_car(self):
             state = dict(zip(*json.load(file)))

         self.assertIsNotNone(state['cache'])
-        self.assertEqual(state['fingerprint'], '23082fe4819682a6934b61443560160c')
+        self.assertEqual(state['fingerprint'], '4a8047f90dbfe176c2a2b148837dae36')
         self.assertIsNotNone(state['processed'])
         self.assertIsNotNone(state['preview'])
         self.assertIsNotNone(state['pmtiles'])
@@ -557,7 +557,7 @@ def test_single_car_cached(self):
             state = dict(zip(*json.load(file)))

         self.assertIsNotNone(state['cache'])
-        self.assertEqual(state['fingerprint'], '1821b2e50a61ed04ac2213fbc7a1984d')
+        self.assertEqual(state['fingerprint'], '056bdaab3334e709bf29b0b2f1fcf8c4')
         self.assertIsNotNone(state['processed'])
         self.assertIsNone(state['preview'])
@@ -576,7 +576,7 @@ def test_single_car_old_cached(self):
             state = dict(zip(*json.load(file)))

         self.assertIsNotNone(state['cache'])
-        self.assertEqual(state['fingerprint'], '1821b2e50a61ed04ac2213fbc7a1984d')
+        self.assertEqual(state['fingerprint'], '056bdaab3334e709bf29b0b2f1fcf8c4')
         self.assertIsNotNone(state['processed'])
         self.assertIsNone(state['preview'])
@@ -1019,7 +1019,7 @@ def test_single_tx_waco(self):
         self.assertEqual(rows[0]['properties']['region'], u'TX')
         self.assertEqual(rows[0]['properties']['id'], u'')
         self.assertEqual(rows[0]['properties']['number'], u'308')
-        self.assertEqual(rows[0]['properties']['hash'], u'431f816eebac0000')
+        self.assertEqual(rows[0]['properties']['hash'], u'5b2957c31a02e00e')
         self.assertEqual(rows[0]['properties']['city'], u'Mcgregor')
         self.assertEqual(rows[0]['geometry']['coordinates'], [-97.3961768, 31.4432706]),
         self.assertEqual(rows[0]['properties']['street'], u'PULLEN ST')
@@ -1046,7 +1046,7 @@ def test_single_wy_park(self):
         rows = list(map(json.loads, list(input)))
         self.assertEqual(rows[0]['properties']['id'], u'')
         self.assertEqual(rows[0]['properties']['number'], u'162')
-        self.assertEqual(rows[0]['properties']['hash'], u'730e5ad1893108e4')
+        self.assertEqual(rows[0]['properties']['hash'], u'0488f0771f0ff30f')
         self.assertEqual(rows[0]['properties']['city'], u'')
         self.assertEqual(rows[0]['geometry']['type'], 'Point');
         self.assertAlmostEqual(rows[0]['geometry']['coordinates'][0], -108.7563613);
@@ -1075,7 +1075,7 @@ def test_single_ny_orange(self):
         rows = list(map(json.loads, list(input)))
         self.assertEqual(rows[0]['properties']['id'], u'')
         self.assertEqual(rows[0]['properties']['number'], u'434')
-        self.assertEqual(rows[0]['properties']['hash'], u'8cb84b9e793a4986')
+        self.assertEqual(rows[0]['properties']['hash'], u'd129b77ffa481fea')
         self.assertEqual(rows[0]['properties']['city'], u'MONROE')
         self.assertEqual(rows[0]['geometry']['coordinates'], [-74.1926686, 41.3187728])
         self.assertEqual(rows[0]['properties']['street'], u'')