fix(provider-map): handle new datasets #13
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Provider Map Jobs | |
on: | |
# TODO: Remove this once behavior has been finalized | |
push: | |
branches: | |
- vlad/provider-map-gha | |
workflow_dispatch: | |
schedule: | |
- cron: "0 0 * * 0" | |
jobs: | |
fetch_warehouse: | |
name: Fetch Warehouse Updates | |
runs-on: ubuntu-latest | |
steps: | |
- name: Checkout repository | |
uses: actions/checkout@v4 | |
- name: Authenticate with Google | |
uses: google-github-actions/auth@v1 | |
with: | |
credentials_json: '${{ secrets.GCP_SA_KEY }}' | |
- name: Set up Cloud SDK | |
uses: google-github-actions/setup-gcloud@v1 | |
- name: Download providers.csv from warehouse | |
run: | | |
bq query \ | |
--quiet \ | |
--headless \ | |
--format=csv \ | |
--use_legacy_sql=false \ | |
"SELECT * FROM \`mart_transit_database.dim_mobility_mart_providers\`" \ | |
> src/metadata/providers/providers.csv | |
- name: Fix our CSV file | |
run: | | |
# Workaround because of... | |
# https://github.com/google-github-actions/setup-gcloud/issues/666 | |
sed -i -n -e '/agency_name/,$p' src/metadata/providers/providers.csv | |
- uses: actions/setup-python@v4 | |
with: | |
python-version: '3.10' | |
- name: Install pandas | |
run: | | |
pip install pandas | |
- name: Update Counties GeoJSON file | |
shell: python | |
run: | | |
import json | |
import pandas as pd | |
providers_file = 'src/metadata/providers/providers.csv' | |
df = pd.read_csv(providers_file) | |
city_lookup = pd.read_csv('src/metadata/cities_to_county.csv') | |
city_to_county = dict(zip(city_lookup['City'], city_lookup['County'])) | |
lookup_records = df[df['counties_served'].isna()]['ntd_id'] | |
# Fill in the null values for counties served with the HQ county | |
for record in lookup_records: | |
city = df[df['ntd_id'] == record]['hq_city'].values[0] | |
try: | |
county = city_to_county[city] or city_to_county[f'City of {city}'] | |
df.loc[df['ntd_id'] == record, 'hq_county'] = county | |
df.loc[df['ntd_id'] == record, 'counties_served'] = county | |
except KeyError: | |
print("No county found for city: ", city) | |
df.to_csv(providers_file) | |
# Do a group by for the counties served | |
county_counts = df['counties_served'].str.split(';') \ | |
.explode('counties_served') \ | |
.value_counts() | |
geojson_file = 'src/metadata/providers/counties.geojson' | |
geojson = json.load(open(geojson_file)) | |
# Add the county counts to the geojson | |
for feature in geojson['features']: | |
county_name = feature['properties']['county'] | |
if county_name in county_counts: | |
feature['properties']['num_providers'] = int(county_counts[county_name]) | |
else: | |
feature['properties']['num_providers'] = 0 | |
# Write the geojson back to the file | |
with open(geojson_file, 'w') as f: | |
json.dump(geojson, f, indent=2) | |
- name: Create Pull Request | |
uses: peter-evans/create-pull-request@v5 | |
with: | |
title: Provider Map Data Auto Update | |
body: | | |
It's that time again! The warehouse has delivered new data for us to use. This is an automatic pull request created by the `provider-map-jobs.yml` workflow; it is triggered via a cron that runs every Sunday at midnight UTC. | |
commit-message: Auto-update provider data from warehouse | |
add-paths: | | |
src/metadata/providers/providers.csv | |
src/metadata/providers/counties.geojson | |
base: main |