-
Notifications
You must be signed in to change notification settings - Fork 2
136 lines (107 loc) · 4.75 KB
/
provider-map-jobs.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# Refreshes the provider-map data files from the warehouse and proposes the
# result as a pull request.
name: Provider Map Jobs

on:
  # Weekly run: Sundays at 00:00 UTC.
  schedule:
    - cron: '0 0 * * 0'
  # Lets the workflow be started manually.
  workflow_dispatch:
  # TODO: Remove this once behavior has been finalized
  push:
    branches:
      - vlad/provider-map-gha
jobs:
  # Single job: export the providers table from the warehouse, back-fill
  # missing county data, recompute per-county provider counts in the GeoJSON,
  # compare the warehouse column list with the local data dictionary, and open
  # a pull request containing the refreshed files.
  fetch_warehouse:
    name: Fetch Warehouse Updates
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Authenticate with Google
        uses: google-github-actions/auth@v1
        with:
          credentials_json: '${{ secrets.GCP_SA_KEY }}'

      - name: Set up Cloud SDK
        uses: google-github-actions/setup-gcloud@v1

      - name: Download providers.csv from warehouse
        run: |
          # `bq query` returns only 100 rows unless --max_rows is given, which
          # would silently truncate the export.
          bq query \
            --quiet \
            --headless \
            --format=csv \
            --max_rows=100000 \
            --use_legacy_sql=false \
            "SELECT * FROM \`mart_transit_database.dim_mobility_mart_providers\`" \
            > src/metadata/providers/providers.csv

      - name: Fix our CSV file
        run: |
          # Workaround because of...
          # https://github.com/google-github-actions/setup-gcloud/issues/666
          # Keep only the lines from the first one containing `agency_name`
          # through the end of the file.
          sed -i -n -e '/agency_name/,$p' src/metadata/providers/providers.csv

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Install pandas
        run: |
          pip install pandas

      - name: Update Counties GeoJSON file
        shell: python
        run: |
          import json

          import pandas as pd

          providers_file = 'src/metadata/providers/providers.csv'
          geojson_file = 'src/metadata/providers/counties.geojson'

          df = pd.read_csv(providers_file)
          city_lookup = pd.read_csv('src/metadata/cities_to_county.csv')
          city_to_county = dict(zip(city_lookup['City'], city_lookup['County']))

          # Fill in the null values for counties served with the HQ county.
          # Rows are addressed by index so that a missing or duplicated
          # ntd_id cannot crash the lookup or touch rows that already have
          # a value.
          for idx in df.index[df['counties_served'].isna()]:
              city = df.loc[idx, 'hq_city']
              # Try the bare city name first, then the "City of <name>" form.
              # .get() is required here: indexing would raise KeyError on the
              # first miss and never reach the fallback.
              county = (
                  city_to_county.get(city)
                  or city_to_county.get(f'City of {city}')
              )
              if not county:
                  print("No county found for city: ", city)
                  continue
              df.loc[idx, 'hq_county'] = county
              df.loc[idx, 'counties_served'] = county

          # index=False keeps the pandas row index out of the committed CSV.
          df.to_csv(providers_file, index=False)

          # Count providers per county; `counties_served` holds a
          # ';'-separated list of county names.
          county_counts = (
              df['counties_served']
              .str.split(';')
              .explode()
              .value_counts()
          )

          with open(geojson_file) as f:
              geojson = json.load(f)

          # Add the county counts to the geojson (0 when no provider serves
          # the county).
          for feature in geojson['features']:
              county_name = feature['properties']['county']
              feature['properties']['num_providers'] = int(
                  county_counts.get(county_name, 0)
              )

          # Write the geojson back to the file
          with open(geojson_file, 'w') as f:
              json.dump(geojson, f, indent=2)

      - name: Setup yq
        uses: vegardit/gha-setup-yq@v1
        with:
          use-cache: true
          version: '4.40.5'

      - name: Check for column updates
        id: column-updates
        run: |
          # -f makes curl fail on an HTTP error instead of saving the error
          # page as if it were the schema file.
          curl -fsSO https://raw.githubusercontent.com/cal-itp/data-infra/main/warehouse/models/mart/transit_database/_mart_transit_database.yml
          columns_from_dbt=$(yq '.models[] | select(.name == "dim_mobility_mart_providers") | .columns[].name' _mart_transit_database.yml)
          columns_from_repo=$(yq '.[].column' src/metadata/providers/dictionary.csv)

          # diff exits with 1 when the inputs differ. The runner's default
          # shell aborts on any non-zero status, so capture the status and
          # only fail on a real error (status > 1).
          diff_status=0
          column_diff=$(diff <( printf '%s\n' "$columns_from_dbt" ) <( printf '%s\n' "$columns_from_repo" )) || diff_status=$?
          if [ "$diff_status" -gt 1 ]; then
            echo "diff failed with status $diff_status" >&2
            exit "$diff_status"
          fi

          # The diff can span several lines, so use the delimiter form for
          # multi-line step outputs.
          {
            echo 'column_diff<<COLUMN_DIFF_EOF'
            echo "$column_diff"
            echo 'COLUMN_DIFF_EOF'
          } >> "$GITHUB_OUTPUT"

      - name: Create Pull Request
        uses: peter-evans/create-pull-request@v5
        with:
          title: Provider Map Data Auto Update
          body: |
            It's that time again! The warehouse has delivered new data for us to use. This is an automatic pull request created by the `provider-map-jobs.yml` workflow; it is triggered via a cron that runs every Sunday at midnight UTC.
            ## Changed Columns
            These are columns that differ between the warehouse and the repository. If you see discrepancies here, please update the `src/metadata/providers/dictionary.csv` file to match the warehouse.
            ```diff
            ${{ steps.column-updates.outputs.column_diff }}
            ```
          commit-message: Auto-update provider data from warehouse
          # Only the regenerated data files are committed; the downloaded
          # schema file stays out of the pull request.
          add-paths: |
            src/metadata/providers/providers.csv
            src/metadata/providers/counties.geojson
          base: main