From b8ddbe9df46039f071ee8eb243ed2875b95e0d77 Mon Sep 17 00:00:00 2001 From: Erik Knudsen <31932266+erik1066@users.noreply.github.com> Date: Tue, 18 Aug 2020 21:48:51 -0400 Subject: [PATCH] Initial commit --- .gitignore | 5 ++ README.md | 126 ++++++++++++++++++++++++++++++++ src/ak_scraper.py | 55 ++++++++++++++ src/al_scraper.py | 76 ++++++++++++++++++++ src/ar_scraper.py | 44 ++++++++++++ src/ca_scraper.py | 70 ++++++++++++++++++ src/co_scraper.py | 58 +++++++++++++++ src/config/nv_post_body.json | 1 + src/county_report.py | 17 +++++ src/ct_scraper.py | 57 +++++++++++++++ src/de_scraper.py | 55 ++++++++++++++ src/fl_scraper.py | 58 +++++++++++++++ src/ga_scraper.py | 51 +++++++++++++ src/id_scraper.py | 82 +++++++++++++++++++++ src/il_scraper.py | 78 ++++++++++++++++++++ src/in_scraper.py | 46 ++++++++++++ src/la_scraper.py | 70 ++++++++++++++++++ src/main.py | 135 +++++++++++++++++++++++++++++++++++ src/mi_scraper.py | 47 ++++++++++++ src/mn_scraper.py | 49 +++++++++++++ src/mo_scraper.py | 46 ++++++++++++ src/ms_scraper.py | 46 ++++++++++++ src/mt_scraper.py | 59 +++++++++++++++ src/nc_scraper.py | 77 ++++++++++++++++++++ src/ne_scraper.py | 51 +++++++++++++ src/nm_scraper.py | 50 +++++++++++++ src/nv_scraper.py | 69 ++++++++++++++++++ src/nyc_scraper.py | 48 +++++++++++++ src/oh_scraper.py | 69 ++++++++++++++++++ src/ok_scraper.py | 56 +++++++++++++++ src/sc_scraper.py | 71 ++++++++++++++++++ src/state_report.py | 23 ++++++ src/tn_scraper.py | 75 +++++++++++++++++++ src/tx_scraper.py | 59 +++++++++++++++ src/va_scraper.py | 52 ++++++++++++++ src/vt_scraper.py | 62 ++++++++++++++++ src/wi_scraper.py | 67 +++++++++++++++++ 37 files changed, 2160 insertions(+) create mode 100644 README.md create mode 100644 src/ak_scraper.py create mode 100644 src/al_scraper.py create mode 100644 src/ar_scraper.py create mode 100644 src/ca_scraper.py create mode 100644 src/co_scraper.py create mode 100644 src/config/nv_post_body.json create mode 100644 src/county_report.py create mode 100644 src/ct_scraper.py create mode 100644 src/de_scraper.py create mode 100644 src/fl_scraper.py create mode 100644 src/ga_scraper.py create mode 100644 src/id_scraper.py create mode 100644 src/il_scraper.py create mode 100644 src/in_scraper.py create mode 100644 src/la_scraper.py create mode 100644 src/main.py create mode 100644 src/mi_scraper.py create mode 100644 src/mn_scraper.py create mode 100644 src/mo_scraper.py create mode 100644 src/ms_scraper.py create mode 100644 src/mt_scraper.py create mode 100644 src/nc_scraper.py create mode 100644 src/ne_scraper.py create mode 100644 src/nm_scraper.py create mode 100644 src/nv_scraper.py create mode 100644 src/nyc_scraper.py create mode 100644 src/oh_scraper.py create mode 100644 src/ok_scraper.py create mode 100644 src/sc_scraper.py create mode 100644 src/state_report.py create mode 100644 src/tn_scraper.py create mode 100644 src/tx_scraper.py create mode 100644 src/va_scraper.py create mode 100644 src/vt_scraper.py create mode 100644 src/wi_scraper.py diff --git a/.gitignore b/.gitignore index b6e4761..3d0918a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ +# Project-specific things to ignore +src/temp/ +src/output/ +*.exe + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/README.md b/README.md new file mode 100644 index 0000000..a868fac --- /dev/null +++ b/README.md @@ -0,0 +1,126 @@ +# COVID-19 County-level Web Scraper Project + +I created this project because we seem absent a way to transparently determine county-level counts of 
confirmed cases, deaths, and hospitalizations. Publicly available data from U.S. state health departments is used as input.
+
+## States/territories supported as of 8/18/2020
+
+- [x] Alabama
+- [x] Alaska
+- [ ] Arizona
+- [x] Arkansas
+- [x] California
+- [x] Colorado
+- [x] Connecticut
+- [x] Delaware
+- [x] Florida
+- [x] Georgia
+- [ ] Hawaii
+- [x] Idaho
+- [x] Illinois
+- [x] Indiana
+- [ ] Iowa
+- [ ] Kansas
+- [ ] Kentucky
+- [x] Louisiana
+- [ ] Maine
+- [ ] Maryland
+- [ ] Massachusetts
+- [x] Michigan
+- [x] Minnesota
+- [x] Mississippi
+- [x] Missouri
+- [x] Montana
+- [x] Nebraska
+- [x] Nevada
+- [ ] New Hampshire
+- [ ] New Jersey
+- [x] New Mexico
+- [x] New York City
+- [ ] New York (excluding NYC)
+- [x] North Carolina
+- [ ] North Dakota
+- [x] Ohio
+- [x] Oklahoma
+- [ ] Oregon
+- [ ] Pennsylvania
+- [ ] Rhode Island
+- [x] South Carolina
+- [ ] South Dakota
+- [x] Tennessee
+- [x] Texas
+- [ ] Utah
+- [x] Vermont
+- [x] Virginia
+- [ ] Washington
+- [ ] West Virginia
+- [x] Wisconsin
+- [ ] Wyoming
+- [ ] American Samoa
+- [ ] District of Columbia
+- [ ] Guam
+- [ ] Northern Mariana Islands
+- [ ] U.S. Virgin Islands
+- [ ] Puerto Rico
+- [ ] Palau
+- [ ] Federated States of Micronesia
+- [ ] Republic of Marshall Islands
+- [ ] Navajo Nation
+
+## Breakages
+
+In the roughly 16 hours of development time it took to write and test these algorithms, three feeds from U.S. state health departments changed slightly. Even those slight changes stopped the affected states from generating output, and their scraping algorithms had to be reworked.
+
+It is likely that continuous development work will be required to keep the scraper project up-to-date for use in daily reporting.
+
+## Missing data
+
+Some states will never be represented in this project because county-level data is either not published by those states or is too difficult to obtain even with advanced web scraping techniques.
+
+## Running the code yourself
+
+Install Python 3 and then use `pip` to install the following packages:
+
+```bash
+pip install openpyxl
+pip install bs4
+pip install selenium
+```
+
+Some states' data is only accessible through web browser automation, so you will need to install a web driver before you can run the Python code. First, install the new Microsoft Edge browser for Windows 10: https://www.microsoft.com/en-us/edge. Note that Edge may already be installed.
+
+Once Edge is installed, find its version number: open Edge, click the ellipsis button at the top right of the screen, and select **Help and Feedback** > **About Microsoft Edge**. Note the version number on the **About** page that appears.
+
+Next, modify the Edge webdriver URL found in the `installEdgeDriver` function of `main.py` so that it matches the version you just saw on the Edge **About** page (see the sketch below). Visit https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/ to find a valid URL for your version of Edge and copy it into the Python code. Generally, as long as the major version number is the same between the **About** page and what's listed on the [Microsoft webdriver website](https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/), it will probably work.
+
+> Edge is updated every few weeks, so the driver URL in the Python code will likely need to be updated periodically to match your Edge version.
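+
+If you want to verify the driver URL before a full run, the following standalone sketch mirrors the download-and-extract logic in `installEdgeDriver()`. The URL shown is the one pinned in this commit, so swap the version segment (`84.0.524.0`) for the version on your **About** page; the explicit `temp` folder creation is an addition for this sketch only.
+
+```python
+import pathlib, zipfile
+import requests
+
+# Same pattern as installEdgeDriver() in main.py; only the version segment
+# of the URL normally needs to change.
+DRIVER_URL = 'https://msedgedriver.azureedge.net/84.0.524.0/edgedriver_win64.zip'
+
+save_path = pathlib.Path.cwd().joinpath('temp', 'edgedriver_win64.zip')
+save_path.parent.mkdir(exist_ok=True)  # create ./temp if it does not exist yet
+
+with requests.get(DRIVER_URL, stream=True) as response:
+    response.raise_for_status()
+    with open(save_path, 'wb') as fd:
+        for chunk in response.iter_content(chunk_size=128):
+            fd.write(chunk)
+
+with zipfile.ZipFile(save_path) as zf:
+    zf.extract('msedgedriver.exe')  # lands in the current folder, where the scrapers look for it
+```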
+
+Finally, navigate to the `src` folder and run `main.py`:
+
+```bash
+cd src
+python main.py
+```
+
+Output should start to generate after a few seconds. Web browser windows will appear on occasion; please do not close them, or the scraping operation will fail.
+
+Once the operation completes, open the `src/output` folder to find a timestamped CSV file containing county-level data for every state included in the scraping operation.
+
+> On Ubuntu or other Linux-based OS distributions, you may need to use the `pip3` command instead of `pip` and `python3` instead of `python`.
+
+> Because this scraping project relies on web drivers to deal with JavaScript-intensive pages for a small subset of states, you will need to be running Windows and MS Edge to obtain a full CSV output. A long-term TODO is to use headless Firefox or Chromium so this will run on *nix-based distributions or on Windows Subsystem for Linux (WSL).
+
+## Excluding states from the scraping operation
+
+You can exclude states from the scraper by commenting them out in `main.py`. Any state scraper not added to the `scrapers` list will not be run.
+
+## License
+
+The repository utilizes code licensed under the terms of the Apache Software License and is therefore licensed under ASL v2 or later.
+
+The source code in this repository is free: you can redistribute it and/or modify it under the terms of the Apache Software License version 2, or (at your option) any later version.
+
+The source code in this repository is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache Software License for more details.
+
+You should have received a copy of the Apache Software License along with this program. If not, see https://www.apache.org/licenses/LICENSE-2.0.html.
+
+Source code forked from other open source projects inherits the license of the project it was forked from.
\ No newline at end of file diff --git a/src/ak_scraper.py b/src/ak_scraper.py new file mode 100644 index 0000000..ba08a0e --- /dev/null +++ b/src/ak_scraper.py @@ -0,0 +1,55 @@ +import requests, json, io, datetime +import county_report, state_report + +STATE_ABBR = 'AK' +STATE = 'Alaska' + +def scraper(): + # make an HTTP web request to get the AK Json + response = requests.get('https://services1.arcgis.com/WzFsmainVTuD5KML/arcgis/rest/services/Geographic_Distribution_of_Confirmed_Cases/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json') + + if response.status_code == requests.codes.ok: + # Success - print to the console that the HTTP request succeeeded + print(' ', STATE_ABBR, ': Downloaded succeeded') + + jsonPayload = json.loads(response.text) + features = jsonPayload['features'] + + counties = [] + + for feature in features: + attribute = feature['attributes'] + + county_name = attribute['Borough_Census_Area'] + confirmed = int(attribute['All_Cases']) + hospitalizations = int(attribute['Hospitalizations']) + deaths = int(attribute['Deaths']) + + county = findCounty(county_name, counties) + + if county == None: + county = county_report.CountyReport(STATE, county_name, (int)(confirmed), (int)(deaths), -1, -1, datetime.datetime.now()) + counties.append(county) + else: + county.confirmed += confirmed + county.hospitalizations += hospitalizations + county.deaths += deaths + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport + + + else: + # Fail + print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ', response.status_code) + +def findCounty(county_name, counties): + for county in counties: + if county.county == county_name: + return county diff --git a/src/al_scraper.py b/src/al_scraper.py new file mode 100644 index 0000000..0a3df64 --- /dev/null +++ b/src/al_scraper.py @@ -0,0 +1,76 @@ +import requests, io, datetime, pathlib, sys, time, os, openpyxl +import county_report, state_report +from selenium import webdriver +from selenium.webdriver.common.keys import Keys +from selenium.webdriver.common.action_chains import ActionChains +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.by import By + +STATE_ABBR = 'AL' +STATE = 'Alabama' + +URL = 'https://dph1.adph.state.al.us/covid-19/' + +FILE_NAME = 'COVID-19 in Alabama.xlsx' + +def scraper(): + counties = [] + + # You will need a WebDriver for Edge. See https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/ + + try: + + browser = webdriver.Edge("msedgedriver.exe") + browser.get(URL) + + file_path = pathlib.Path.home().joinpath('Downloads', FILE_NAME) + + if os.path.isfile(file_path): + print(" FAILED on ", STATE, " : Please delete ", file_path, " and start the process over. 
This file must not exist prior to running the scrape operation.") + + download_link = WebDriverWait(browser, 30).until(EC.presence_of_element_located((By.XPATH, '/html/body/div[2]/div[2]/div/div[1]/div/div[1]/a[2]'))) + download_link.click() + + time.sleep(4) + + wb = openpyxl.load_workbook(filename=file_path) + + sheet = wb.worksheets[0] + + counties = [] + + max_rows = sheet.max_row + + for i in range(2, max_rows): + rowCount = str(i) + # print(rowCount) + county_name = sheet['A' + rowCount].value + + + if county_name == None or len(county_name) == 0: + continue + + confirmed = sheet['B' + rowCount].value + deaths = sheet['D' + rowCount].value + + county = county_report.CountyReport(STATE, county_name, (int)(confirmed), (int)(deaths), -1, -1, datetime.datetime.now()) + counties.append(county) # append the countyReport to our list of counties + + wb.close() + + except: + print("Unexpected error:", sys.exc_info()[0]) + + browser.quit() + + os.remove(file_path) + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport \ No newline at end of file diff --git a/src/ar_scraper.py b/src/ar_scraper.py new file mode 100644 index 0000000..692c30a --- /dev/null +++ b/src/ar_scraper.py @@ -0,0 +1,44 @@ +import requests, bs4, datetime +import county_report, state_report + +STATE_ABBR = 'AR' +STATE = 'Arkansas' + +def scraper(): + # make an HTTP web request to get the AR data + response = requests.get('https://www.healthy.arkansas.gov/programs-services/topics/covid-19-county-data') + + if response.status_code == requests.codes.ok: + # Success - print to the console that the HTTP request succeeeded + print(' ', STATE_ABBR, ': Downloaded succeeded') + + table = bs4.BeautifulSoup(response.text, features="html.parser").select('table tr') + + counties = [] + + for i in range (1, 75): + row = table[i].find_all('td') + county_name = row[0].find('p').getText() + confirmed = int(row[1].find('p').getText()) + deaths = int(row[3].find('p').getText()) + + county = county_report.CountyReport(STATE, county_name, confirmed, deaths, -1, -1, datetime.datetime.now()) + counties.append(county) + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport + + else: + # Fail + print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ', response.status_code) + +def findCounty(county_name, counties): + for county in counties: + if county.county == county_name: + return county diff --git a/src/ca_scraper.py b/src/ca_scraper.py new file mode 100644 index 0000000..ddc74d0 --- /dev/null +++ b/src/ca_scraper.py @@ -0,0 +1,70 @@ +import requests, io, csv, datetime +import county_report, state_report + +STATE_ABBR = 'CA' +STATE = 'California' + +def scraper(): + # make an HTTP web request to get the CA CSV file + response = requests.get('https://data.ca.gov/dataset/590188d5-8545-4c93-a9a0-e230f0db7290/resource/926fd08f-cc91-4828-af38-bd45de97f8c3/download/statewide_cases.csv') + + if response.status_code == requests.codes.ok: + # Success - print 
to the console that the HTTP request succeeeded + print(' ', STATE_ABBR, ': Downloaded succeeded') + + csvData = response.text + + # read the in-memory string using the 'csv' module so we can iterate over each row + csvReader = csv.reader(csvData.splitlines(), delimiter=',', quotechar='"') + + # create a list that will contain our county data + counties = [] + + # iterate over every row in the CSV + for row in csvReader: + # skip the header row + if row[0] == 'county': + continue + + county_name = row[0] + confirmedStr = row[1] + confirmed = 0 + if '.' in confirmedStr: + confirmed = int(float(confirmedStr)) + elif len(confirmedStr) > 0: + confirmed = int(confirmedStr) + + deathsStr = row[2] + deaths = 0 + if '.' in deathsStr: + deaths = int(float(deathsStr)) + elif len(deathsStr) > 0: + deaths = int(deathsStr) + + county = findCounty(county_name, counties) + + if county == None: + county = county_report.CountyReport(STATE, county_name, confirmed, deaths, -1, -1, datetime.datetime.now()) + counties.append(county) # append the countyReport to our list of counties + else: + county.confirmed = confirmed + county.deaths = deaths + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport + + else: + # Fail + print(' ', STATE_ABBR, ': ERROR : Download failed - HTTP status code ', response.status_code) + + +def findCounty(county_name, counties): + for county in counties: + if county.county == county_name: + return county \ No newline at end of file diff --git a/src/co_scraper.py b/src/co_scraper.py new file mode 100644 index 0000000..39deca2 --- /dev/null +++ b/src/co_scraper.py @@ -0,0 +1,58 @@ +import requests, json, io, datetime +import county_report, state_report + +STATE_ABBR = 'CO' +STATE = 'Colorado' + +def scraper(): + # make an HTTP web request to get the CO Json + response = requests.get('https://services3.arcgis.com/66aUo8zsujfVXRIT/arcgis/rest/services/colorado_covid19_county_statistics_cumulative/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json') + + if response.status_code == requests.codes.ok: + # Success - print to the console that the HTTP request succeeeded + print(' ', STATE_ABBR, ': Downloaded succeeded') + + jsonPayload = json.loads(response.text) + features = jsonPayload['features'] + + counties = [] + + for feature in features: + attribute = feature['attributes'] + + county_name = attribute['LABEL'] + + county = findCounty(county_name, counties) + + if county == None: + county = county_report.CountyReport(STATE, county_name, 0, 0, -1, -1, datetime.datetime.now()) + counties.append(county) + + metric = attribute['Metric'] + + if metric == 'Cases': + confirmed = int(attribute['Value']) + county.confirmed = confirmed + + if metric == 'Deaths': + deaths = int(attribute['Value']) + county.deaths = deaths + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport + + + else: + # Fail + print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ', response.status_code) + +def 
findCounty(county_name, counties): + for county in counties: + if county.county == county_name: + return county diff --git a/src/config/nv_post_body.json b/src/config/nv_post_body.json new file mode 100644 index 0000000..32335ad --- /dev/null +++ b/src/config/nv_post_body.json @@ -0,0 +1 @@ +{"version":"1.0.0","queries":[{"Query":{"Commands":[{"SemanticQueryDataShapeCommand":{"Query":{"Version":2,"From":[{"Name":"c","Entity":"County","Type":0},{"Name":"s","Entity":"Sheet1","Type":0}],"Select":[{"Column":{"Expression":{"SourceRef":{"Source":"c"}},"Property":"County"},"Name":"County.County"},{"Aggregation":{"Expression":{"Column":{"Expression":{"SourceRef":{"Source":"c"}},"Property":"Deaths"}},"Function":0},"Name":"Sum(County.Deaths)"},{"Aggregation":{"Expression":{"Column":{"Expression":{"SourceRef":{"Source":"c"}},"Property":"Tests"}},"Function":0},"Name":"Sum(County.Tests)"},{"Aggregation":{"Expression":{"Column":{"Expression":{"SourceRef":{"Source":"c"}},"Property":"Total Cases"}},"Function":0},"Name":"Sum(County.Total Cases)"},{"Aggregation":{"Expression":{"Column":{"Expression":{"SourceRef":{"Source":"c"}},"Property":"Death Rate per 100,000"}},"Function":0},"Name":"Min(County.Death Rate per 100,000)"},{"Aggregation":{"Expression":{"Column":{"Expression":{"SourceRef":{"Source":"c"}},"Property":"Case Rate per 100,000"}},"Function":0},"Name":"Sum(County.Case Rate per 100,000)"},{"Aggregation":{"Expression":{"Column":{"Expression":{"SourceRef":{"Source":"c"}},"Property":"Positivity Rate"}},"Function":0},"Name":"CountNonNull(County.Positivity Rate)"}],"Where":[{"Condition":{"In":{"Expressions":[{"Column":{"Expression":{"SourceRef":{"Source":"s"}},"Property":"RESULT"}}],"Values":[[{"Literal":{"Value":"'Total People Tested'"}}]]}}}],"OrderBy":[{"Direction":2,"Expression":{"Aggregation":{"Expression":{"Column":{"Expression":{"SourceRef":{"Source":"c"}},"Property":"Case Rate per 100,000"}},"Function":0}}}]},"Binding":{"Primary":{"Groupings":[{"Projections":[0,1,2,3,4,5,6]}]},"DataReduction":{"DataVolume":4,"Primary":{"Top":{}}},"Aggregates":[{"Select":5,"Aggregations":[{"Min":{}},{"Max":{}}]}],"SuppressedJoinPredicates":[1,2,3,4,6],"Version":1}}}]},"CacheKey":"{\"Commands\":[{\"SemanticQueryDataShapeCommand\":{\"Query\":{\"Version\":2,\"From\":[{\"Name\":\"c\",\"Entity\":\"County\",\"Type\":0},{\"Name\":\"s\",\"Entity\":\"Sheet1\",\"Type\":0}],\"Select\":[{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"c\"}},\"Property\":\"County\"},\"Name\":\"County.County\"},{\"Aggregation\":{\"Expression\":{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"c\"}},\"Property\":\"Deaths\"}},\"Function\":0},\"Name\":\"Sum(County.Deaths)\"},{\"Aggregation\":{\"Expression\":{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"c\"}},\"Property\":\"Tests\"}},\"Function\":0},\"Name\":\"Sum(County.Tests)\"},{\"Aggregation\":{\"Expression\":{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"c\"}},\"Property\":\"Total Cases\"}},\"Function\":0},\"Name\":\"Sum(County.Total Cases)\"},{\"Aggregation\":{\"Expression\":{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"c\"}},\"Property\":\"Death Rate per 100,000\"}},\"Function\":0},\"Name\":\"Min(County.Death Rate per 100,000)\"},{\"Aggregation\":{\"Expression\":{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"c\"}},\"Property\":\"Case Rate per 100,000\"}},\"Function\":0},\"Name\":\"Sum(County.Case Rate per 
100,000)\"},{\"Aggregation\":{\"Expression\":{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"c\"}},\"Property\":\"Positivity Rate\"}},\"Function\":0},\"Name\":\"CountNonNull(County.Positivity Rate)\"}],\"Where\":[{\"Condition\":{\"In\":{\"Expressions\":[{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"s\"}},\"Property\":\"RESULT\"}}],\"Values\":[[{\"Literal\":{\"Value\":\"'Total People Tested'\"}}]]}}}],\"OrderBy\":[{\"Direction\":2,\"Expression\":{\"Aggregation\":{\"Expression\":{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"c\"}},\"Property\":\"Case Rate per 100,000\"}},\"Function\":0}}}]},\"Binding\":{\"Primary\":{\"Groupings\":[{\"Projections\":[0,1,2,3,4,5,6]}]},\"DataReduction\":{\"DataVolume\":4,\"Primary\":{\"Top\":{}}},\"Aggregates\":[{\"Select\":5,\"Aggregations\":[{\"Min\":{}},{\"Max\":{}}]}],\"SuppressedJoinPredicates\":[1,2,3,4,6],\"Version\":1}}}]}","QueryId":"","ApplicationContext":{"DatasetId":"23b35406-eaa1-4c4c-9270-c9c5978432c6","Sources":[{"ReportId":"18636c78-00fa-41b0-8364-136fc9a8041e"}]}}],"cancelQueries":[],"modelId":272235} \ No newline at end of file diff --git a/src/county_report.py b/src/county_report.py new file mode 100644 index 0000000..b888c72 --- /dev/null +++ b/src/county_report.py @@ -0,0 +1,17 @@ +class CountyReport: + state = '' + county = '' + confirmed = -1 + deaths = -1 + hospitalizations = -1 + caserate = -1 + timestamp = '' + + def __init__(self, state, county, confirmed, deaths, hospitalizations, caserate, timestamp): + self.state = state + self.county = county + self.confirmed = confirmed + self.deaths = deaths + self.hospitalizations = hospitalizations + self.caserate = caserate + self.timestamp = timestamp \ No newline at end of file diff --git a/src/ct_scraper.py b/src/ct_scraper.py new file mode 100644 index 0000000..1389c8f --- /dev/null +++ b/src/ct_scraper.py @@ -0,0 +1,57 @@ +import requests, json, io, datetime +import county_report, state_report + +STATE_ABBR = 'CT' +STATE = 'Connecticut' + +def scraper(): + # make an HTTP web request to get the CT Json file + response = requests.get('https://data.ct.gov/resource/bfnu-rgqt.json') + + if response.status_code == requests.codes.ok: + # Success - print to the console that the HTTP request succeeeded + print(' ', STATE_ABBR, ': Downloaded succeeded') + + jsonPayload = json.loads(response.text) + + counties = [] + + for item in jsonPayload: + county_name = item['county'] + + confirmed = 0 + if 'confirmedcases' in item: + confirmed = int(item['confirmedcases']) + + hospitalizations = 0 + if 'hospitalization' in item: + hospitalizations = int(item['hospitalization']) + + deaths = 0 + if 'confirmeddeaths' in item: + deaths = int(item['confirmeddeaths']) + + county = findCounty(county_name, counties) + + if county == None: + county = county_report.CountyReport(STATE, county_name, (int)(confirmed), (int)(deaths), -1, -1, datetime.datetime.now()) + counties.append(county) + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport + + + else: + # Fail + print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ', response.status_code) + +def findCounty(county_name, counties): + for county in counties: + if county.county == county_name: + return county diff --git 
a/src/de_scraper.py b/src/de_scraper.py new file mode 100644 index 0000000..b055eca --- /dev/null +++ b/src/de_scraper.py @@ -0,0 +1,55 @@ +import requests, json, io, datetime, pathlib, sys, time, os, csv +from io import StringIO +import county_report, state_report +from selenium import webdriver +from selenium.webdriver.common.keys import Keys +from selenium.webdriver.common.action_chains import ActionChains +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.by import By + +STATE_ABBR = 'DE' +STATE = 'Delaware' + +URL = 'https://myhealthycommunity.dhss.delaware.gov/embed/covid19/bg/white' + +def scraper(): + counties = [] + + # You will need a WebDriver for Edge. See https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/ + + try: + + browser = webdriver.Edge("msedgedriver.exe") + browser.get(URL) + + county1 = WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, '/html/body/div/div/section/div/article/div/div[1]/div[2]/div[2]/div[2]/div/div/div[2]/div/div[1]/a/div[2]/span'))) + county2 = WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, '/html/body/div/div/section/div/article/div/div[1]/div[2]/div[2]/div[2]/div/div/div[2]/div/div[2]/a/div[2]/span'))) + county3 = WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, '/html/body/div/div/section/div/article/div/div[1]/div[2]/div[2]/div[2]/div/div/div[2]/div/div[3]/a/div[2]/span'))) + + county1_cases = county1.text.replace(',', '') + county2_cases = county2.text.replace(',', '') + county3_cases = county3.text.replace(',', '') + + county = county_report.CountyReport(STATE, 'New Castle', int(county1_cases), -1, -1, -1, datetime.datetime.now()) + counties.append(county) + + county = county_report.CountyReport(STATE, 'Kent', int(county2_cases), -1, -1, -1, datetime.datetime.now()) + counties.append(county) + + county = county_report.CountyReport(STATE, 'Sussex', int(county3_cases), -1, -1, -1, datetime.datetime.now()) + counties.append(county) + + except: + print("Unexpected error:", sys.exc_info()[0]) + + browser.quit() + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport \ No newline at end of file diff --git a/src/fl_scraper.py b/src/fl_scraper.py new file mode 100644 index 0000000..8636ab2 --- /dev/null +++ b/src/fl_scraper.py @@ -0,0 +1,58 @@ +import requests, json, io, datetime +import county_report, state_report + +STATE_ABBR = 'FL' +STATE = 'Florida' + +def scraper(): + # make an HTTP web request to get the FL Json file + response = requests.get('https://opendata.arcgis.com/datasets/a7887f1940b34bf5a02c6f7f27a5cb2c_0.geojson') + + if response.status_code == requests.codes.ok: + # Success - print to the console that the HTTP request succeeeded + print(' ', STATE_ABBR, ': Downloaded succeeded') + + jsonPayload = json.loads(response.text) + items = jsonPayload['features'] + + counties = [] + + for item in items: + attributes = item['properties'] + county_name = attributes['County_1'] + + if county_name == 'State': # this is FL's total, so skip + continue + + confirmedStr = attributes['CasesAll'] + confirmed = int(confirmedStr) + + deathsStr = attributes['Deaths'] + 
deaths = int(deathsStr) + + hospitalizationsResStr = attributes['C_HospYes_Res'] # hospitalizations - Florida residents + hospitalizationsRes = int(hospitalizationsResStr) + + hospitalizationsNonResStr = attributes['C_HospYes_NonRes'] # hospitalizations - Florida residents + hospitalizationsNonRes = int(hospitalizationsNonResStr) + + county = county_report.CountyReport(STATE, county_name, confirmed, deaths, hospitalizationsRes + hospitalizationsNonRes, -1, datetime.datetime.now()) + counties.append(county) + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport + + else: + # Fail + print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ', response.status_code) + +def findCounty(county_name, counties): + for county in counties: + if county.county == county_name: + return county diff --git a/src/ga_scraper.py b/src/ga_scraper.py new file mode 100644 index 0000000..953bc85 --- /dev/null +++ b/src/ga_scraper.py @@ -0,0 +1,51 @@ +import requests, zipfile, io, csv, datetime +import county_report, state_report + +STATE_ABBR = 'GA' +STATE = 'Georgia' + +def scraper(): + # make an HTTP web request to get the GA ZIP file + response = requests.get('https://ga-covid19.ondemand.sas.com/docs/ga_covid_data.zip') + + if response.status_code == requests.codes.ok: + # Success - print to the console that the HTTP request succeeeded + print(' ', STATE_ABBR, ': ZIP downloaded succeeded') + + # read ZIP into memory + z = zipfile.ZipFile(io.BytesIO(response.content)) + + # extract the CSV file from the ZIP file into an in-memory byte array + csvDataBytes = z.read('countycases.csv') + + # convert the byte array into a string so we can read it as a CSV file + csvData = csvDataBytes.decode(encoding='UTF-8') + + # read the in-memory string using the 'csv' module so we can iterate over each row + csvReader = csv.reader(csvData.splitlines(), delimiter=',', quotechar='"') + + # create a list that will contain our county data + counties = [] + + # iterate over every row in the CSV + for row in csvReader: + # skip the header row + if row[0] == 'county_resident': + continue + + # take the row we're iterating over and build a countyReport object out of it - this has the confirmed cases, deaths, etc that we're interested in + county = county_report.CountyReport(STATE, row[0], (int)(row[1]), (int)(row[2]), (int)(row[3]), (float)(row[4]), datetime.datetime.now()) + counties.append(county) # append the countyReport to our list of counties + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport + + else: + # Fail + print(' ', STATE_ABBR, ': ERROR : ZIP download failed - HTTP status code ', response.status_code) diff --git a/src/id_scraper.py b/src/id_scraper.py new file mode 100644 index 0000000..92cdcf2 --- /dev/null +++ b/src/id_scraper.py @@ -0,0 +1,82 @@ +import requests, json, io, datetime, pathlib, sys, time, os, csv +from io import StringIO +import county_report, state_report +from selenium import webdriver +from 
selenium.webdriver.common.keys import Keys +from selenium.webdriver.common.action_chains import ActionChains +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.by import By + +STATE_ABBR = 'ID' +STATE = 'Idaho' + +URL = 'https://public.tableau.com/views/DPHIdahoCOVID-19Dashboard_V2/Story1?%3Aembed=y&%3AshowVizHome=no&%3Adisplay_count=y&%3Adisplay_static_image=y&%3AbootstrapWhenNotified=true&%3Alanguage=en&:embed=y&:showVizHome=n&:apiID=host0#navType=0&navSrc=Parse' + +def scraper(): + counties = [] + + # You will need a WebDriver for Edge. See https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/ + + try: + + browser = webdriver.Edge("msedgedriver.exe") + browser.get(URL) + + file_path = pathlib.Path.home().joinpath('Downloads', 'County Table.csv') + + if os.path.isfile(file_path): + print(" FAILED on ", STATE, " : Please delete ", file_path, " and start the process over. This file must not exist prior to running the scrape operation.") + + tab_link = browser.find_element_by_xpath('/html/body/div[2]/div[3]/div[1]/div[1]/div/div[2]/div[4]/div/div/div/span[2]/div/span/span/span[3]/div[2]/div/div[1]/span/div') + tab_link.click() + + download_link = browser.find_element_by_xpath('/html/body/div[2]/div[3]/div[2]/div[1]/div[2]/div[5]') + download_link.click() + + crosstab_link = WebDriverWait(browser, 30).until(EC.presence_of_element_located((By.XPATH, '/html/body/div[6]/div/div/div/div/div[2]/div/button[3]'))) + crosstab_link.click() + + csv_link = WebDriverWait(browser, 30).until(EC.presence_of_element_located((By.XPATH, '/html/body/div[7]/div/div/div/div/div[2]/div/div[2]/div[2]/div/label[2]'))) + csv_link.click() + + download_button = WebDriverWait(browser, 30).until(EC.presence_of_element_located((By.XPATH, '/html/body/div[7]/div/div/div/div/div[2]/div/div[3]/button'))) + download_button.click() + time.sleep(2) + + with open(file_path, 'rt', encoding='utf-16-le') as file_contents: + data = file_contents.read() + infile = StringIO(data) + + with open(file_path) as csv_file: + csv_reader = csv.reader(data.splitlines(), delimiter='\t', quotechar='"') + + for row in csv_reader: + # print(row) + county_name = row[0] + if county_name == 'County' or row[1] == 'Public Health District' or row[2] == 'Confirmed Cases': + continue + + confirmed = row[2].replace(',', '') + deaths = row[7].replace(',', '') + if len(deaths) == 0: + deaths = '0' + + county = county_report.CountyReport(STATE, county_name, (int)(confirmed), (int)(deaths), -1, -1, datetime.datetime.now()) + counties.append(county) + + except: + print("Unexpected error:", sys.exc_info()[0]) + + browser.quit() + + os.remove(file_path) + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport \ No newline at end of file diff --git a/src/il_scraper.py b/src/il_scraper.py new file mode 100644 index 0000000..47d2afa --- /dev/null +++ b/src/il_scraper.py @@ -0,0 +1,78 @@ +import requests, json, io, datetime, pathlib, sys, time, os, csv +from io import StringIO +import county_report, state_report +from selenium import webdriver +from selenium.webdriver.common.keys import Keys +from selenium.webdriver.common.action_chains import ActionChains 
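+# The WebDriverWait / expected_conditions imports below are used to wait for the
+# county-level table and its pagination controls to finish rendering before the
+# rows are read.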
+from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.by import By + +STATE_ABBR = 'IL' +STATE = 'Illinois' + +URL = 'https://www.dph.illinois.gov/covid19/covid19-statistics' + +def get_row_data(table): + for row in table: + yield [td.text for td in row.find_elements_by_xpath(".//td")] + +def scraper(): + counties = [] + + # You will need a WebDriver for Edge. See https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/ + + try: + + browser = webdriver.Edge("msedgedriver.exe") + browser.get(URL) + + time.sleep(4) + + county_link = WebDriverWait(browser, 30).until(EC.presence_of_element_located((By.XPATH, '/html/body/div[1]/div[3]/div/article/div/div/div/ul[1]/li[1]/a'))) + county_link.click() + + time.sleep(4) + + all_link = WebDriverWait(browser, 30).until(EC.presence_of_element_located((By.XPATH, '//*[@id="pagin"]/li[12]/a'))) + all_link.click() + + time.sleep(2) + + county_table = WebDriverWait(browser, 30).until(EC.presence_of_element_located((By.XPATH, '/html/body/div[1]/div[3]/div/article/div/div/div/table/tbody'))) + + time.sleep(2) + # print(county_table) + + htmlRows = county_table.find_elements_by_xpath(".//tr") + + # print(htmlRows) + rows = get_row_data(htmlRows) + # print(rows) + + for row in rows: + # print(row) + county_name = row[0] + + if county_name == 'Illinois': + continue + + confirmed = int(row[2]) + deaths = int(row[3]) + county = county_report.CountyReport(STATE, county_name, confirmed, deaths, -1, -1, datetime.datetime.now()) + counties.append(county) + + + except: + print("Unexpected error:", sys.exc_info()[0]) + + browser.quit() + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport \ No newline at end of file diff --git a/src/in_scraper.py b/src/in_scraper.py new file mode 100644 index 0000000..7b04a4b --- /dev/null +++ b/src/in_scraper.py @@ -0,0 +1,46 @@ +import requests, openpyxl, io, csv, datetime +import county_report, state_report + +STATE_ABBR = 'IN' +STATE = 'Indiana' +URL = 'https://hub.mph.in.gov/dataset/89cfa2e3-3319-4d31-a60d-710f76856588/resource/8b8e6cd7-ede2-4c41-a9bd-4266df783145/download/covid_report_county.xlsx' + +def scraper(): + # make an HTTP web request to get the file + response = requests.get(URL) + + if response.status_code == requests.codes.ok: + # Success - print to the console that the HTTP request succeeeded + print(' ', STATE_ABBR, ': Downloaded succeeded') + + data = io.BytesIO(response.content) + + wb = openpyxl.load_workbook(filename=data, read_only=True, data_only=True) + + sheet = wb.worksheets[0] + + counties = [] + + for i in range(2, 94): + rowCount = str(i) + + county = sheet['E' + rowCount].value + confirmed = sheet['B' + rowCount].value + deaths = sheet['C' + rowCount].value + + county = county_report.CountyReport(STATE, county, (int)(confirmed), (int)(deaths), -1, -1, datetime.datetime.now()) + counties.append(county) # append the countyReport to our list of counties + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, 
STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport + + + else: + # Fail + print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ', response.status_code) \ No newline at end of file diff --git a/src/la_scraper.py b/src/la_scraper.py new file mode 100644 index 0000000..15aae06 --- /dev/null +++ b/src/la_scraper.py @@ -0,0 +1,70 @@ +import requests, openpyxl, io, os, datetime, pathlib +import county_report, state_report + +STATE_ABBR = 'LA' +STATE = 'Louisiana' +URL = 'https://ldh.la.gov/assets/oph/Coronavirus/data/LA_COVID_TESTBYDAY_PARISH_PUBLICUSE.xlsx' + +def scraper(): + # make an HTTP web request to get the file + response = requests.get(URL) + + if response.status_code == requests.codes.ok: + # Success - print to the console that the HTTP request succeeeded + print(' ', STATE_ABBR, ': Downloaded succeeded') + + # Writing the XLSX to disk makes the loop below orders of magnitude faster + # versus keeping the XLSX doc in-memory, so we create a temp folder and download + # the file there. + temppath = 'temp' + if not os.path.exists(temppath): + os.makedirs(temppath) + + tempfilename = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S") + '_temp_' + STATE_ABBR + '.xlsx' + tempfilepath = pathlib.Path.cwd().joinpath('temp', tempfilename) + + with open(tempfilepath, "wb") as file: + file.write(response.content) + + wb = openpyxl.load_workbook(filename=tempfilepath) + + sheet = wb.worksheets[0] + + parishes = [] + parishesDictionary = {} + + max_rows = sheet.max_row + + for i in range(2, max_rows): + rowCount = str(i) + # print(rowCount) + parish_name = sheet['B' + rowCount].value + confirmed = sheet['F' + rowCount].value + + parish = findParish(parish_name, parishesDictionary) + + if parish == None: + parish = county_report.CountyReport(STATE, parish_name, (int)(confirmed), -1, -1, -1, datetime.datetime.now()) + parishes.append(parish) + parishesDictionary[parish_name] = parish + else: + parish.confirmed += int(confirmed) + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(parishes), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, parishes, datetime.datetime.now()) + + # return the state-level report + return stateReport + + else: + # Fail + print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ', response.status_code) + +def findParish(parish_name, parishesDictionary): + if parish_name in parishesDictionary: + return parishesDictionary[parish_name] + else: + return None \ No newline at end of file diff --git a/src/main.py b/src/main.py new file mode 100644 index 0000000..3189732 --- /dev/null +++ b/src/main.py @@ -0,0 +1,135 @@ +import csv, datetime, requests, zipfile, pathlib, os +import ak_scraper +import al_scraper +import ar_scraper +import ca_scraper +import co_scraper +import ct_scraper +import de_scraper +import fl_scraper +import ga_scraper +import id_scraper +import in_scraper +import il_scraper +import la_scraper +import mi_scraper +import mn_scraper +import ms_scraper +import mo_scraper +import mt_scraper +import ne_scraper +import nv_scraper +import nm_scraper +import nyc_scraper +import nc_scraper +import ok_scraper +import oh_scraper +import sc_scraper +import tn_scraper +import tx_scraper +import va_scraper +import vt_scraper +import wi_scraper + +def installEdgeDriver(): + + driver_path = pathlib.Path("msedgedriver.exe") + if 
driver_path.is_file(): + pass + else: + response = requests.get('https://msedgedriver.azureedge.net/84.0.524.0/edgedriver_win64.zip', stream=True) + + save_path = pathlib.Path.cwd().joinpath('temp', 'edgedriver_win64.zip') + with open(save_path, 'wb') as fd: + for chunk in response.iter_content(chunk_size=128): + fd.write(chunk) + + with zipfile.ZipFile(save_path, 'r') as zipObj: + zipObj.extract('msedgedriver.exe', path=None, pwd=None) + +def installGeckoDriver(): + driver_path = pathlib.Path("geckodriver.exe") + if driver_path.is_file(): + pass + else: + response = requests.get('https://github.com/mozilla/geckodriver/releases/download/v0.27.0/geckodriver-v0.27.0-win64.zip', stream=True) + + save_path = pathlib.Path.cwd().joinpath('temp', 'geckodriver-v0.27.0-win64.zip') + with open(save_path, 'wb') as fd: + for chunk in response.iter_content(chunk_size=128): + fd.write(chunk) + + with zipfile.ZipFile(save_path, 'r') as zipObj: + zipObj.extract('geckodriver.exe', path=None, pwd=None) + +def main(): + + installEdgeDriver() + installGeckoDriver() + + reports = [] + + scrapers = [] + scrapers.append(ak_scraper) + scrapers.append(al_scraper) + scrapers.append(ar_scraper) + scrapers.append(ca_scraper) + scrapers.append(co_scraper) + scrapers.append(ct_scraper) + scrapers.append(de_scraper) + scrapers.append(fl_scraper) + scrapers.append(ga_scraper) + scrapers.append(id_scraper) + scrapers.append(in_scraper) + scrapers.append(il_scraper) + scrapers.append(la_scraper) + scrapers.append(mi_scraper) + scrapers.append(mn_scraper) + scrapers.append(ms_scraper) + scrapers.append(mo_scraper) + scrapers.append(mt_scraper) + scrapers.append(ne_scraper) + scrapers.append(nv_scraper) + scrapers.append(nm_scraper) + scrapers.append(nyc_scraper) + scrapers.append(nc_scraper) + scrapers.append(oh_scraper) + scrapers.append(ok_scraper) + scrapers.append(sc_scraper) + scrapers.append(tn_scraper) + scrapers.append(tx_scraper) + scrapers.append(va_scraper) + scrapers.append(vt_scraper) + scrapers.append(wi_scraper) + + + for scraper in scrapers: + print('Starting scrape for' , scraper.STATE, '...') + report = scraper.scraper() + print(scraper.STATE, ' report generated at ', report.timestamp) + reports.append(report) + + counties = [] + for report in reports: + for county in report.counties: + counties.append(county) + + # The following section writes the final CSV output containing all counties for all states + + # First, create an /output folder that will store the final .csv containing all the counties + outputpath = 'output' + if not os.path.exists(outputpath): + os.makedirs(outputpath) + + # Second, determine the filename to use - we'll include a timestamp in the filename itself that explains when the file was generated by our Python 3 code + filename = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S") + '_all_counties.csv' + filepath = pathlib.Path.cwd().joinpath('output', filename) + + # Third, do the actual writing of the state reports to a CSV file + with open(filepath, 'w', newline='') as csv_file: + writer = csv.writer(csv_file) + + for county in counties: + writer.writerow([ county.state, county.county, county.confirmed, county.deaths, county.hospitalizations, county.timestamp ]) + +main() \ No newline at end of file diff --git a/src/mi_scraper.py b/src/mi_scraper.py new file mode 100644 index 0000000..4843713 --- /dev/null +++ b/src/mi_scraper.py @@ -0,0 +1,47 @@ +import requests, openpyxl, io, csv, datetime +import county_report, state_report + +STATE_ABBR = 'MI' +STATE = 'Michigan' +URL = 
'https://www.michigan.gov/documents/coronavirus/Cases_and_Deaths_by_County_2020-07-24_697248_7.xlsx' + +def scraper(): + # make an HTTP web request to get the MI XLSX file + response = requests.get(URL) + + if response.status_code == requests.codes.ok: + # Success - print to the console that the HTTP request succeeeded + print(' ', STATE_ABBR, ': Downloaded succeeded') + + data = io.BytesIO(response.content) + + wb = openpyxl.load_workbook(filename=data, read_only=True, data_only=True) + + sheet = wb.worksheets[0] + + counties = [] + + for i in range(2, 169): + rowCount = str(i) + status = sheet['B' + rowCount].value + if status == 'Confirmed': + county = sheet['A' + rowCount].value + confirmed = sheet['C' + rowCount].value + deaths = sheet['D' + rowCount].value + + county = county_report.CountyReport(STATE, county, (int)(confirmed), (int)(deaths), -1, -1, datetime.datetime.now()) + counties.append(county) # append the countyReport to our list of counties + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport + + + else: + # Fail + print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ', response.status_code) \ No newline at end of file diff --git a/src/mn_scraper.py b/src/mn_scraper.py new file mode 100644 index 0000000..771b674 --- /dev/null +++ b/src/mn_scraper.py @@ -0,0 +1,49 @@ +import requests, bs4, datetime +import county_report, state_report + +STATE_ABBR = 'MN' +STATE = 'Minnesota' + +def scraper(): + # make an HTTP web request to get the source information + response = requests.get('https://www.health.state.mn.us/diseases/coronavirus/situation.html') + + if response.status_code == requests.codes.ok: + # Success - print to the console that the HTTP request succeeeded + print(' ', STATE_ABBR, ': Downloaded succeeded') + + table = bs4.BeautifulSoup(response.text, features="html.parser").select('#maptable tr') + + counties = [] + + for item in table: + + row = item.find_all('td') + + if len(row) == 0: + continue + + county_name = row[0].text + confirmed = int(row[1].text.replace(',', '')) + deaths = int(row[2].text.replace(',', '')) + + county = county_report.CountyReport(STATE, county_name, confirmed, deaths, -1, -1, datetime.datetime.now()) + counties.append(county) + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport + + else: + # Fail + print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ', response.status_code) + +def findCounty(county_name, counties): + for county in counties: + if county.county == county_name: + return county diff --git a/src/mo_scraper.py b/src/mo_scraper.py new file mode 100644 index 0000000..24ce13e --- /dev/null +++ b/src/mo_scraper.py @@ -0,0 +1,46 @@ +import requests, json, io, datetime +import county_report, state_report + +STATE_ABBR = 'MO' +STATE = 'Missouri' + +def scraper(): + # make an HTTP web request to get the data + response = 
requests.get('https://services6.arcgis.com/Bd4MACzvEukoZ9mR/arcgis/rest/services/county_MOHSIS_map/FeatureServer/0/query?where=1%3D1&outFields=OBJECTID,NAME,NAME2,CASES,DEATHS&returnGeometry=false&outSR=4326&f=json') + + if response.status_code == requests.codes.ok: + # Success - print to the console that the HTTP request succeeeded + print(' ', STATE_ABBR, ': Downloaded succeeded') + + jsonPayload = json.loads(response.text) + features = jsonPayload['features'] + + counties = [] + + for feature in features: + attribute = feature['attributes'] + + county_name = attribute['NAME'] + confirmed = int(attribute['CASES']) + deaths = int(attribute['DEATHS']) + + county = county_report.CountyReport(STATE, county_name, confirmed, deaths, -1, -1, datetime.datetime.now()) + counties.append(county) + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport + + else: + # Fail + print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ', response.status_code) + +def findCounty(county_name, counties): + for county in counties: + if county.county == county_name: + return county diff --git a/src/ms_scraper.py b/src/ms_scraper.py new file mode 100644 index 0000000..8b08dd2 --- /dev/null +++ b/src/ms_scraper.py @@ -0,0 +1,46 @@ +import requests, bs4, datetime +import county_report, state_report + +STATE_ABBR = 'MS' +STATE = 'Mississippi' + +def scraper(): + # make an HTTP web request to get the source information + response = requests.get('https://msdh.ms.gov/msdhsite/_static/14,0,420.html') + + if response.status_code == requests.codes.ok: + # Success - print to the console that the HTTP request succeeeded + print(' ', STATE_ABBR, ': Downloaded succeeded') + + table = bs4.BeautifulSoup(response.text, features="html.parser").select('#msdhTotalCovid-19Cases tbody tr') + + counties = [] + + for item in table: + + row = item.find_all('td') + + county_name = row[0].text + confirmed = int(row[1].text.replace(',', '').replace('*', '')) + deaths = int(row[2].text.replace(',', '').replace('*', '')) + + county = county_report.CountyReport(STATE, county_name, confirmed, deaths, -1, -1, datetime.datetime.now()) + counties.append(county) + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport + + else: + # Fail + print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ', response.status_code) + +def findCounty(county_name, counties): + for county in counties: + if county.county == county_name: + return county diff --git a/src/mt_scraper.py b/src/mt_scraper.py new file mode 100644 index 0000000..b909200 --- /dev/null +++ b/src/mt_scraper.py @@ -0,0 +1,59 @@ +import requests, bs4, datetime +import county_report, state_report + +STATE_ABBR = 'MT' +STATE = 'Montana' + +def scraper(): + # make an HTTP web request to get the source information + response = requests.get('https://dphhs.mt.gov/publichealth/cdepi/diseases/coronavirusmt/demographics') + + if response.status_code == requests.codes.ok: + # Success - print to 
the console that the HTTP request succeeeded + print(' ', STATE_ABBR, ': Downloaded succeeded') + + soup = bs4.BeautifulSoup(response.text, features="html.parser") + + table = soup.find_all("table", attrs={"summary" : "Cases by County"}) + + counties = [] + + for item in table[0].find_all('tr'): + + row = item.find_all('td') + + if len(row) == 0: + continue + + county_name = row[0].text + + if county_name == 'Total': + continue + + casesStr = row[1].text + deathsStr = row[2].text + + if len(casesStr) == 0 or casesStr == '' or casesStr == '\xa0' or casesStr == '\xa0\n\t': + casesStr = '0' + + if len(deathsStr) == 0 or casesStr == '' or deathsStr == '\xa0' or deathsStr == '\xa0\n\t': + deathsStr = '0' + + confirmed = int(casesStr) + deaths = int(deathsStr) + + county = county_report.CountyReport(STATE, county_name, confirmed, deaths, -1, -1, datetime.datetime.now()) + counties.append(county) + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport + + else: + # Fail + print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ', response.status_code) diff --git a/src/nc_scraper.py b/src/nc_scraper.py new file mode 100644 index 0000000..efb6abb --- /dev/null +++ b/src/nc_scraper.py @@ -0,0 +1,77 @@ +import requests, json, io, datetime, pathlib, sys, time, os, csv +from io import StringIO +import county_report, state_report +from selenium import webdriver +from selenium.webdriver.common.keys import Keys +from selenium.webdriver.common.action_chains import ActionChains +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.by import By + +STATE_ABBR = 'NC' +STATE = 'North Carolina' + +URL = 'https://public.tableau.com/views/NCDHHS_COVID-19_DataDownload/CountyCasesandDeaths?%3Aembed=y&%3B%3AshowVizHome=no&%3B%3Ahost_url=https%3A%2F%2Fpublic.tableau.com%2F&%3B%3Aembed_code_version=3&%3B%3Atabs=yes&%3B%3Atoolbar=no&%3B%3Aanimate_transition=yes&%3B%3Adisplay_static_image=no&%3B%3Adisplay_spinner=no&%3B%3Adisplay_overlay=yes&%3B%3Adisplay_count=no&%3Bpublish=yes&%3B%3AloadOrderID=0' + +def scraper(): + counties = [] + + # You will need a WebDriver for Edge. See https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/ + + try: + + browser = webdriver.Edge("msedgedriver.exe") + browser.get(URL) + + file_path = pathlib.Path.home().joinpath('Downloads', 'TABLE_COUNTY.csv') + + if os.path.isfile(file_path): + print(" FAILED on ", STATE, " : Please delete ", file_path, " and start the process over. 
This file must not exist prior to running the scrape operation.") + + download_link = browser.find_element_by_xpath('/html/body/div[2]/div[3]/div[2]/div[1]/div[2]/div[5]') + download_link.click() + + crosstab_link = WebDriverWait(browser, 30).until(EC.presence_of_element_located((By.XPATH, '/html/body/div[6]/div/div/div/div/div[2]/div/button[3]'))) + crosstab_link.click() + + csv_link = WebDriverWait(browser, 30).until(EC.presence_of_element_located((By.XPATH, '/html/body/div[7]/div/div/div/div/div[2]/div/div[2]/div[2]/div/label[2]'))) + csv_link.click() + + download_button = WebDriverWait(browser, 30).until(EC.presence_of_element_located((By.XPATH, '/html/body/div[7]/div/div/div/div/div[2]/div/div[3]/button'))) + download_button.click() + time.sleep(2) + + with open(file_path, 'rt', encoding='utf-16-le') as file_contents: + data = file_contents.read() + infile = StringIO(data) + + with open(file_path) as csv_file: + csv_reader = csv.reader(data.splitlines(), delimiter='\t', quotechar='"') + + for row in csv_reader: + # print(row) + county_name = row[0] + if county_name == 'County' or row[1] == 'Cases' or row[2] == 'Deaths': + continue + + confirmed = row[1].replace(',', '') + deaths = row[2].replace(',', '') + + county = county_report.CountyReport(STATE, county_name, (int)(confirmed), (int)(deaths), -1, -1, datetime.datetime.now()) + counties.append(county) + + except: + print("Unexpected error:", sys.exc_info()[0]) + + browser.quit() + + os.remove(file_path) + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport \ No newline at end of file diff --git a/src/ne_scraper.py b/src/ne_scraper.py new file mode 100644 index 0000000..04d849b --- /dev/null +++ b/src/ne_scraper.py @@ -0,0 +1,51 @@ +import requests, json, io, datetime +import county_report, state_report + +STATE_ABBR = 'NE' +STATE = 'Nebraska' + +def scraper(): + # make an HTTP web request to get the data + response = requests.get('https://services1.arcgis.com/0MSEUqKaxRlEPj5g/arcgis/rest/services/ncov_cases_US/FeatureServer/0/query?where=1%3D1&outFields=*&returnGeometry=false&outSR=4326&f=json') + + if response.status_code == requests.codes.ok: + # Success - print to the console that the HTTP request succeeeded + print(' ', STATE_ABBR, ': Downloaded succeeded') + + jsonPayload = json.loads(response.text) + features = jsonPayload['features'] + + counties = [] + + for feature in features: + attribute = feature['attributes'] + + state_name = attribute['Province_State'] + + if state_name != 'Nebraska': + continue + + county_name = attribute['Admin2'] + confirmed = int(attribute['Confirmed']) + deaths = int(attribute['Deaths']) + + county = county_report.CountyReport(STATE, county_name, confirmed, deaths, -1, -1, datetime.datetime.now()) + counties.append(county) + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport + + else: + # Fail + print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ', response.status_code) + +def 
findCounty(county_name, counties): + for county in counties: + if county.county == county_name: + return county diff --git a/src/nm_scraper.py b/src/nm_scraper.py new file mode 100644 index 0000000..c53bce8 --- /dev/null +++ b/src/nm_scraper.py @@ -0,0 +1,50 @@ +import requests, json, io, datetime, pathlib, sys +import county_report, state_report +from selenium import webdriver + +STATE_ABBR = 'NM' +STATE = 'New Mexico' + +URL = 'https://cvprovider.nmhealth.org/public-dashboard.html' + +def get_row_data(table): + for row in table: + yield [td.text for td in row.find_elements_by_xpath(".//td")] + +def scraper(): + counties = [] + + # You will need a WebDriver for Edge. See https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/ + + browser = webdriver.Edge("msedgedriver.exe") + browser.get(URL) + + try: + counties_link = browser.find_element_by_id('open-counties-table-modal') + counties_link.click() + + rootCountyDiv = browser.find_elements_by_class_name('counties-table') + htmlRows = rootCountyDiv[0].find_elements_by_xpath(".//tbody/tr") + + rows = get_row_data(htmlRows) + + for row in rows: + county_name = row[0] + confirmed = int(row[1]) + deaths = int(row[2]) + county = county_report.CountyReport(STATE, county_name, confirmed, deaths, -1, -1, datetime.datetime.now()) + counties.append(county) + + except: + print("Unexpected error:", sys.exc_info()[0]) + + browser.quit() + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport \ No newline at end of file diff --git a/src/nv_scraper.py b/src/nv_scraper.py new file mode 100644 index 0000000..e139eeb --- /dev/null +++ b/src/nv_scraper.py @@ -0,0 +1,69 @@ +# See https://app.powerbigov.us/view?r=eyJrIjoiMjA2ZThiOWUtM2FlNS00MGY5LWFmYjUtNmQwNTQ3Nzg5N2I2IiwidCI6ImU0YTM0MGU2LWI4OWUtNGU2OC04ZWFhLTE1NDRkMjcwMzk4MCJ9. +# This is a weird format of Json and it might be better to use a browser-based scrape instead of a Json download... 
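The header comment above is a fair warning: this endpoint returns PowerBI's compressed row format rather than plain JSON records, with the county rows buried under `results[0]['result']['data']['dsr']['DS'][0]['PH'][0]['DM0']`. The sketch below is illustrative only and shows the two row shapes the parsing code underneath handles; the key names (`S`, `R`, `C`) mirror what that code reads, while the county names and numbers are fabricated placeholders.

```python
# Illustrative sample in the shape the NV scraper expects (all values are made up).
sample_rows = [
    {'S': []},                          # schema/metadata row: the scraper skips these
    {'C': ['Clark', 25, 0, 1200]},      # no 'R' key: deaths at C[1], cases at C[3]
    {'R': 1, 'C': ['Washoe', 0, 800]},  # 'R' key present: cases move to C[2], deaths default to 0
]

for row in sample_rows:
    if 'S' in row:
        continue
    c = row['C']
    if 'R' in row:
        name, deaths, cases = c[0], 0, int(c[2])
    else:
        name, deaths, cases = c[0], int(c[1]), int(c[3])
    print(name, cases, deaths)
```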
+ +import requests, json, io, datetime, pathlib +import county_report, state_report + +STATE_ABBR = 'NV' +STATE = 'Nevada' + +def scraper(): + + payload = '' + + filepath = pathlib.Path.cwd().joinpath('config', 'nv_post_body.json') + with open(filepath, 'r') as file: + payload = file.read().replace('\n', '') + + # make an HTTP web request to get the data + response = requests.post('https://wabi-us-gov-iowa-api.analysis.usgovcloudapi.net/public/reports/querydata?synchronous=true', + data=payload) + + if response.status_code == requests.codes.ok: + # Success - print to the console that the HTTP request succeeeded + print(' ', STATE_ABBR, ': Downloaded succeeded') + + jsonPayload = json.loads(response.text) + features = jsonPayload['results'][0]['result']['data']['dsr']['DS'][0]['PH'][0]['DM0'] + + counties = [] + + for feature in features: + + if 'S' in feature: + continue + + county_object = feature['C'] + has_R = 'R' in feature + + deaths = 0 + + cases_index = 3 + if has_R: + cases_index = 2 + else: + deaths = int(county_object[1]) + + county_name = county_object[0] + confirmed = int(county_object[cases_index]) + + county = county_report.CountyReport(STATE, county_name, confirmed, deaths, -1, -1, datetime.datetime.now()) + counties.append(county) + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport + + else: + # Fail + print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ', response.status_code) + +def findCounty(county_name, counties): + for county in counties: + if county.county == county_name: + return county diff --git a/src/nyc_scraper.py b/src/nyc_scraper.py new file mode 100644 index 0000000..b622a77 --- /dev/null +++ b/src/nyc_scraper.py @@ -0,0 +1,48 @@ +import requests, io, csv, datetime +import county_report, state_report + +STATE_ABBR = 'NYC' +STATE = 'New York City' + +def scraper(): + # make an HTTP web request to get the data + response = requests.get('https://raw.githubusercontent.com/nychealth/coronavirus-data/master/by-boro.csv') + + if response.status_code == requests.codes.ok: + # Success - print to the console that the HTTP request succeeeded + print(' ', STATE_ABBR, ': Downloaded succeeded') + + csvData = response.text + + # read the in-memory string using the 'csv' module so we can iterate over each row + csvReader = csv.reader(csvData.splitlines(), delimiter=',', quotechar='"') + + # create a list that will contain our county data + counties = [] + + # iterate over every row in the CSV + for row in csvReader: + # skip the header row + if row[0] == 'BOROUGH_GROUP': + continue + + county_name = row[0] + confirmed = int(row[4]) + deaths = int(row[6]) + hospitalizations = int(row[5]) + + county = county_report.CountyReport(STATE, county_name, confirmed, deaths, hospitalizations, -1, datetime.datetime.now()) + counties.append(county) + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport + + else: + # Fail + print(' ', STATE_ABBR, ': ERROR : Download failed - HTTP 
status code ', response.status_code) \ No newline at end of file diff --git a/src/oh_scraper.py b/src/oh_scraper.py new file mode 100644 index 0000000..eec406a --- /dev/null +++ b/src/oh_scraper.py @@ -0,0 +1,69 @@ +import requests, io, csv, datetime +import county_report, state_report + +STATE_ABBR = 'OH' +STATE = 'Ohio' +URL = 'https://coronavirus.ohio.gov/static/COVIDSummaryData.csv' + +def scraper(): + # make an HTTP web request to get the CA CSV file + response = requests.get(URL) + + if response.status_code == requests.codes.ok: + # Success - print to the console that the HTTP request succeeeded + print(' ', STATE_ABBR, ': Downloaded succeeded') + + csvData = response.text + + # read the in-memory string using the 'csv' module so we can iterate over each row + csvReader = csv.reader(csvData.splitlines(), delimiter=',', quotechar='"') + + # create a list that will contain our county data + counties = [] + + # iterate over every row in the CSV + for row in csvReader: + + county_name = row[0] + confirmedStr = row[6] + + # skip the header row + if county_name == 'County' or len(county_name) == 0 or confirmedStr == 'Case Count' or county_name == 'Grand Total': + continue + + confirmed = int(confirmedStr.replace(',', '')) + + deathsStr = row[7] + deaths = int(deathsStr.replace(',', '')) + + hospitalizationsStr = row[8] + hospitalizations = int(hospitalizationsStr.replace(',', '')) + + county = findCounty(county_name, counties) + + if county == None: + county = county_report.CountyReport(STATE, county_name, confirmed, deaths, hospitalizations, -1, datetime.datetime.now()) + counties.append(county) # append the countyReport to our list of counties + else: + county.confirmed += confirmed + county.deaths += deaths + county.hospitalizations += hospitalizations + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport + + else: + # Fail + print(' ', STATE_ABBR, ': ERROR : Download failed - HTTP status code ', response.status_code) + + +def findCounty(county_name, counties): + for county in counties: + if county.county == county_name: + return county \ No newline at end of file diff --git a/src/ok_scraper.py b/src/ok_scraper.py new file mode 100644 index 0000000..24d6172 --- /dev/null +++ b/src/ok_scraper.py @@ -0,0 +1,56 @@ +import requests, io, csv, datetime +import county_report, state_report + +STATE_ABBR = 'OK' +STATE = 'Oklahoma' +URL = 'https://storage.googleapis.com/ok-covid-gcs-public-download/oklahoma_cases_county.csv' + +def scraper(): + # make an HTTP web request to get the OK CSV file + response = requests.get(URL) + + if response.status_code == requests.codes.ok: + # Success - print to the console that the HTTP request succeeeded + print(' ', STATE_ABBR, ': Downloaded succeeded') + + csvData = response.text + + # read the in-memory string using the 'csv' module so we can iterate over each row + csvReader = csv.reader(csvData.splitlines(), delimiter=',', quotechar='"') + + # create a list that will contain our county data + counties = [] + + # iterate over every row in the CSV + for row in csvReader: + + county_name = row[0] + + # skip the header row + if row[0] == 'County': + continue + + if county_name == '' or len(county_name) == 0: + continue + + confirmedStr = row[1] + confirmed = 
int(confirmedStr) + + deathsStr = row[2] + deaths = int(deathsStr) + + county = county_report.CountyReport(STATE, county_name, confirmed, deaths, -1, -1, datetime.datetime.now()) + counties.append(county) # append the countyReport to our list of counties + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport + + else: + # Fail + print(' ', STATE_ABBR, ': ERROR : Download failed - HTTP status code ', response.status_code) diff --git a/src/sc_scraper.py b/src/sc_scraper.py new file mode 100644 index 0000000..1a62c26 --- /dev/null +++ b/src/sc_scraper.py @@ -0,0 +1,71 @@ +import requests, json, io, datetime, pathlib, sys, time, os, csv +from io import StringIO +import county_report, state_report +from selenium import webdriver +from selenium.webdriver.common.keys import Keys +from selenium.webdriver.common.action_chains import ActionChains +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.by import By + +STATE_ABBR = 'SC' +STATE = 'South Carolina' + +URL = 'https://www.arcgis.com/home/webmap/viewer.html?url=https://services2.arcgis.com/XZg2efAbaieYAXmu/ArcGIS/rest/services/COVID19_SharingView/FeatureServer/0&source=sd' + +def get_row_data(table): + for row in table: + yield [td.text for td in row.find_elements_by_xpath(".//td")] + +def scraper(): + counties = [] + + # You will need a WebDriver for Edge. See https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/ + + try: + + browser = webdriver.Edge("msedgedriver.exe") + browser.get(URL) + + time.sleep(1) + + show_table_link = WebDriverWait(browser, 30).until(EC.presence_of_element_located((By.XPATH, '/html/body/div[1]/div[1]/div[3]/div[3]/div/div/div[3]/div[2]/div/div[1]/div[2]/div[1]/div[1]/div/div[2]/span'))) + show_table_link.click() + + time.sleep(1) + + county_div = WebDriverWait(browser, 30).until(EC.presence_of_element_located((By.XPATH, '/html/body/div[1]/div[1]/div[3]/div[5]/div[4]/div[1]/div/div/div/div[1]/div/div/div[2]/div/div[2]/div'))) + + county_div_rows = county_div.find_elements_by_xpath('.//div[@role="row"]') + + # SC puts its county level data into lots of
div elements, with one div per county. Each div
has its own single-row that contains the county data. Thus, we + # have some extra stuff to do to make this work right. + for div_row in county_div_rows: + county_table = div_row.find_element_by_xpath('.//table') + htmlRows = county_table.find_elements_by_xpath(".//tr") + rows = get_row_data(htmlRows) + + for row in rows: + county_name = row[0] + + if county_name == 'Unknown': + continue + + confirmed = int(row[3].replace(',', '')) + deaths = int(row[4].replace(',', '')) + county = county_report.CountyReport(STATE, county_name, confirmed, deaths, -1, -1, datetime.datetime.now()) + counties.append(county) + + except: + print("Unexpected error:", sys.exc_info()[0]) + + browser.quit() + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport \ No newline at end of file diff --git a/src/state_report.py b/src/state_report.py new file mode 100644 index 0000000..f5ef92e --- /dev/null +++ b/src/state_report.py @@ -0,0 +1,23 @@ +class StateReport: + state = '' + stateAbbreviation = '' + counties = [] + confirmed = -1 + deaths = -1 + hospitalizations = -1 + timestamp = '' + + def __init__(self, state, stateAbbreviation, counties, timestamp): + self.state = state + self.stateAbbreviation = stateAbbreviation + self.counties = counties + self.timestamp = timestamp + + self.confirmed = 0 + self.deaths = 0 + self.hospitalizations = 0 + + for county in counties: + self.confirmed += county.confirmed + self.deaths += county.deaths + self.hospitalizations += county.hospitalizations \ No newline at end of file diff --git a/src/tn_scraper.py b/src/tn_scraper.py new file mode 100644 index 0000000..390f5b7 --- /dev/null +++ b/src/tn_scraper.py @@ -0,0 +1,75 @@ +import requests, openpyxl, io, csv, datetime, os, pathlib +import county_report, state_report + +STATE_ABBR = 'TN' +STATE = 'Tennessee' +URL = 'https://www.tn.gov/content/dam/tn/health/documents/cedep/novel-coronavirus/datasets/Public-Dataset-County-New.XLSX' + +def scraper(): + # make an HTTP web request to get the MI XLSX file + response = requests.get(URL) + + if response.status_code == requests.codes.ok: + # Success - print to the console that the HTTP request succeeeded + print(' ', STATE_ABBR, ': Downloaded succeeded') + + temppath = 'temp' + if not os.path.exists(temppath): + os.makedirs(temppath) + + tempfilename = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S") + '_temp_' + STATE_ABBR + '.xlsx' + tempfilepath = pathlib.Path.cwd().joinpath('temp', tempfilename) + + with open(tempfilepath, "wb") as file: + file.write(response.content) + + wb = openpyxl.load_workbook(filename=tempfilepath) + + sheet = wb.worksheets[0] + max_rows = sheet.max_row + + counties = [] + countyDictionary = {} + + i = max_rows + + while i > 2: + rowCount = str(i) + county_name = sheet['B' + rowCount].value + + county = findCounty(county_name, countyDictionary) + + if county == None: + + confirmed = int(sheet['E' + rowCount].value) + deaths = int(sheet['P' + rowCount].value) + + county = county_report.CountyReport(STATE, county_name, confirmed, deaths, -1, -1, datetime.datetime.now()) + counties.append(county) # append the countyReport to our list of counties + countyDictionary[county_name] = county + + i = i - 1 + + # since the above algorithm outputs the counties in reverse-ABC 
order, let's reverse that so they're in ABC order... + counties = list(reversed(counties)) + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport + + + else: + # Fail + print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ', response.status_code) + + +def findCounty(county_name, countyDictionary): + if county_name in countyDictionary: + return countyDictionary[county_name] + else: + return None \ No newline at end of file diff --git a/src/tx_scraper.py b/src/tx_scraper.py new file mode 100644 index 0000000..637bc77 --- /dev/null +++ b/src/tx_scraper.py @@ -0,0 +1,59 @@ +import requests, openpyxl, io, csv, datetime, os, pathlib +import county_report, state_report + +STATE_ABBR = 'TX' +STATE = 'Texas' +URL = 'https://dshs.texas.gov/coronavirus/TexasCOVID19DailyCountyFatalityCountData.xlsx' + +def scraper(): + # make an HTTP web request to get the MI XLSX file + response = requests.get(URL) + + counties = [] + + if response.status_code == requests.codes.ok: + # Success - print to the console that the HTTP request succeeeded + print(' ', STATE_ABBR, ': Downloaded succeeded') + + temppath = 'temp' + if not os.path.exists(temppath): + os.makedirs(temppath) + + tempfilename = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S") + '_temp_' + STATE_ABBR + '.xlsx' + tempfilepath = pathlib.Path.cwd().joinpath('temp', tempfilename) + + with open(tempfilepath, "wb") as file: + file.write(response.content) + + wb = openpyxl.load_workbook(filename=tempfilepath) + + sheet = wb.worksheets[0] + max_rows = sheet.max_row + max_cols = sheet.max_column + + for i in range(4, max_rows): + rowCount = str(i) + + county_name = sheet['A' + rowCount].value + + if county_name == 'Unknown' or county_name == 'Total' or len(county_name) == 0: + break + + confirmed = sheet.cell(row=i, column=max_cols).value + + county = county_report.CountyReport(STATE, county_name, (int)(confirmed), -1, -1, -1, datetime.datetime.now()) + counties.append(county) + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport + + + else: + # Fail + print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ', response.status_code) \ No newline at end of file diff --git a/src/va_scraper.py b/src/va_scraper.py new file mode 100644 index 0000000..4331e39 --- /dev/null +++ b/src/va_scraper.py @@ -0,0 +1,52 @@ +import requests, json, io, datetime +import county_report, state_report + +STATE_ABBR = 'VA' +STATE = 'Virginia' +URL = 'https://data.virginia.gov/resource/bre9-aqqr.json' + +def scraper(): + # make an HTTP web request to get the VA Json file + response = requests.get(URL) + + if response.status_code == requests.codes.ok: + # Success - print to the console that the HTTP request succeeeded + print(' ', STATE_ABBR, ': Downloaded succeeded') + + counties = [] + jsonPayload = json.loads(response.text) + + for item in jsonPayload: + + county_name = item['locality'] + + if findCounty(county_name, counties) == None: + confirmedStr = 
item['total_cases'] + confirmed = int(confirmedStr) + + deathsStr = item['deaths'] + deaths = int(deathsStr) + + hospitalizationsStr = item['hospitalizations'] + hospitalizations = int(hospitalizationsStr) + + county = county_report.CountyReport(STATE, county_name, confirmed, deaths, hospitalizations, -1, datetime.datetime.now()) + counties.append(county) + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport + + else: + # Fail + print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ', response.status_code) + +def findCounty(county_name, counties): + for county in counties: + if county.county == county_name: + return county diff --git a/src/vt_scraper.py b/src/vt_scraper.py new file mode 100644 index 0000000..58c82fa --- /dev/null +++ b/src/vt_scraper.py @@ -0,0 +1,62 @@ +import requests, json, io, datetime +import county_report, state_report + +STATE_ABBR = 'VT' +STATE = 'Vermont' +URL = 'https://services1.arcgis.com/BkFxaEFNwHqX3tAw/arcgis/rest/services/VIEW_EPI_CountyDailyCount_GEO_PUBLIC/FeatureServer/0/query?where=1%3D1&outFields=CNTY,CNTYNAME,Label,C_Total,D_Total,date&outSR=4326&f=json' + +def scraper(): + # make an HTTP web request to get the data + response = requests.get(URL) + + if response.status_code == requests.codes.ok: + # Success - print to the console that the HTTP request succeeeded + print(' ', STATE_ABBR, ': Downloaded succeeded') + + jsonPayload = json.loads(response.text) + features = jsonPayload['features'] + + counties = [] + + for feature in features: + attribute = feature['attributes'] + + county_name = attribute['Label'] + + if county_name == None or len(county_name) == 0 or county_name == '' or county_name == 'Vermont': + continue + + confirmed = int(attribute['C_Total']) + + deaths = 0 + if attribute['D_Total'] != None: + deaths = int(attribute['D_Total']) + + county = county_report.CountyReport(STATE, county_name, confirmed, deaths, -1, -1, datetime.datetime.now()) + + existing_county = findCounty(county_name, counties) + if existing_county == None: + counties.append(county) + elif existing_county.confirmed < county.confirmed or existing_county.deaths < county.deaths or existing_county.hospitalizations < county.hospitalizations: + existing_county.confirmed = county.confirmed + existing_county.deaths = county.deaths + existing_county.hospitalizations = county.hospitalizations + + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport + + else: + # Fail + print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ', response.status_code) + +def findCounty(county_name, counties): + for county in counties: + if county.county == county_name: + return county diff --git a/src/wi_scraper.py b/src/wi_scraper.py new file mode 100644 index 0000000..55fd4c0 --- /dev/null +++ b/src/wi_scraper.py @@ -0,0 +1,67 @@ +import requests, json, io, datetime +import county_report, state_report + +STATE_ABBR = 'WI' +STATE = 'Wisconsin' +URL = 
'https://dhsgis.wi.gov/server/rest/services/DHS_COVID19/COVID19_WI/FeatureServer/10/query?where=1%3D1&outFields=OBJECTID,NAME,POSITIVE,HOSP_YES,DEATHS,DATE&outSR=4326&f=json' + +def scraper(): + # make an HTTP web request to get the data + response = requests.get(URL) + + if response.status_code == requests.codes.ok: + # Success - print to the console that the HTTP request succeeeded + print(' ', STATE_ABBR, ': Downloaded succeeded') + + jsonPayload = json.loads(response.text) + features = jsonPayload['features'] + + counties = [] + + for feature in features: + attribute = feature['attributes'] + + county_name = attribute['NAME'] + + if county_name == None or len(county_name) == 0 or county_name == '' or county_name == 'WI': + continue + + confirmed = int(attribute['POSITIVE']) + + hospitalizations = 0 + if attribute['HOSP_YES'] != None: + hospitalizations = int(attribute['HOSP_YES']) + + deaths = 0 + if attribute['DEATHS'] != None: + deaths = int(attribute['DEATHS']) + + county = county_report.CountyReport(STATE, county_name, confirmed, deaths, hospitalizations, -1, datetime.datetime.now()) + + existing_county = findCounty(county_name, counties) + if existing_county == None: + counties.append(county) + elif existing_county.confirmed < county.confirmed or existing_county.deaths < county.deaths or existing_county.hospitalizations < county.hospitalizations: + existing_county.confirmed = county.confirmed + existing_county.deaths = county.deaths + existing_county.hospitalizations = county.hospitalizations + + + # print the number of counties we processed + print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK') + + # build the state-level report object that will include all of the counties + stateReport = state_report.StateReport(STATE, STATE_ABBR, counties, datetime.datetime.now()) + + # return the state-level report + return stateReport + + + else: + # Fail + print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ', response.status_code) + +def findCounty(county_name, counties): + for county in counties: + if county.county == county_name: + return county
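Several of the scrapers above (Missouri, Nebraska, Vermont, Wisconsin) follow the same ArcGIS FeatureServer pattern: issue an `f=json` query, walk `features[*]['attributes']`, and build `CountyReport` objects. Below is a minimal consolidation sketch, not part of the patch itself: it assumes the `county_report.CountyReport` constructor used throughout these files, and the attribute-name parameters are supplied by the caller because every state labels its fields differently. Per-state quirks (Nebraska's `Province_State` filter, the keep-the-larger-value de-duplication in the Vermont and Wisconsin scrapers) would still need to be layered on top.

```python
import datetime, json, requests
import county_report

def scrape_arcgis_counties(state, state_abbr, url, name_field, cases_field, deaths_field):
    """Download an ArcGIS FeatureServer query (f=json) and build CountyReport objects.

    Consolidation sketch of the pattern repeated in the MO/NE/VT/WI scrapers; callers
    pass the per-state attribute names since each feed labels them differently.
    """
    response = requests.get(url)
    if response.status_code != requests.codes.ok:
        print(' ', state_abbr, ': ERROR : Web download failed - HTTP status code ', response.status_code)
        return []

    counties = []
    for feature in json.loads(response.text)['features']:
        attrs = feature['attributes']
        name = attrs.get(name_field)
        if not name:
            continue  # skip rows with no county name
        confirmed = int(attrs.get(cases_field) or 0)
        deaths = int(attrs.get(deaths_field) or 0)
        counties.append(county_report.CountyReport(
            state, name, confirmed, deaths, -1, -1, datetime.datetime.now()))
    return counties
```

For example, the Wisconsin feed could then be read with `scrape_arcgis_counties('Wisconsin', 'WI', URL, 'NAME', 'POSITIVE', 'DEATHS')`, leaving only the statewide-row skip and the de-duplication logic in `wi_scraper.py`.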