Skip to content

Commit

Permalink
Fix recgov format updates (#14)
Browse files Browse the repository at this point in the history
The recreation.gov site rewrite was a little more complicated than expected, so
these changes took a little longer than planned, and some code that is not
directly relevant was touched to make things simpler. Overview of changes:

- campground.py edited to remove the /availability path from the
  url field -- it's no longer needed as recreation.gov now
  displays the availability table on the main page
- utils.py -- some functions like the graceful error handling and
  logging setup can be moved into a utils file so they can be used
  in modules as well as in the main daemon -- good for dev
- daemon.py -- edit to use the utils file for logging/etc.
- scrape_availability.py
    - improved how we enter/validate the start/end dates
    - improved detecting the results table-loading element
    - enabled closing recreation.gov tutorial window if it exists
- fixed starting up chromium on raspbian (currently the only thing
  we support)
  • Loading branch information
rmjacobson authored Apr 23, 2022
1 parent 60f1dbe commit 5d08378
Show file tree
Hide file tree
Showing 4 changed files with 167 additions and 62 deletions.
10 changes: 5 additions & 5 deletions campground.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ class Campground():
Taken from https://github.com/CCInCharge/campsite-checker, has been useful for debug.
"""
def __init__(self, name="N/A", facility_id=None):
self.name = name # name of campground
self.id = facility_id # facility ID of campground
self.url = f"{RECGOV_BASE_URL}/{facility_id}/availability" # recreation.gov URL for campground
self.available = False # initialize to unavailable
self.error_count = 0 # initialize parsing error count to 0
self.name = name # name of campground
self.id = facility_id # facility ID of campground
self.url = f"{RECGOV_BASE_URL}/{facility_id}" # recreation.gov URL for campground
self.available = False # initialize to unavailable
self.error_count = 0 # initialize parsing error count to 0
# self.campsites = {} # TODO: develop way of storing available specific campsites

def pretty(self):
Expand Down
32 changes: 4 additions & 28 deletions daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,9 @@
- https://peter.bourgon.org/blog/2017/02/21/metrics-tracing-and-logging.html
"""

import sys
from signal import signal, SIGINT
import json
import logging
from logging.handlers import TimedRotatingFileHandler
import argparse
import smtplib
import ssl
Expand All @@ -49,39 +47,15 @@
from scrape_availability import create_selenium_driver, scrape_campground
from ridb_interface import get_facilities_from_ridb
from campground import Campground, CampgroundList
from utils import exit_gracefully, setup_logging

rotating_handler = handler = TimedRotatingFileHandler("logs/recgov.log", when="d", interval=1, backupCount=5)
rotating_handler.suffix = "%Y-%m-%d"
logging.basicConfig(
handlers=[rotating_handler],
level=logging.INFO,
format="[%(asctime)s] %(filename)s:%(lineno)d [%(name)s]%(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# set in ~/.virtualenvs/recgov_daemon/bin/postactivate
GMAIL_USER = os.environ.get("gmail_user")
GMAIL_PASSWORD = os.environ.get("gmail_password")
RETRY_WAIT = 300

def exit_gracefully(signal_received, frame, close_this_driver: WebDriver=None):
    """
    SIGINT handler: quit the Selenium WebDriver (when one is supplied) and end the process.
    Ref: https://www.devdungeon.com/content/python-catch-sigint-ctrl-c
         https://docs.python.org/3/library/signal.html
    :param signal_received: signal number handed over by the signal module (unused here)
    :param frame: current stack frame handed over by the signal module (unused here)
    :param close_this_driver: Selenium WebDriver to shut down before exiting, or None
    :returns: N/A -- never returns normally, sys.exit(0) raises SystemExit
    """
    logger.info("Received CTRL-C/SIGNINT or daemon completed; exiting gracefully/closing WebDriver if initialized.")
    if close_this_driver is None:
        # nothing to clean up, leave right away
        sys.exit(0)
    # quit() (not close()) so that no stray chrome processes are left behind
    # https://stackoverflow.com/questions/15067107/difference-between-webdriver-dispose-close-and-quit
    close_this_driver.quit()
    sys.exit(0)

def send_email_alert(available_campgrounds: CampgroundList):
"""
Send email alert to email address provided by argparse, from email address (and password)
Expand Down Expand Up @@ -171,7 +145,8 @@ def compare_availability(selenium_driver: WebDriver, campground_list: Campground
logger.info("%s (%s) is not available, trying again in %s seconds",
campground.name, campground.id, RETRY_WAIT)

# if campground parsing has errored more than 5 times in a row, remove it from the CampgroundList
# if campground parsing has errored more than 5 times in a row
# remove it from the CampgroundList so we can stop checking it and failing
if campground.error_count > 5:
err_msg = f"Campground errored more than 5 times in a row, removing it from list:\n{campground.pretty()}"
logger.error(err_msg)
Expand Down Expand Up @@ -247,4 +222,5 @@ def run():
parser.add_argument("--campground_ids", type=parse_id_args,
help="Comma-separated list of campground facility IDs you want to check (e.g. `233116,231962`).")
args = parser.parse_args()
setup_logging()
run()
124 changes: 95 additions & 29 deletions scrape_availability.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,27 +7,39 @@

import logging
import traceback
from signal import signal, SIGINT
from datetime import datetime, timedelta
from time import sleep
from typing import Tuple
from pandas.core.frame import DataFrame
from pyvirtualdisplay import Display
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from webdriver_manager.utils import ChromeType
from campground import Campground
from utils import exit_gracefully, setup_logging

logger = logging.getLogger(__name__)

# tag names needed for html interaction/parsing found via manual inspection of
# recreation.gov -- DO NOT CHANGE unless recreation.gov changes its layout!
INPUT_TAG_NAME = "single-date-picker-1"
START_DATE_INPUT_TAG_NAME = "campground-start-date-calendar"
START_DATE_ERROR_TAG_NAME = "campground-start-date-calendar-error"
END_DATE_INPUT_TAG_NAME = "campground-end-date-calendar"
END_DATE_ERROR_TAG_NAME = "campground-end-date-calendar-error"
AVAILABILITY_TABLE_TAG_NAME = "availability-table"
TABLE_LOADING_TAG_CLASS = "rec-table-overlay"
TABLE_LOADING_TAG_CLASS = "rec-table-overlay-loading"
CAMP_LOCATION_NAME_ICON = "camp-location-name--icon"
AVAILABILITY_TABLE_REFRESH_XPATH = """//*[@id="page-body"]/div/div[1]/div[1]/div[3]/div[1]/div[1]/div/div/button[1]"""
TUTORIAL_CLOSE_BUTTON_XPATH = "/html/body/div[11]/div/div/div/div/div/div/div/button"
PAGE_LOAD_WAIT = 60

def parse_html_table(table: BeautifulSoup) -> DataFrame:
Expand Down Expand Up @@ -87,18 +99,23 @@ def all_dates_available(df: DataFrame, start_date: datetime, num_days: int) -> b

return at_least_one_available

def create_selenium_driver() -> WebDriver:
def create_selenium_driver(headless: bool=True) -> WebDriver:
"""
Initialize Selenium WebDriver object and return it to the caller. Do this in a separate
function to allow driver re-use across rounds of scraping. Note: the remote debugging port
option seems to be required for raspberry pi operation: https://stackoverflow.com/a/56638103
:param headless: create GUI for WebDriver? Testing usage passes in False, defaults to True
:returns: Selenium WebDriver object
"""
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--remote-debugging-port=9222")
driver = webdriver.Chrome(options=chrome_options)
options = Options()
options.add_argument("enable-automation") # necessary for driving Chromium actions
if headless:
options.add_argument("--headless")
options.add_argument("--remote-debugging-port=9222") # necessary for driving Chromium actions
options.binary_location = "/usr/bin/chromium-browser" # browser is Chromium instead of Chrome
driver_path = "/usr/lib/chromium-browser/chromedriver" # we use custom chromedriver for raspberry
driver = webdriver.Chrome(options=options, service=Service(driver_path))
driver.implicitly_wait(PAGE_LOAD_WAIT)
return driver

Expand All @@ -119,6 +136,39 @@ def wait_for_page_element_load(driver: WebDriver, elem_id: str):
logger.exception("Loading %s element on page took too much time; skipping this load.", elem_id)
return None

def enter_date_input(date: datetime, date_input):
    """
    Type a date into one of the recreation.gov date boxes and submit it.
    As of 2022 the site needs the start and the end date entered separately before the
    table refreshes, so the key sequence lives here and is shared by both boxes instead
    of being duplicated.
    :param date: date to enter; it is typed in MM/DD/YYYY form
    :param date_input: input element for the date box (anything exposing send_keys)
    :returns: N/A
    """
    typed_date = date.strftime("%m/%d/%Y")
    date_input.send_keys(typed_date)
    # First walk the cursor back to the start of the date we just typed, then
    # backspace over the default date that was already sitting in the box.
    for key in (Keys.ARROW_LEFT, Keys.BACKSPACE):
        for _ in range(10):
            date_input.send_keys(key)
    date_input.send_keys(Keys.RETURN)

def is_bad_date(driver: WebDriver, element_id) -> Tuple[bool, str]:
    """
    Entering the date improperly causes dynamic text to appear right below the input box
    indicating whether the date is invalid (formatting issue) or unavailable. Read this
    text and return the appropriate message to the caller.
    NOTE: not currently used, but good to keep here in case the site changes and we need
    this pattern again.
    :param driver: Selenium WebDriver with the campground page loaded
    :param element_id: HTML id of the error-message element belonging to a date input
    :returns: (True, reason) when an error message is shown, where reason is "not available",
        "not valid" or "new error" for unrecognized text; (False, "all good") when the
        element is missing or carries no text
    """
    # find_element raises when nothing matches instead of returning None, which made the
    # "all good" path unreachable; find_elements gives back an empty list in that case.
    # NOTE(review): with an implicit wait configured on the driver this lookup may block
    # for the full wait period when the element is absent -- confirm that is acceptable.
    matches = driver.find_elements(by=By.ID, value=element_id)
    if not matches:
        return (False, "all good")

    error_text = matches[0].text
    logger.info(error_text)
    # An error element without any text has nothing to report, so it is not an error.
    # NOTE(review): assumes the site leaves this element empty when the date is fine -- confirm.
    if not error_text.strip():
        return (False, "all good")

    unavailable_str = "not available"
    invalid_str = "not valid"
    if unavailable_str in error_text:
        return (True, unavailable_str)
    if invalid_str in error_text:
        return (True, invalid_str)
    # some text is shown but it is not one of the messages we know about
    return (True, "new error")

def scrape_campground(driver: WebDriver, campground: Campground, start_date: datetime, num_days: int) -> bool:
"""
Use Selenium WebDriver to load page, input desired start date, identify availability table
Expand Down Expand Up @@ -147,27 +197,38 @@ def scrape_campground(driver: WebDriver, campground: Campground, start_date: dat
try:
logger.debug("\tGetting campground.url (%s) with driver", campground.url)
driver.get(campground.url)

try: # check for tutorial window and close it if it appears, otherwise table doesn't load correctly
tutorial_close_button = driver.find_element(by=By.XPATH, value=TUTORIAL_CLOSE_BUTTON_XPATH)
logger.debug("\tClosing tutorial window")
tutorial_close_button.click()
except NoSuchElementException:
logger.debug("\tNo tutorial this time")
pass # we don't actually care if tutorial didn't appear, just move on

logger.debug("\tFinding input box tag")
date_input = wait_for_page_element_load(driver, INPUT_TAG_NAME)
if date_input is None: # if wait for page element load fails -> abandon this check immediately
start_date_input = wait_for_page_element_load(driver, START_DATE_INPUT_TAG_NAME)
if start_date_input is None: # if wait for page element load fails -> abandon this check immediately
return False
# date_input = driver.find_element_by_id(INPUT_TAG_NAME)
logger.debug("\tSending new date with send_keys")
date_input.send_keys(start_date.strftime("%m/%d/%Y"))
for _ in range(10): # backtrack to start of our input date
date_input.send_keys(Keys.ARROW_LEFT)
for _ in range(10): # delete default start date
date_input.send_keys(Keys.BACKSPACE)
date_input.send_keys(Keys.RETURN)
# manually click refresh table button to ensure valid table data
# (if you don't do this every cell might be filled with 'x')
refresh_table = driver.find_elements_by_xpath(AVAILABILITY_TABLE_REFRESH_XPATH)[0]
refresh_table.click()
logger.debug("\tInputting start/end dates with send_keys")
enter_date_input(start_date, start_date_input)

end_date = start_date + timedelta(days=num_days)
end_date_input = wait_for_page_element_load(driver, END_DATE_INPUT_TAG_NAME)
if end_date_input is None: # if wait for page element load fails -> abandon this check immediately
return False
enter_date_input(end_date, end_date_input)

# wait for table refresh/loading spinning wheel to disappear, otherwise table contents are gibberish/NaN
# https://stackoverflow.com/a/29084080 -- wait for element to *not* be visible
loading_tag = driver.find_element_by_class_name(TABLE_LOADING_TAG_CLASS)
WebDriverWait(driver, PAGE_LOAD_WAIT).until(EC.invisibility_of_element(loading_tag))
# https://stackoverflow.com/a/51884408 -- wait for element not to be visible even though it
# may already be invisible
# https://stackoverflow.com/a/45420111 -- temporarily kill implicit waits to make explicit wait work correctly
driver.implicitly_wait(0)
WebDriverWait(driver, PAGE_LOAD_WAIT).until(EC.invisibility_of_element_located(
(By.CLASS_NAME, TABLE_LOADING_TAG_CLASS)))
driver.implicitly_wait(PAGE_LOAD_WAIT)

logger.debug("\tFinding availability table tag")
availability_table = wait_for_page_element_load(driver, AVAILABILITY_TABLE_TAG_NAME)
if availability_table is None: # if wait for page element load fails -> abandon this check immediately
Expand All @@ -188,17 +249,22 @@ def run():
"""
Runs scrape availability module for specific values, should be used for debugging only.
"""
signal(SIGINT, exit_gracefully) # add custom handler for SIGINT/CTRL-C
# kirk_creek = "https://www.recreation.gov/camping/campgrounds/233116/availability"
# kirk_start_date_str = "09/17/2021"
mcgill = "https://www.recreation.gov/camping/campgrounds/231962/availability"
mcgill_start_date_str = "05/31/2021"
# mcgill = "https://www.recreation.gov/camping/campgrounds/231962/availability"
mcgill_start_date = datetime.strptime("05/31/2022", "%m/%d/%Y")
num_days = 2
mcgill_campground = Campground(name="McGill", facility_id="231962")

driver = create_selenium_driver()
if scrape_campground(driver, mcgill, mcgill_start_date_str, num_days):
driver = create_selenium_driver(headless=True)
if scrape_campground(driver, mcgill_campground, mcgill_start_date, num_days):
logger.info("WE HAVE SOMETHING AVAILABLE!")
else:
logger.info("sad")
sleep(10000000)
driver.quit()

if __name__ == "__main__":
setup_logging()
run()
63 changes: 63 additions & 0 deletions utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
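"""
utils.py
Helpers shared by the daemon and the individual modules so each can be run on its own:
graceful SIGINT/exit handling, network throttling for testing, and logging setup.
"""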

import sys
import logging
from logging.handlers import TimedRotatingFileHandler
from selenium.webdriver.chrome.webdriver import WebDriver

logger = logging.getLogger(__name__)

def exit_gracefully(signal_received, frame, close_this_driver: WebDriver=None):
    """
    Handler for SIGINT that shuts the webdriver down carefully (if there is one) and exits.
    Ref: https://www.devdungeon.com/content/python-catch-sigint-ctrl-c
         https://docs.python.org/3/library/signal.html
    :param signal_received: signal number supplied by the signal module (not used)
    :param frame: current stack frame supplied by the signal module (not used)
    :param close_this_driver: Selenium WebDriver to quit before exiting; None means no driver
    :returns: N/A -- always ends in sys.exit(0), i.e. raises SystemExit
    """
    logger.info("Received CTRL-C/SIGNINT or daemon completed; exiting gracefully/closing WebDriver if initialized.")
    driver_initialized = close_this_driver is not None
    if driver_initialized:
        # quit() instead of close(): close() leaves lots of chrome processes lying around
        # https://stackoverflow.com/questions/15067107/difference-between-webdriver-dispose-close-and-quit
        close_this_driver.quit()
        logger.info("WebDriver Quit Successfully")
    sys.exit(0)

def set_low_network_quality(driver: WebDriver, *, latency_ms: int = 5, throughput_kb: int = 500) -> None:
    """
    Set WebDriver to simulate low network quality -- by default 5ms additional latency and
    only 500kb throughput. Mostly used when testing the load actions of the availability
    table; sometimes there is a web element that shows up and spins for a while while the
    wait happens and it can be difficult to find the name for that without the appropriate
    delays set.
    Should never be used in production code, but left here for future testing needs.
    :param driver: WebDriver to throttle; must provide set_network_conditions
    :param latency_ms: additional latency to add, in milliseconds
    :param throughput_kb: cap applied to both download and upload throughput, in kb
    :returns: N/A
    """
    # The value 500 used to be passed through unchanged. Selenium's documented example
    # passes 500 * 1024 for this setting, so convert kb to the raw value here.
    # NOTE(review): the underlying DevTools emulation is understood to take bytes/second -- confirm.
    throughput = throughput_kb * 1024
    driver.set_network_conditions(
        offline=False,
        latency=latency_ms,
        download_throughput=throughput,
        upload_throughput=throughput)

def setup_logging(log_path: str = "logs/recgov.log", level: int = logging.INFO) -> None:
    """
    Set logging for whole project. This function is called from wherever the program
    is invoked. Could be from different places given development of different components
    separately.
    :param log_path: file the rotating handler writes to; missing parent directories are created
    :param level: logging level passed to logging.basicConfig
    :returns: N/A
    """
    from pathlib import Path  # local import keeps this block self-contained

    # The handler opens the log file as soon as it is constructed, which fails with
    # FileNotFoundError on a fresh checkout where the log directory does not exist yet.
    Path(log_path).parent.mkdir(parents=True, exist_ok=True)

    # rotate once per day and keep the five most recent files
    rotating_handler = TimedRotatingFileHandler(log_path, when="d", interval=1, backupCount=5)
    rotating_handler.suffix = "%Y-%m-%d"
    logging.basicConfig(
        handlers=[rotating_handler],
        level=level,
        format="[%(asctime)s] %(filename)s:%(lineno)d [%(name)s]%(levelname)s - %(message)s",
    )

0 comments on commit 5d08378

Please sign in to comment.