Fix recgov format updates (#14)

The recreation.gov site rewrite was a little more complicated than expected, these changes took a little longer than expected, and some code that's not directly relevant was touched to make things simpler. Overview of changes: - campground.py edited to remove the /availability path from the url field -- it's no longer needed as recreation.gov now displays the availability table on the main page - utils.py -- some functions like the graceful error handling and logging setup can be moved into a utils file so they can be used in modules as well as in the main daemon -- good for dev - daemon.py -- edit to use the utils file for logging/etc. - scrape_availability.py - improved how we enter/validate the start/end dates - improved detecting the results table-loading element - enabled closing recreation.gov tutorial window if it exists - fixed starting up chromium on raspbian (currently the only thing we support)
rmjacobson · Apr 23, 2022 · 5d08378 · 5d08378
1 parent 60f1dbe
commit 5d08378
Show file tree

Hide file tree

Showing 4 changed files with 167 additions and 62 deletions.
diff --git a/campground.py b/campground.py
@@ -12,11 +12,11 @@ class Campground():
     Taken from https://github.com/CCInCharge/campsite-checker, has been useful for debug.
     """
     def __init__(self, name="N/A", facility_id=None):
-        self.name = name                                            # name of campground
-        self.id = facility_id                                       # facility ID of campground
-        self.url = f"{RECGOV_BASE_URL}/{facility_id}/availability"  # recreation.gov URL for campground
-        self.available = False                                      # initialize to unavailable
-        self.error_count = 0                                        # initialize parsing error count to 0
+        self.name = name                                # name of campground
+        self.id = facility_id                           # facility ID of campground
+        self.url = f"{RECGOV_BASE_URL}/{facility_id}"   # recreation.gov URL for campground
+        self.available = False                          # initialize to unavailable
+        self.error_count = 0                            # initialize parsing error count to 0
         # self.campsites = {}     # TODO: develop way of storing available specific campsites
 
     def pretty(self):

diff --git a/daemon.py b/daemon.py
@@ -32,11 +32,9 @@
   - https://peter.bourgon.org/blog/2017/02/21/metrics-tracing-and-logging.html
 """
 
-import sys
 from signal import signal, SIGINT
 import json
 import logging
-from logging.handlers import TimedRotatingFileHandler
 import argparse
 import smtplib
 import ssl
@@ -49,39 +47,15 @@
 from scrape_availability import create_selenium_driver, scrape_campground
 from ridb_interface import get_facilities_from_ridb
 from campground import Campground, CampgroundList
+from utils import exit_gracefully, setup_logging
 
-rotating_handler = handler = TimedRotatingFileHandler("logs/recgov.log", when="d", interval=1,  backupCount=5)
-rotating_handler.suffix = "%Y-%m-%d"
-logging.basicConfig(
-    handlers=[rotating_handler],
-    level=logging.INFO,
-    format="[%(asctime)s] %(filename)s:%(lineno)d [%(name)s]%(levelname)s - %(message)s",
-)
 logger = logging.getLogger(__name__)
 
 # set in ~/.virtualenvs/recgov_daemon/bin/postactivate
 GMAIL_USER = os.environ.get("gmail_user")
 GMAIL_PASSWORD = os.environ.get("gmail_password")
 RETRY_WAIT = 300
 
-def exit_gracefully(signal_received, frame, close_this_driver: WebDriver=None):
-    """
-    Handler for SIGINT that will close webdriver carefully if necessary.
-    Ref: https://www.devdungeon.com/content/python-catch-sigint-ctrl-c
-         https://docs.python.org/3/library/signal.html
-
-    :param signal_received: signal object received by handler
-    :param frame: actually have no idea what this is and we never use it...
-    :param driver: Selenium WebDriver to close before exiting
-    :returns: N/A
-    """
-    logger.info("Received CTRL-C/SIGNINT or daemon completed; exiting gracefully/closing WebDriver if initialized.")
-    if close_this_driver is not None:
-        # use quit instead of close to avoid tons of leftover chrome processes
-        # https://stackoverflow.com/questions/15067107/difference-between-webdriver-dispose-close-and-quit
-        close_this_driver.quit()
-    sys.exit(0)
-
 def send_email_alert(available_campgrounds: CampgroundList):
     """
     Send email alert to email address provided by argparse, from email address (and password)
@@ -171,7 +145,8 @@ def compare_availability(selenium_driver: WebDriver, campground_list: Campground
             logger.info("%s (%s) is not available, trying again in %s seconds",
                 campground.name, campground.id, RETRY_WAIT)
 
-        # if campground parsing has errored more than 5 times in a row, remove it from the CampgroundList
+        # if campground parsing has errored more than 5 times in a row
+        # remove it from the CampgroundList so we can stop checking it and failing
         if campground.error_count > 5:
             err_msg = f"Campground errored more than 5 times in a row, removing it from list:\n{campground.pretty()}"
             logger.error(err_msg)
@@ -247,4 +222,5 @@ def run():
     parser.add_argument("--campground_ids", type=parse_id_args,
         help="Comma-separated list of campground facility IDs you want to check (e.g. `233116,231962`).")
     args = parser.parse_args()
+    setup_logging()
     run()
diff --git a/scrape_availability.py b/scrape_availability.py
@@ -7,27 +7,39 @@
 
 import logging
 import traceback
+from signal import signal, SIGINT
 from datetime import datetime, timedelta
+from time import sleep
+from typing import Tuple
 from pandas.core.frame import DataFrame
+from pyvirtualdisplay import Display
 from bs4 import BeautifulSoup
 from selenium import webdriver
 from selenium.webdriver.chrome.webdriver import WebDriver
 from selenium.webdriver.common.keys import Keys
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.common.by import By
-from selenium.common.exceptions import TimeoutException
+from selenium.webdriver.chrome.service import Service
+from selenium.common.exceptions import NoSuchElementException, TimeoutException
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.chrome.options import Options
+from webdriver_manager.utils import ChromeType
 from campground import Campground
+from utils import exit_gracefully, setup_logging
 
 logger = logging.getLogger(__name__)
 
 # tag names needed for html interaction/parsing found via manual inspection of
 # recreation.gov -- DO NOT CHANGE unless recreation.gov changes its layout!
-INPUT_TAG_NAME = "single-date-picker-1"
+START_DATE_INPUT_TAG_NAME = "campground-start-date-calendar"
+START_DATE_ERROR_TAG_NAME = "campground-start-date-calendar-error"
+END_DATE_INPUT_TAG_NAME = "campground-end-date-calendar"
+END_DATE_ERROR_TAG_NAME = "campground-end-date-calendar-error"
 AVAILABILITY_TABLE_TAG_NAME = "availability-table"
-TABLE_LOADING_TAG_CLASS = "rec-table-overlay"
+TABLE_LOADING_TAG_CLASS = "rec-table-overlay-loading"
 CAMP_LOCATION_NAME_ICON = "camp-location-name--icon"
-AVAILABILITY_TABLE_REFRESH_XPATH = """//*[@id="page-body"]/div/div[1]/div[1]/div[3]/div[1]/div[1]/div/div/button[1]"""
+TUTORIAL_CLOSE_BUTTON_XPATH = "/html/body/div[11]/div/div/div/div/div/div/div/button"
 PAGE_LOAD_WAIT = 60
 
 def parse_html_table(table: BeautifulSoup) -> DataFrame:
@@ -87,18 +99,23 @@ def all_dates_available(df: DataFrame, start_date: datetime, num_days: int) -> b
 
     return at_least_one_available
 
-def create_selenium_driver() -> WebDriver:
+def create_selenium_driver(headless: bool=True) -> WebDriver:
     """
     Initialize Selenium WebDriver object and return it to the caller. Do this in a separate
     function to allow driver re-use across rounds of scraping.  Note: the remote debugging port
     option seems to be required for raspberry pi operation: https://stackoverflow.com/a/56638103
 
+    :param headless: when True (the default) pass --headless so no browser GUI is shown; pass False to watch it
     :returns: Selenium WebDriver object
     """
-    chrome_options = webdriver.ChromeOptions()
-    chrome_options.add_argument("--headless")
-    chrome_options.add_argument("--remote-debugging-port=9222")
-    driver = webdriver.Chrome(options=chrome_options)
+    options = Options()
+    options.add_argument("enable-automation")               # necessary for driving Chromium actions
+    if headless:
+        options.add_argument("--headless")
+    options.add_argument("--remote-debugging-port=9222")    # seems required on raspberry pi (see docstring)
+    options.binary_location = "/usr/bin/chromium-browser"   # browser is Chromium instead of Chrome
+    driver_path = "/usr/lib/chromium-browser/chromedriver"  # we use custom chromedriver for raspberry
+    driver = webdriver.Chrome(options=options, service=Service(driver_path))
     driver.implicitly_wait(PAGE_LOAD_WAIT)
     return driver
 
@@ -119,6 +136,39 @@ def wait_for_page_element_load(driver: WebDriver, elem_id: str):
         logger.exception("Loading %s element on page took too much time; skipping this load.", elem_id)
         return None
 
+def enter_date_input(date: datetime, date_input):
+    """
+    Type a date into one of recreation.gov's date boxes and remove the value that was there
+    before. As of 2022 the site requires the start and end dates to be entered separately to
+    refresh the table, so both boxes share this helper instead of duplicating the keystrokes.
+
+    :param date: date to enter; sent to the box formatted as MM/DD/YYYY
+    :param date_input: page element for the date box; only its send_keys method is used
+    :returns: N/A
+    """
+    typed_date = date.strftime("%m/%d/%Y")
+    date_input.send_keys(typed_date)
+    # first pass backtracks to the start of our input date, second pass deletes the default date
+    for key in (Keys.ARROW_LEFT, Keys.BACKSPACE):
+        for _ in range(10):
+            date_input.send_keys(key)
+    date_input.send_keys(Keys.RETURN)
+
+def is_bad_date(driver: WebDriver, element_id) -> Tuple[bool, str]:
+    """
+    Entering the date improperly causes dynamic text to appear right below the input box
+    indicating whether the date is invalid (formatting issue) or unavailable. Read this
+    text and return the appropriate message to the caller.
+
+    NOTE: not currently used, but good to keep here in case the site changes and we need
+    this pattern again.
+
+    :param driver: Selenium WebDriver with the campground page already loaded
+    :param element_id: id of the error-text element to read (e.g. START_DATE_ERROR_TAG_NAME)
+    :returns: (True, reason) if error text is showing, where reason is "not available",
+        "not valid", or "new error" for text we don't recognize; (False, "all good") if the
+        error element is missing or has no text
+    """
+    try:
+        date_error_msg = driver.find_element(by=By.ID, value=element_id)
+    except NoSuchElementException:
+        # find_element raises instead of returning None, so a missing element is the
+        # "no error shown" case.
+        # NOTE(review): if the driver has an implicit wait set, this lookup blocks for
+        # that long before raising.
+        return (False, "all good")
+
+    error_text = date_error_msg.text
+    logger.info(error_text)
+    if not error_text.strip():
+        # element is present but empty -> nothing is being reported to the user
+        return (False, "all good")
+
+    invalid_str = "not valid"
+    unavailable_str = "not available"
+    if unavailable_str in error_text:
+        return (True, unavailable_str)
+    if invalid_str in error_text:
+        return (True, invalid_str)
+    return (True, "new error")
+
 def scrape_campground(driver: WebDriver, campground: Campground, start_date: datetime, num_days: int) -> bool:
     """
     Use Selenium WebDriver to load page, input desired start date, identify availability table
@@ -147,27 +197,38 @@ def scrape_campground(driver: WebDriver, campground: Campground, start_date: dat
     try:
         logger.debug("\tGetting campground.url (%s) with driver", campground.url)
         driver.get(campground.url)
+
+        try:        # check for tutorial window and close it if it appears, otherwise table doesn't load correctly
+            tutorial_close_button = driver.find_element(by=By.XPATH, value=TUTORIAL_CLOSE_BUTTON_XPATH)
+            logger.debug("\tClosing tutorial window")
+            tutorial_close_button.click()
+        except NoSuchElementException:
+            logger.debug("\tNo tutorial this time")
+            pass    # we don't actually care if tutorial didn't appear, just move on
+
         logger.debug("\tFinding input box tag")
-        date_input = wait_for_page_element_load(driver, INPUT_TAG_NAME)
-        if date_input is None:  # if wait for page element load fails -> abandon this check immediately
+        start_date_input = wait_for_page_element_load(driver, START_DATE_INPUT_TAG_NAME)
+        if start_date_input is None:  # if wait for page element load fails -> abandon this check immediately
             return False
-        # date_input = driver.find_element_by_id(INPUT_TAG_NAME)
-        logger.debug("\tSending new date with send_keys")
-        date_input.send_keys(start_date.strftime("%m/%d/%Y"))
-        for _ in range(10):     # backtrack to start of our input date
-            date_input.send_keys(Keys.ARROW_LEFT)
-        for _ in range(10):     # delete default start date
-            date_input.send_keys(Keys.BACKSPACE)
-        date_input.send_keys(Keys.RETURN)
-        # manually click refresh table button to ensure valid table data
-        # (if you don't do this every cell might be filled with 'x')
-        refresh_table = driver.find_elements_by_xpath(AVAILABILITY_TABLE_REFRESH_XPATH)[0]
-        refresh_table.click()
+        logger.debug("\tInputting start/end dates with send_keys")
+        enter_date_input(start_date, start_date_input)
+
+        end_date = start_date + timedelta(days=num_days)
+        end_date_input = wait_for_page_element_load(driver, END_DATE_INPUT_TAG_NAME)
+        if end_date_input is None:  # if wait for page element load fails -> abandon this check immediately
+            return False
+        enter_date_input(end_date, end_date_input)
 
         # wait for table refresh/loading spinning wheel to disappear, otherwise table contents are gibberish/NaN
         # https://stackoverflow.com/a/29084080 -- wait for element to *not* be visible
-        loading_tag = driver.find_element_by_class_name(TABLE_LOADING_TAG_CLASS)
-        WebDriverWait(driver, PAGE_LOAD_WAIT).until(EC.invisibility_of_element(loading_tag))
+        # https://stackoverflow.com/a/51884408 -- wait for element not to be visible even though it
+        # may already be invisible
+        # https://stackoverflow.com/a/45420111 -- temporarily kill implicit waits to make explicit wait work correctly
+        driver.implicitly_wait(0)
+        WebDriverWait(driver, PAGE_LOAD_WAIT).until(EC.invisibility_of_element_located(
+            (By.CLASS_NAME, TABLE_LOADING_TAG_CLASS)))
+        driver.implicitly_wait(PAGE_LOAD_WAIT)
+
         logger.debug("\tFinding availability table tag")
         availability_table = wait_for_page_element_load(driver, AVAILABILITY_TABLE_TAG_NAME)
         if availability_table is None:  # if wait for page element load fails -> abandon this check immediately
@@ -188,17 +249,22 @@ def run():
     """
     Runs scrape availability module for specific values, should be used for debugging only.
     """
+    signal(SIGINT, exit_gracefully)     # add custom handler for SIGINT/CTRL-C
     # kirk_creek = "https://www.recreation.gov/camping/campgrounds/233116/availability"
     # kirk_start_date_str = "09/17/2021"
-    mcgill = "https://www.recreation.gov/camping/campgrounds/231962/availability"
-    mcgill_start_date_str = "05/31/2021"
+    # mcgill = "https://www.recreation.gov/camping/campgrounds/231962/availability"
+    mcgill_start_date = datetime.strptime("05/31/2022", "%m/%d/%Y")
     num_days = 2
+    mcgill_campground = Campground(name="McGill", facility_id="231962")
 
-    driver = create_selenium_driver()
-    if scrape_campground(driver, mcgill, mcgill_start_date_str, num_days):
+    driver = create_selenium_driver(headless=True)
+    if scrape_campground(driver, mcgill_campground, mcgill_start_date, num_days):
         logger.info("WE HAVE SOMETHING AVAILABLE!")
     else:
         logger.info("sad")
+    sleep(10000000)
+    driver.quit()
 
 if __name__ == "__main__":
+    setup_logging()
     run()
diff --git a/utils.py b/utils.py
@@ -0,0 +1,63 @@
+"""
+utils.py
+
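+Shared helpers used by the main daemon and by modules run on their own during development:
+graceful SIGINT/WebDriver shutdown, project-wide logging setup, and a network-throttling
+aid for testing.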
+"""
+
+import sys
+import logging
+from logging.handlers import TimedRotatingFileHandler
+from selenium.webdriver.chrome.webdriver import WebDriver
+
+logger = logging.getLogger(__name__)
+
+def exit_gracefully(signal_received, frame, close_this_driver: WebDriver=None):
+    """
+    SIGINT handler: shut down the WebDriver (if one was passed in) and exit with status 0.
+    Ref: https://www.devdungeon.com/content/python-catch-sigint-ctrl-c
+         https://docs.python.org/3/library/signal.html
+
+    :param signal_received: first argument supplied by the signal module; unused
+    :param frame: second argument supplied by the signal module; unused
+    :param close_this_driver: Selenium WebDriver to quit before exiting, or None to skip
+    :returns: N/A -- always ends the process via sys.exit(0)
+    """
+    logger.info("Received CTRL-C/SIGNINT or daemon completed; exiting gracefully/closing WebDriver if initialized.")
+    if close_this_driver is None:
+        sys.exit(0)     # no driver to clean up
+    # use quit instead of close to avoid tons of leftover chrome processes
+    # https://stackoverflow.com/questions/15067107/difference-between-webdriver-dispose-close-and-quit
+    close_this_driver.quit()
+    logger.info("WebDriver Quit Successfully")
+    sys.exit(0)
+
+def set_low_network_quality(driver: WebDriver) -> None:
+    """
+    Set WebDriver to simulate low network quality -- 5ms additional latency, only 500kb
+    throughput. Mostly used when testing the load actions of the availability table; sometimes
+    there is a web element that shows up and spins for a while while the wait happens and it
+    can be difficult to find the name for that without the appropriate delays set.
+
+    Should never be used in production code, but left here for future testing needs.
+
+    :param driver: Selenium WebDriver to throttle
+    :returns: N/A
+    """
+    latency_delay_ms = 5
+    download_throughput_kb = 500
+    upload_throughput_kb = 500
+    # Selenium's documented example passes throughput as `500 * 1024`, so convert our kb
+    # figures before handing them over; passing the bare 500 throttles far harder than intended
+    driver.set_network_conditions(
+        offline=False,
+        latency=latency_delay_ms,
+        download_throughput=download_throughput_kb * 1024,
+        upload_throughput=upload_throughput_kb * 1024)
+
+def setup_logging() -> None:
+    """
+    Set logging for whole project. This function is called from wherever the program
+    is invoked. Could be from different places given development of different components
+    separately.
+
+    Logs go to logs/recgov.log (relative to the current working directory) through a
+    TimedRotatingFileHandler created with when="d", interval=1, backupCount=5; rotated
+    files get a %Y-%m-%d suffix.
+
+    :returns: N/A
+    """
+    import os   # local import: only needed to make sure the log directory exists
+
+    log_file = "logs/recgov.log"
+    # the handler opens the file right away and raises FileNotFoundError if logs/ is missing
+    os.makedirs(os.path.dirname(log_file), exist_ok=True)
+    rotating_handler = TimedRotatingFileHandler(log_file, when="d", interval=1,  backupCount=5)
+    rotating_handler.suffix = "%Y-%m-%d"
+    logging.basicConfig(
+        handlers=[rotating_handler],
+        level=logging.INFO,
+        format="[%(asctime)s] %(filename)s:%(lineno)d [%(name)s]%(levelname)s - %(message)s",
+    )