From 5d0837849f975ece97bc5005c2bf6889ec2283db Mon Sep 17 00:00:00 2001 From: rmjacobson <36381872+rmjacobson@users.noreply.github.com> Date: Fri, 22 Apr 2022 23:47:33 -0700 Subject: [PATCH] Fix recgov format updates (#14) The recreation.gov site rewrite was a little more complicated than expected, these changes took a little longer than expected, and some code was touched that's not directly relevant to make things simpler. Overview of changes: - campground.py edited to remove the /availability path from the url field -- it's no longer needed as recreation.gov now displays the availability table on the main page - utils.py -- some functions like the graceful error handling and logging setup can be moved into a utils file so they can be used in modules as well as in the main daemon -- good for dev - daemon.py -- edit to use the utils file for logging/etc. - scrape_availability.py - improved how we enter/validate the start/end dates - improved detecting the results table-loading element - enabled closing recreation.gov tutorial window if it exists - fixed starting up chromium on raspbian (currently the only thing we support) --- campground.py | 10 ++-- daemon.py | 32 ++--------- scrape_availability.py | 124 +++++++++++++++++++++++++++++++---------- utils.py | 63 +++++++++++++++++++++ 4 files changed, 167 insertions(+), 62 deletions(-) create mode 100644 utils.py diff --git a/campground.py b/campground.py index d35a252..ef5c85f 100644 --- a/campground.py +++ b/campground.py @@ -12,11 +12,11 @@ class Campground(): Taken from https://github.com/CCInCharge/campsite-checker, has been useful for debug. 
""" def __init__(self, name="N/A", facility_id=None): - self.name = name # name of campground - self.id = facility_id # facility ID of campground - self.url = f"{RECGOV_BASE_URL}/{facility_id}/availability" # recreation.gov URL for campground - self.available = False # initialize to unavailable - self.error_count = 0 # initialize parsing error count to 0 + self.name = name # name of campground + self.id = facility_id # facility ID of campground + self.url = f"{RECGOV_BASE_URL}/{facility_id}" # recreation.gov URL for campground + self.available = False # initialize to unavailable + self.error_count = 0 # initialize parsing error count to 0 # self.campsites = {} # TODO: develop way of storing available specific campsites def pretty(self): diff --git a/daemon.py b/daemon.py index d32d4c2..18305dc 100644 --- a/daemon.py +++ b/daemon.py @@ -32,11 +32,9 @@ - https://peter.bourgon.org/blog/2017/02/21/metrics-tracing-and-logging.html """ -import sys from signal import signal, SIGINT import json import logging -from logging.handlers import TimedRotatingFileHandler import argparse import smtplib import ssl @@ -49,14 +47,8 @@ from scrape_availability import create_selenium_driver, scrape_campground from ridb_interface import get_facilities_from_ridb from campground import Campground, CampgroundList +from utils import exit_gracefully, setup_logging -rotating_handler = handler = TimedRotatingFileHandler("logs/recgov.log", when="d", interval=1, backupCount=5) -rotating_handler.suffix = "%Y-%m-%d" -logging.basicConfig( - handlers=[rotating_handler], - level=logging.INFO, - format="[%(asctime)s] %(filename)s:%(lineno)d [%(name)s]%(levelname)s - %(message)s", -) logger = logging.getLogger(__name__) # set in ~/.virtualenvs/recgov_daemon/bin/postactivate @@ -64,24 +56,6 @@ GMAIL_PASSWORD = os.environ.get("gmail_password") RETRY_WAIT = 300 -def exit_gracefully(signal_received, frame, close_this_driver: WebDriver=None): - """ - Handler for SIGINT that will close webdriver carefully if 
necessary. - Ref: https://www.devdungeon.com/content/python-catch-sigint-ctrl-c - https://docs.python.org/3/library/signal.html - - :param signal_received: signal object received by handler - :param frame: actually have no idea what this is and we never use it... - :param driver: Selenium WebDriver to close before exiting - :returns: N/A - """ - logger.info("Received CTRL-C/SIGNINT or daemon completed; exiting gracefully/closing WebDriver if initialized.") - if close_this_driver is not None: - # use quit instead of close to avoid tons of leftover chrome processes - # https://stackoverflow.com/questions/15067107/difference-between-webdriver-dispose-close-and-quit - close_this_driver.quit() - sys.exit(0) - def send_email_alert(available_campgrounds: CampgroundList): """ Send email alert to email address provided by argparse, from email address (and password) @@ -171,7 +145,8 @@ def compare_availability(selenium_driver: WebDriver, campground_list: Campground logger.info("%s (%s) is not available, trying again in %s seconds", campground.name, campground.id, RETRY_WAIT) - # if campground parsing has errored more than 5 times in a row, remove it from the CampgroundList + # if campground parsing has errored more than 5 times in a row + # remove it from the CampgroundList so we can stop checking it and failing if campground.error_count > 5: err_msg = f"Campground errored more than 5 times in a row, removing it from list:\n{campground.pretty()}" logger.error(err_msg) @@ -247,4 +222,5 @@ def run(): parser.add_argument("--campground_ids", type=parse_id_args, help="Comma-separated list of campground facility IDs you want to check (e.g. 
`233116,231962`).") args = parser.parse_args() + setup_logging() run() diff --git a/scrape_availability.py b/scrape_availability.py index b3c0d10..7dd10eb 100644 --- a/scrape_availability.py +++ b/scrape_availability.py @@ -7,8 +7,12 @@ import logging import traceback +from signal import signal, SIGINT from datetime import datetime, timedelta +from time import sleep +from typing import Tuple from pandas.core.frame import DataFrame +from pyvirtualdisplay import Display from bs4 import BeautifulSoup from selenium import webdriver from selenium.webdriver.chrome.webdriver import WebDriver @@ -16,18 +20,26 @@ from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By -from selenium.common.exceptions import TimeoutException +from selenium.webdriver.chrome.service import Service +from selenium.common.exceptions import NoSuchElementException, TimeoutException +from webdriver_manager.chrome import ChromeDriverManager +from selenium.webdriver.chrome.options import Options +from webdriver_manager.utils import ChromeType from campground import Campground +from utils import exit_gracefully, setup_logging logger = logging.getLogger(__name__) # tag names needed for html interaction/parsing found via manual inspection of # recreation.gov -- DO NOT CHANGE unless recreation.gov changes its layout! 
-INPUT_TAG_NAME = "single-date-picker-1" +START_DATE_INPUT_TAG_NAME = "campground-start-date-calendar" +START_DATE_ERROR_TAG_NAME = "campground-start-date-calendar-error" +END_DATE_INPUT_TAG_NAME = "campground-end-date-calendar" +END_DATE_ERROR_TAG_NAME = "campground-end-date-calendar-error" AVAILABILITY_TABLE_TAG_NAME = "availability-table" -TABLE_LOADING_TAG_CLASS = "rec-table-overlay" +TABLE_LOADING_TAG_CLASS = "rec-table-overlay-loading" CAMP_LOCATION_NAME_ICON = "camp-location-name--icon" -AVAILABILITY_TABLE_REFRESH_XPATH = """//*[@id="page-body"]/div/div[1]/div[1]/div[3]/div[1]/div[1]/div/div/button[1]""" +TUTORIAL_CLOSE_BUTTON_XPATH = "/html/body/div[11]/div/div/div/div/div/div/div/button" PAGE_LOAD_WAIT = 60 def parse_html_table(table: BeautifulSoup) -> DataFrame: @@ -87,18 +99,23 @@ def all_dates_available(df: DataFrame, start_date: datetime, num_days: int) -> b return at_least_one_available -def create_selenium_driver() -> WebDriver: +def create_selenium_driver(headless: bool=True) -> WebDriver: """ Initialize Selenium WebDriver object and return it to the caller. Do this in a separate function to allow driver re-use across rounds of scraping. Note: the remote debugging port option seems to be required for raspberry pi operation: https://stackoverflow.com/a/56638103 + :param headless: create GUI for WebDriver? 
Testing usage passes in False, defaults to True :returns: Selenium WebDriver object """ - chrome_options = webdriver.ChromeOptions() - chrome_options.add_argument("--headless") - chrome_options.add_argument("--remote-debugging-port=9222") - driver = webdriver.Chrome(options=chrome_options) + options = Options() + options.add_argument("enable-automation") # necessary for driving Chromium actions + if headless: + options.add_argument("--headless") + options.add_argument("--remote-debugging-port=9222") # necessary for driving Chromium actions + options.binary_location = "/usr/bin/chromium-browser" # browser is Chromium instead of Chrome + driver_path = "/usr/lib/chromium-browser/chromedriver" # we use custom chromedriver for raspberry + driver = webdriver.Chrome(options=options, service=Service(driver_path)) driver.implicitly_wait(PAGE_LOAD_WAIT) return driver @@ -119,6 +136,39 @@ def wait_for_page_element_load(driver: WebDriver, elem_id: str): logger.exception("Loading %s element on page took too much time; skipping this load.", elem_id) return None +def enter_date_input(date: datetime, date_input): + """ + As of 2022, recreation.gov requires inputting a start and end date separately to refresh the + table, so pull out this function so that we can call it for both boxes and avoid code duplication. + """ + date_input.send_keys(date.strftime("%m/%d/%Y")) + for _ in range(10): # backtrack to start of our input date + date_input.send_keys(Keys.ARROW_LEFT) + for _ in range(10): # delete default start date + date_input.send_keys(Keys.BACKSPACE) + date_input.send_keys(Keys.RETURN) + +def is_bad_date(driver: WebDriver, element_id) -> Tuple[bool, str]: + """ + Entering the date improperly causes dynamic text to appear right below the input box + indicating whether the date is invalid (formatting issue) or unavailable. Read this + text and return the appropriate message to the caller. 
+ + NOTE: not currently used, but good to keep here in case the site changes and we need + this pattern again. + """ + date_error_msg = driver.find_element(by=By.ID, value=element_id) + invalid_str = "not valid" + unavailable_str = "not available" + logger.info(date_error_msg.text) + if date_error_msg is not None: + if unavailable_str in date_error_msg.text: + return (True, unavailable_str) + if invalid_str in date_error_msg.text: + return (True, invalid_str) + return (True, "new error") + return (False, "all good") + def scrape_campground(driver: WebDriver, campground: Campground, start_date: datetime, num_days: int) -> bool: """ Use Selenium WebDriver to load page, input desired start date, identify availability table @@ -147,27 +197,38 @@ def scrape_campground(driver: WebDriver, campground: Campground, start_date: dat try: logger.debug("\tGetting campground.url (%s) with driver", campground.url) driver.get(campground.url) + + try: # check for tutorial window and close it if it appears, otherwise table doesn't load correctly + tutorial_close_button = driver.find_element(by=By.XPATH, value=TUTORIAL_CLOSE_BUTTON_XPATH) + logger.debug("\tClosing tutorial window") + tutorial_close_button.click() + except NoSuchElementException: + logger.debug("\tNo tutorial this time") + pass # we don't actually care if tutorial didn't appear, just move on + logger.debug("\tFinding input box tag") - date_input = wait_for_page_element_load(driver, INPUT_TAG_NAME) - if date_input is None: # if wait for page element load fails -> abandon this check immediately + start_date_input = wait_for_page_element_load(driver, START_DATE_INPUT_TAG_NAME) + if start_date_input is None: # if wait for page element load fails -> abandon this check immediately return False - # date_input = driver.find_element_by_id(INPUT_TAG_NAME) - logger.debug("\tSending new date with send_keys") - date_input.send_keys(start_date.strftime("%m/%d/%Y")) - for _ in range(10): # backtrack to start of our input date - 
date_input.send_keys(Keys.ARROW_LEFT) - for _ in range(10): # delete default start date - date_input.send_keys(Keys.BACKSPACE) - date_input.send_keys(Keys.RETURN) - # manually click refresh table button to ensure valid table data - # (if you don't do this every cell might be filled with 'x') - refresh_table = driver.find_elements_by_xpath(AVAILABILITY_TABLE_REFRESH_XPATH)[0] - refresh_table.click() + logger.debug("\tInputting start/end dates with send_keys") + enter_date_input(start_date, start_date_input) + + end_date = start_date + timedelta(days=num_days) + end_date_input = wait_for_page_element_load(driver, END_DATE_INPUT_TAG_NAME) + if end_date_input is None: # if wait for page element load fails -> abandon this check immediately + return False + enter_date_input(end_date, end_date_input) # wait for table refresh/loading spinning wheel to disappear, otherwise table contents are gibberish/NaN # https://stackoverflow.com/a/29084080 -- wait for element to *not* be visible - loading_tag = driver.find_element_by_class_name(TABLE_LOADING_TAG_CLASS) - WebDriverWait(driver, PAGE_LOAD_WAIT).until(EC.invisibility_of_element(loading_tag)) + # https://stackoverflow.com/a/51884408 -- wait for element not to be visible even though it + # may already be invisible + # https://stackoverflow.com/a/45420111 -- temporarily kill implicit waits to make explicit wait work correctly + driver.implicitly_wait(0) + WebDriverWait(driver, PAGE_LOAD_WAIT).until(EC.invisibility_of_element_located( + (By.CLASS_NAME, TABLE_LOADING_TAG_CLASS))) + driver.implicitly_wait(PAGE_LOAD_WAIT) + logger.debug("\tFinding availability table tag") availability_table = wait_for_page_element_load(driver, AVAILABILITY_TABLE_TAG_NAME) if availability_table is None: # if wait for page element load fails -> abandon this check immediately @@ -188,17 +249,22 @@ def run(): """ Runs scrape availability module for specific values, should be used for debugging only. 
""" + signal(SIGINT, exit_gracefully) # add custom handler for SIGINT/CTRL-C # kirk_creek = "https://www.recreation.gov/camping/campgrounds/233116/availability" # kirk_start_date_str = "09/17/2021" - mcgill = "https://www.recreation.gov/camping/campgrounds/231962/availability" - mcgill_start_date_str = "05/31/2021" + # mcgill = "https://www.recreation.gov/camping/campgrounds/231962/availability" + mcgill_start_date = datetime.strptime("05/31/2022", "%m/%d/%Y") num_days = 2 + mcgill_campground = Campground(name="McGill", facility_id="231962") - driver = create_selenium_driver() - if scrape_campground(driver, mcgill, mcgill_start_date_str, num_days): + driver = create_selenium_driver(headless=True) + if scrape_campground(driver, mcgill_campground, mcgill_start_date, num_days): logger.info("WE HAVE SOMETHING AVAILABLE!") else: logger.info("sad") + sleep(10000000) + driver.quit() if __name__ == "__main__": + setup_logging() run() diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..7ef244e --- /dev/null +++ b/utils.py @@ -0,0 +1,63 @@ +""" +utils.py + +fill this out later +""" + +import sys +import logging +from logging.handlers import TimedRotatingFileHandler +from selenium.webdriver.chrome.webdriver import WebDriver + +logger = logging.getLogger(__name__) + +def exit_gracefully(signal_received, frame, close_this_driver: WebDriver=None): + """ + Handler for SIGINT that will close webdriver carefully if necessary. + Ref: https://www.devdungeon.com/content/python-catch-sigint-ctrl-c + https://docs.python.org/3/library/signal.html + + :param signal_received: signal object received by handler + :param frame: actually have no idea what this is and we never use it... 
+ :param driver: Selenium WebDriver to close before exiting + :returns: N/A + """ + logger.info("Received CTRL-C/SIGNINT or daemon completed; exiting gracefully/closing WebDriver if initialized.") + if close_this_driver is not None: + # use quit instead of close to avoid tons of leftover chrome processes + # https://stackoverflow.com/questions/15067107/difference-between-webdriver-dispose-close-and-quit + close_this_driver.quit() + logger.info("WebDriver Quit Successfully") + sys.exit(0) + +def set_low_network_quality(driver: WebDriver) -> None: + """ + Set WebDriver to simulate low network quality -- 5ms additional latency, only 500kb + throughput. Mostly used when testing the load actions of the availability table; sometimes + there is a web element that shows up and spins for a while while the wait happens and it + can be difficult to find the name for that without the appropriate delays set. + + Should never be used in production code, but left here for future testing needs. + """ + latency_delay_ms = 5 + download_throughput_kb = 500 + upload_throughput_kb = 500 + driver.set_network_conditions( + offline=False, + latency=latency_delay_ms, + download_throughput=download_throughput_kb, + upload_throughput=upload_throughput_kb) + +def setup_logging() -> None: + """ + Set logging for whole project. This function is called from wherever the program + is invoked. Could be from different places given development of different components + separately. + """ + rotating_handler = TimedRotatingFileHandler("logs/recgov.log", when="d", interval=1, backupCount=5) + rotating_handler.suffix = "%Y-%m-%d" + logging.basicConfig( + handlers=[rotating_handler], + level=logging.INFO, + format="[%(asctime)s] %(filename)s:%(lineno)d [%(name)s]%(levelname)s - %(message)s", + )