From 425088e82e1ca8730919a9e74705917fe8cc8ec2 Mon Sep 17 00:00:00 2001 From: Jordi Rosell Date: Thu, 17 Oct 2024 12:50:59 +0200 Subject: [PATCH 1/5] Fix read_html_live example I checked the read_html_live example and saw that the CSS selectors had changed and a cookie consent banner had been added. This PR changes the read_html_live() example so that it can reject cookies and extract organizations from the new page version. Scrolling was needed to force the JSON file download. I used |>, but I can change my PR to %>% if required. --- R/live.R | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/R/live.R b/R/live.R index 91f4a1b..498f11d 100644 --- a/R/live.R +++ b/R/live.R @@ -27,16 +27,21 @@ #' # When we retrieve the raw HTML for this site, it doesn't contain the #' # data we're interested in: #' static <- read_html("https://www.forbes.com/top-colleges/") -#' static %>% html_elements(".TopColleges2023_tableRow__BYOSU") +#' static |> html_elements(".ListTable_listTable__-N5U5") #' #' # Instead, we need to run the site in a real web browser, causing it to #' # download a JSON file and then dynamically generate the html: -#' #' sess <- read_html_live("https://www.forbes.com/top-colleges/") #' sess$view() -#' rows <- sess %>% html_elements(".TopColleges2023_tableRow__BYOSU") -#' rows %>% html_element(".TopColleges2023_organizationName__J1lEV") %>% html_text() -#' rows %>% html_element(".grant-aid") %>% html_text() +#' sess$scroll_into_view("#top-colleges") +#' cookies_seen <- length(html_elements(sess, "button[aria-label='Reject All']")) +#' if (cookies_seen) { +#' sess$click("button[aria-label='Accept All']") +#' } +#' rows <- sess |> html_elements("#top-colleges .ListTable_listTable__-N5U5") +#' rows |> +#' html_elements("#top-colleges tbody tr td:nth-of-type(2)") |> +#' html_text() #' } read_html_live <- function(url) { check_installed(c("chromote", "R6")) From b95e6051dc8359b0e6b45b83ccdf61c76470ef9a Mon Sep 17 00:00:00 2001 From: Jordi Rosell 
Date: Thu, 17 Oct 2024 12:53:19 +0200 Subject: [PATCH 2/5] Change to %>% pipe --- R/live.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/live.R b/R/live.R index 498f11d..01cdd36 100644 --- a/R/live.R +++ b/R/live.R @@ -27,7 +27,7 @@ #' # When we retrieve the raw HTML for this site, it doesn't contain the #' # data we're interested in: #' static <- read_html("https://www.forbes.com/top-colleges/") -#' static |> html_elements(".ListTable_listTable__-N5U5") +#' static %>% html_elements(".ListTable_listTable__-N5U5") #' #' # Instead, we need to run the site in a real web browser, causing it to #' # download a JSON file and then dynamically generate the html: @@ -38,9 +38,9 @@ #' if (cookies_seen) { #' sess$click("button[aria-label='Accept All']") #' } -#' rows <- sess |> html_elements("#top-colleges .ListTable_listTable__-N5U5") -#' rows |> -#' html_elements("#top-colleges tbody tr td:nth-of-type(2)") |> +#' rows <- sess %>% html_elements("#top-colleges .ListTable_listTable__-N5U5") +#' rows %>% +#' html_elements("#top-colleges tbody tr td:nth-of-type(2)") %>% #' html_text() #' } read_html_live <- function(url) { From 4b6da44f447f484f12b32ee5d0afd9002c27c9c6 Mon Sep 17 00:00:00 2001 From: Jordi Rosell Date: Thu, 17 Oct 2024 12:57:10 +0200 Subject: [PATCH 3/5] I assume Reject All would be better --- R/live.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/live.R b/R/live.R index 01cdd36..5e99e48 100644 --- a/R/live.R +++ b/R/live.R @@ -36,7 +36,7 @@ #' sess$scroll_into_view("#top-colleges") #' cookies_seen <- length(html_elements(sess, "button[aria-label='Reject All']")) #' if (cookies_seen) { -#' sess$click("button[aria-label='Accept All']") +#' sess$click("button[aria-label='Reject All']") #' } #' rows <- sess %>% html_elements("#top-colleges .ListTable_listTable__-N5U5") #' rows %>% From 4698b8cd33162908e590ef0fa237f4d1d4cb8398 Mon Sep 17 00:00:00 2001 From: Jordi Rosell Date: Thu, 17 Oct 2024 13:05:10 +0200 Subject: 
[PATCH 4/5] Fix scroll --- R/live.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/live.R b/R/live.R index 5e99e48..8106ab2 100644 --- a/R/live.R +++ b/R/live.R @@ -33,14 +33,14 @@ #' # download a JSON file and then dynamically generate the html: #' sess <- read_html_live("https://www.forbes.com/top-colleges/") #' sess$view() -#' sess$scroll_into_view("#top-colleges") #' cookies_seen <- length(html_elements(sess, "button[aria-label='Reject All']")) #' if (cookies_seen) { #' sess$click("button[aria-label='Reject All']") #' } +#' sess$scroll_to(top = 2000) #' rows <- sess %>% html_elements("#top-colleges .ListTable_listTable__-N5U5") #' rows %>% -#' html_elements("#top-colleges tbody tr td:nth-of-type(2)") %>% +#' html_elements("tr td:nth-of-type(2)") %>% #' html_text() #' } read_html_live <- function(url) { From 738875ab52cfcdb2a30f902304d349dba5e470c4 Mon Sep 17 00:00:00 2001 From: Jordi Rosell Date: Thu, 17 Oct 2024 14:39:08 +0200 Subject: [PATCH 5/5] Fixed not finding css element when rejecting cookies --- R/live.R | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/R/live.R b/R/live.R index 8106ab2..f599bcb 100644 --- a/R/live.R +++ b/R/live.R @@ -33,15 +33,21 @@ #' # download a JSON file and then dynamically generate the html: #' sess <- read_html_live("https://www.forbes.com/top-colleges/") #' sess$view() +#' Sys.sleep(2) #' cookies_seen <- length(html_elements(sess, "button[aria-label='Reject All']")) #' if (cookies_seen) { #' sess$click("button[aria-label='Reject All']") +#' sess <- read_html_live("https://www.forbes.com/top-colleges/") +#' sess$view() +#' Sys.sleep(2) #' } -#' sess$scroll_to(top = 2000) -#' rows <- sess %>% html_elements("#top-colleges .ListTable_listTable__-N5U5") -#' rows %>% -#' html_elements("tr td:nth-of-type(2)") %>% -#' html_text() +#' table_seen <- length(html_elements(sess, "table")) +#' if (table_seen) { +#' rows <- sess |> html_elements("table tr") +#' rows |> +#' 
html_elements("td:nth-of-type(2)") |> +#' html_text() |> +#' print() #' } read_html_live <- function(url) { check_installed(c("chromote", "R6"))