Manye Dong 2023-10-05
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
weather_df =
rnoaa::meteo_pull_monitors(
c("USW00094728", "USW00022534", "USS0023B17S"),
var = c("PRCP", "TMIN", "TMAX"),
date_min = "2021-01-01",
date_max = "2022-12-31") |>
mutate(
name = recode(
id,
USW00094728 = "CentralPark_NY",
USW00022534 = "Molokai_HI",
USS0023B17S = "Waterhole_WA"),
tmin = tmin / 10,
tmax = tmax / 10,
# floor_date is to round down the date in one month
month = lubridate::floor_date(date, unit = "month")) |>
select(name, id, everything())
## using cached file: /Users/caradong/Library/Caches/org.R-project.R/R/rnoaa/noaa_ghcnd/USW00094728.dly
## date created (size, mb): 2023-09-05 10:39:29.44826 (8.523)
## file min/max dates: 1869-01-01 / 2023-09-30
## using cached file: /Users/caradong/Library/Caches/org.R-project.R/R/rnoaa/noaa_ghcnd/USW00022534.dly
## date created (size, mb): 2023-09-28 10:14:47.893627 (3.83)
## file min/max dates: 1949-10-01 / 2023-09-30
## using cached file: /Users/caradong/Library/Caches/org.R-project.R/R/rnoaa/noaa_ghcnd/USS0023B17S.dly
## date created (size, mb): 2023-09-05 10:39:35.901108 (0.994)
## file min/max dates: 1999-09-01 / 2023-09-30
weather_df
## # A tibble: 2,190 × 7
## name id date prcp tmax tmin month
## <chr> <chr> <date> <dbl> <dbl> <dbl> <date>
## 1 CentralPark_NY USW00094728 2021-01-01 157 4.4 0.6 2021-01-01
## 2 CentralPark_NY USW00094728 2021-01-02 13 10.6 2.2 2021-01-01
## 3 CentralPark_NY USW00094728 2021-01-03 56 3.3 1.1 2021-01-01
## 4 CentralPark_NY USW00094728 2021-01-04 5 6.1 1.7 2021-01-01
## 5 CentralPark_NY USW00094728 2021-01-05 0 5.6 2.2 2021-01-01
## 6 CentralPark_NY USW00094728 2021-01-06 0 5 1.1 2021-01-01
## 7 CentralPark_NY USW00094728 2021-01-07 0 5 -1 2021-01-01
## 8 CentralPark_NY USW00094728 2021-01-08 0 2.8 -2.7 2021-01-01
## 9 CentralPark_NY USW00094728 2021-01-09 0 2.8 -4.3 2021-01-01
## 10 CentralPark_NY USW00094728 2021-01-10 0 5 -1.6 2021-01-01
## # ℹ 2,180 more rows
weather_df |>
ggplot(aes(x = prcp)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 15 rows containing non-finite values (`stat_bin()`).
Here are the big outliers: any formal analyses involving precipitation as a predictor or outcome might be influenced by this fact
weather_df |> filter(prcp > 1000)
## # A tibble: 3 × 7
## name id date prcp tmax tmin month
## <chr> <chr> <date> <dbl> <dbl> <dbl> <date>
## 1 CentralPark_NY USW00094728 2021-08-21 1130 27.8 22.8 2021-08-01
## 2 CentralPark_NY USW00094728 2021-09-01 1811 25.6 17.2 2021-09-01
## 3 Molokai_HI USW00022534 2022-12-18 1120 23.3 18.9 2022-12-01
weather_df |>
filter(tmax >= 20, tmax <= 30) |>
ggplot(aes(x = tmin, y = tmax, color = name, shape = name)) +
geom_point(alpha = .75)
Waterhole is doing sth fundamentally different
weather_df |>
# often somewhat invisible, but can only see the note "groups: name[3], denoting the unique groups"
group_by(name, month) |>
# the next line is what gives you a new aggregated column
count(month, month = "n_obs")
## # A tibble: 3 × 3
## # Groups: name, month [3]
## name month n
## <chr> <chr> <int>
## 1 CentralPark_NY n_obs 730
## 2 Molokai_HI n_obs 730
## 3 Waterhole_WA n_obs 730
Count produces a dataframe you can use or manipulate directly
weather_df |>
count(name, month) |>
# pivot_wider is to untidy something
pivot_wider(
# names from means the columns
names_from = name,
values_from = n
) |>
knitr::kable(digits=2)
month | CentralPark_NY | Molokai_HI | Waterhole_WA |
---|---|---|---|
2021-01-01 | 31 | 31 | 31 |
2021-02-01 | 28 | 28 | 28 |
2021-03-01 | 31 | 31 | 31 |
2021-04-01 | 30 | 30 | 30 |
2021-05-01 | 31 | 31 | 31 |
2021-06-01 | 30 | 30 | 30 |
2021-07-01 | 31 | 31 | 31 |
2021-08-01 | 31 | 31 | 31 |
2021-09-01 | 30 | 30 | 30 |
2021-10-01 | 31 | 31 | 31 |
2021-11-01 | 30 | 30 | 30 |
2021-12-01 | 31 | 31 | 31 |
2022-01-01 | 31 | 31 | 31 |
2022-02-01 | 28 | 28 | 28 |
2022-03-01 | 31 | 31 | 31 |
2022-04-01 | 30 | 30 | 30 |
2022-05-01 | 31 | 31 | 31 |
2022-06-01 | 30 | 30 | 30 |
2022-07-01 | 31 | 31 | 31 |
2022-08-01 | 31 | 31 | 31 |
2022-09-01 | 30 | 30 | 30 |
2022-10-01 | 31 | 31 | 31 |
2022-11-01 | 30 | 30 | 30 |
2022-12-01 | 31 | 31 | 31 |
weather_df |>
group_by(name) |>
summarize(
mean_tmax = mean(tmax, na.rm = TRUE),
std_tmax = sd(tmax, na.rm = TRUE),
median_tmax = median(tmax, na.rm = TRUE)
)
## # A tibble: 3 × 4
## name mean_tmax std_tmax median_tmax
## <chr> <dbl> <dbl> <dbl>
## 1 CentralPark_NY 17.7 9.96 18.9
## 2 Molokai_HI 28.3 1.80 28.3
## 3 Waterhole_WA 7.38 7.55 6.1
# by default, na.rm is FALSE, it will take NA as NA
Plot the line plot:
weather_df |>
group_by(name, month) |>
summarize(
mean_tmax = mean(tmax, na.rm = TRUE)
) |>
ggplot(aes(x=month, y=mean_tmax, color=name)) +
geom_point() +
geom_line()
## `summarise()` has grouped output by 'name'. You can override using the
## `.groups` argument.