Skip to content

Latest commit

 

History

History
218 lines (179 loc) · 7.37 KB

eda.md

File metadata and controls

218 lines (179 loc) · 7.37 KB

eda

Manye Dong 2023-10-05

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
weather_df = 
  rnoaa::meteo_pull_monitors(
    c("USW00094728", "USW00022534", "USS0023B17S"),
    var = c("PRCP", "TMIN", "TMAX"), 
    date_min = "2021-01-01",
    date_max = "2022-12-31") |>
  mutate(
    name = recode(
      id, 
      USW00094728 = "CentralPark_NY", 
      USW00022534 = "Molokai_HI",
      USS0023B17S = "Waterhole_WA"),
    tmin = tmin / 10,
    tmax = tmax / 10,
    # floor_date is to round down the date in one month
    month = lubridate::floor_date(date, unit = "month")) |>
  select(name, id, everything())
## using cached file: /Users/caradong/Library/Caches/org.R-project.R/R/rnoaa/noaa_ghcnd/USW00094728.dly

## date created (size, mb): 2023-09-05 10:39:29.44826 (8.523)

## file min/max dates: 1869-01-01 / 2023-09-30

## using cached file: /Users/caradong/Library/Caches/org.R-project.R/R/rnoaa/noaa_ghcnd/USW00022534.dly

## date created (size, mb): 2023-09-28 10:14:47.893627 (3.83)

## file min/max dates: 1949-10-01 / 2023-09-30

## using cached file: /Users/caradong/Library/Caches/org.R-project.R/R/rnoaa/noaa_ghcnd/USS0023B17S.dly

## date created (size, mb): 2023-09-05 10:39:35.901108 (0.994)

## file min/max dates: 1999-09-01 / 2023-09-30
weather_df
## # A tibble: 2,190 × 7
##    name           id          date        prcp  tmax  tmin month     
##    <chr>          <chr>       <date>     <dbl> <dbl> <dbl> <date>    
##  1 CentralPark_NY USW00094728 2021-01-01   157   4.4   0.6 2021-01-01
##  2 CentralPark_NY USW00094728 2021-01-02    13  10.6   2.2 2021-01-01
##  3 CentralPark_NY USW00094728 2021-01-03    56   3.3   1.1 2021-01-01
##  4 CentralPark_NY USW00094728 2021-01-04     5   6.1   1.7 2021-01-01
##  5 CentralPark_NY USW00094728 2021-01-05     0   5.6   2.2 2021-01-01
##  6 CentralPark_NY USW00094728 2021-01-06     0   5     1.1 2021-01-01
##  7 CentralPark_NY USW00094728 2021-01-07     0   5    -1   2021-01-01
##  8 CentralPark_NY USW00094728 2021-01-08     0   2.8  -2.7 2021-01-01
##  9 CentralPark_NY USW00094728 2021-01-09     0   2.8  -4.3 2021-01-01
## 10 CentralPark_NY USW00094728 2021-01-10     0   5    -1.6 2021-01-01
## # ℹ 2,180 more rows

Initial Numeric work

weather_df |> 
  ggplot(aes(x = prcp)) + 
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 15 rows containing non-finite values (`stat_bin()`).

Here are the big outliers: any formal analyses involving precipitation as a predictor or outcome might be influenced by this fact

weather_df |> filter(prcp > 1000)
## # A tibble: 3 × 7
##   name           id          date        prcp  tmax  tmin month     
##   <chr>          <chr>       <date>     <dbl> <dbl> <dbl> <date>    
## 1 CentralPark_NY USW00094728 2021-08-21  1130  27.8  22.8 2021-08-01
## 2 CentralPark_NY USW00094728 2021-09-01  1811  25.6  17.2 2021-09-01
## 3 Molokai_HI     USW00022534 2022-12-18  1120  23.3  18.9 2022-12-01
weather_df |> 
  filter(tmax >= 20, tmax <= 30) |> 
  ggplot(aes(x = tmin, y = tmax, color = name, shape = name)) + 
  geom_point(alpha = .75)

Waterhole is doing sth fundamentally different

Grouping

weather_df |>
  # often somewhat invisible, but can only see the note "groups: name[3], denoting the unique groups"
  group_by(name, month) |>
  # the next line is what gives you a new aggregated column
  count(month, month = "n_obs")
## # A tibble: 3 × 3
## # Groups:   name, month [3]
##   name           month     n
##   <chr>          <chr> <int>
## 1 CentralPark_NY n_obs   730
## 2 Molokai_HI     n_obs   730
## 3 Waterhole_WA   n_obs   730

Count produces a dataframe you can use or manipulate directly

weather_df |>
  count(name, month) |>
  # pivot_wider is to untidy something
  pivot_wider(
    # names from means the columns
    names_from = name,
    values_from = n
  ) |>
  knitr::kable(digits=2)
month CentralPark_NY Molokai_HI Waterhole_WA
2021-01-01 31 31 31
2021-02-01 28 28 28
2021-03-01 31 31 31
2021-04-01 30 30 30
2021-05-01 31 31 31
2021-06-01 30 30 30
2021-07-01 31 31 31
2021-08-01 31 31 31
2021-09-01 30 30 30
2021-10-01 31 31 31
2021-11-01 30 30 30
2021-12-01 31 31 31
2022-01-01 31 31 31
2022-02-01 28 28 28
2022-03-01 31 31 31
2022-04-01 30 30 30
2022-05-01 31 31 31
2022-06-01 30 30 30
2022-07-01 31 31 31
2022-08-01 31 31 31
2022-09-01 30 30 30
2022-10-01 31 31 31
2022-11-01 30 30 30
2022-12-01 31 31 31

General Summaries

weather_df |>
  group_by(name) |>
  summarize(
    mean_tmax = mean(tmax, na.rm = TRUE), 
    std_tmax = sd(tmax, na.rm = TRUE), 
    median_tmax = median(tmax, na.rm = TRUE)
  )
## # A tibble: 3 × 4
##   name           mean_tmax std_tmax median_tmax
##   <chr>              <dbl>    <dbl>       <dbl>
## 1 CentralPark_NY     17.7      9.96        18.9
## 2 Molokai_HI         28.3      1.80        28.3
## 3 Waterhole_WA        7.38     7.55         6.1
# by default, na.rm is FALSE, it will take NA as NA

Plot the line plot:

weather_df |>
  group_by(name, month) |>
  summarize(
    mean_tmax = mean(tmax, na.rm = TRUE)
    ) |>
  ggplot(aes(x=month, y=mean_tmax, color=name)) + 
  geom_point() +
  geom_line()
## `summarise()` has grouped output by 'name'. You can override using the
## `.groups` argument.