Skip to content

Commit

Permalink
add calculation using ReGenesees
Browse files Browse the repository at this point in the history
  • Loading branch information
hansvancalster committed Nov 26, 2024
1 parent fe7a627 commit d21fb8f
Showing 1 changed file with 192 additions and 2 deletions.
194 changes: 192 additions & 2 deletions source/scripts/test_validation_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -257,10 +257,12 @@ oa_df
waldo::compare(oa_df$oa_est, aa$accuracy[1])
waldo::compare(sqrt(oa_df$oa_var), aa$accuracy[2])


ua_pa_df <- calc_ua_pa(
maparea = maparea$area,
ma = as.data.frame.matrix(reschange1$table)
)

ua_pa_df
waldo::compare(unname(ua_pa_df$ua_est), aa$stats$ua, tolerance = 1e-10)
waldo::compare(unname(sqrt(ua_pa_df$ua_var)), aa$stats$ua_se, tolerance = 1e-3)
Expand Down Expand Up @@ -290,7 +292,8 @@ mapac::aa_confusion_matrix_flextable(
proportion = TRUE,
diagonal = TRUE,
format.body = "%.2f",
format.accuracy = "%.3f"
format.accuracy = "%.3f",
rotate.header = TRUE
)

areas_df <- calc_areas(
Expand All @@ -312,8 +315,189 @@ waldo::compare(
tolerance = 1e-5
)

# estimate areas using ReGenesees
prop_area_h <- observed_changes |>
distinct(map, area) |>
mutate(
prop_area_h = area / sum(area)
)
sum(prop_area_h$area)

svydata <- observed_changes |>
mutate(
weights = 1/ ips,
ids = paste0("id_", 1:n()),
ones = 1,
oa = as.numeric(map == ref)
) |>
mutate(
n_ref = n(),
.by = ref
) |>
inner_join(
prop_area_h |> select(-area), by = join_by(map)
)
design <- ReGenesees::e.svydesign(
data = svydata,
ids = ~ ids,
strata = ~ map,
weights = ~ weights,
fpc = ~ ips)

df.pop <- ReGenesees::pop.template(
data = svydata,
calmodel = ~ map - 1
) # see what the template should look like
ReGenesees::pop.desc(df.pop)
df.pop <- maparea |>
select(changecat, area) |>
pivot_wider(
names_from = changecat,
names_prefix = "map",
values_from = area) |>
as.data.frame()
# calibrate on map marginal totals
# this adds the calibration weights to the design
cal <- ReGenesees::e.calibrate(
design = design,
df.population = df.pop,
calmodel = ~ map - 1
)
# but because in this case, apparently,
# the ratio between calibrated weights and initial weights equals 1;
# so using cal or design in estimation will have the same result
# this is probably logical because inverse probability weights (N_h/n_h) already
# incorporate the map marginal totals per stratum
# this would not be the case if we calibrate on different strata or
# additional strata and auxiliary information
ReGenesees::g.range(cal)

# estimate areas for ref
cal_ref_areas <- ReGenesees::svystatTM(
design = cal,
y = ~ ref,
estimator = "Total",
conf.int = TRUE,
deff = TRUE)

waldo::compare(aa$area$area, cal_ref_areas$Total, tolerance = 1e-10)
waldo::compare(aa$area$area_ci, 1.96*cal_ref_areas$SE, tolerance = 1e-4)

sum(aa$area$area) - sum(prop_area_h$area)

# can we estimate the areas also via Bayes formula?
# $P(\text{referentie}=A) = \frac{UA}{PA}P(\text{kaart} = A)$
# first a naive attempt for just the estimator without SE
waldo::compare(
aa$area$area, aa$stats$ua / aa$stats$pa * maparea$area,
tolerance = 1e-4
) # only difference is one NA resulting from division pa 0
plot(aa$area$area, aa$stats$ua / aa$stats$pa * maparea$area)
abline(0, 1)
# how to estimate $\frac{UA}{PA}P(\text{kaart} = A)$ with ReGenesees?
# oa = n_jj = variable equalling one if ref == map, 0 otherwise
# n = n_i. = n_map the sample size per map stratum
# n_ref = n_.k = the number of ref cases
# UA / PA = (n_jj / n_i. ) / (n_jj / n_.k ) simplifies to n_.k / n_i.
# P(\text{kaart} = i) = area_i
# n_.k / n_i. * area_i = sum over i of n_ik / n_i. * area_i
# compare with eq 9 Olofsson:
# W_i = area proportion of map class i = area_i / area_tot
# n_ik = cell count map i ref k
# n_i. = row sum map i
# area estimator for ref class k =
# total map area times sum over i (from 1 to 25) of W_i * n_ik/n_i.
# which again equals sum over i (from 1 to 25) of area_i * n_ik/n_i.

# note that area_i/n_i. are inverse probability weights
# because our survey data are in long format, we can
# represent n_ik as a vector of ones in combi with by = ~ref
# to obtain the same as before using either an expression in svystatL or
cal_ref_areas_checkL <- ReGenesees::svystatL(
design = cal,
expr = expression(ones),
by = ~ ref,
conf.int = TRUE,
deff = TRUE)
cal_ref_areas_checkTM <- ReGenesees::svystatTM(
design = cal,
y = ~ ones,
by = ~ ref,
conf.int = TRUE,
deff = TRUE)
waldo::compare(cal_ref_areas_checkL$ones, cal_ref_areas$Total)
waldo::compare(cal_ref_areas_checkTM$Total.ones, cal_ref_areas$Total)
waldo::compare(cal_ref_areas_checkL$SE.ones, cal_ref_areas$SE)
waldo::compare(cal_ref_areas_checkTM$SE.Total.ones, cal_ref_areas$SE)

# can it also be used to calculate OA, UA and PA?
# OA: YES
cal_oa <- ReGenesees::svystatTM(
design = cal,
y = ~ oa,
estimator = "Mean",
conf.int = TRUE,
deff = TRUE)
waldo::compare(aa$accuracy[1], cal_oa$Mean, tolerance = 1e-10)
waldo::compare(aa$accuracy[2], cal_oa$SE, tolerance = 1e-5)

# UA: YES
# (which need be estimated by row of contingency table which means by "map")
cal_ua <- ReGenesees::svystatTM(
design = cal,
y = ~ ref,
by = ~ map,
estimator = "Mean",
conf.int = TRUE,
deff = TRUE)
dim(cal_ua)
# extract relevant estimates
ua_est <- diag(as.matrix(cal_ua[1:25, 2:26]))
ua_se <- diag(as.matrix(cal_ua[1:25, 2:26 + 25]))
ua_cil <- diag(as.matrix(cal_ua[1:25, 2:26 + 25 + 25]))
ua_ciu <- diag(as.matrix(cal_ua[1:25, 2:26 + 25 + 25 + 25]))
cal_ua <- cal_ua |>
as_tibble() |>
select(map) |>
mutate(
ua_est = ua_est, ua_se = ua_se, ua_cil = ua_cil, ua_ciu = ua_ciu
)
waldo::compare(aa$stats$ua, cal_ua$ua_est, tolerance = 1e-10)
waldo::compare(aa$stats$ua_se, cal_ua$ua_se, tolerance = 1e-2) # more conservative
hist(aa$stats$ua_se - cal_ua$ua_se)
plot(aa$stats$ua_se, cal_ua$ua_se)
abline(0, 1)

# PA
cal_pa <- ReGenesees::svystatTM(
design = cal,
y = ~ map,
by = ~ ref,
estimator = "Mean",
conf.int = TRUE,
deff = TRUE)
dim(cal_pa)
# extract relevant estimates
pa_est <- diag(as.matrix(cal_pa[1:25, 2:26]))
pa_se <- diag(as.matrix(cal_pa[1:25, 2:26 + 25]))
pa_cil <- diag(as.matrix(cal_pa[1:25, 2:26 + 25 + 25]))
pa_ciu <- diag(as.matrix(cal_pa[1:25, 2:26 + 25 + 25 + 25]))
cal_pa <- cal_pa |>
as_tibble() |>
select(ref) |>
mutate(
pa_est = pa_est, pa_se = pa_se, pa_cil = pa_cil, pa_ciu = pa_ciu
)
waldo::compare(aa$stats$pa, cal_pa$pa_est, tolerance = 1e-10)
waldo::compare(aa$stats$pa_se, cal_pa$pa_se, tolerance = 1e-2)
hist(aa$stats$pa_se - cal_pa$pa_se)
plot(aa$stats$pa_se, cal_pa$pa_se)
abline(0, 1)

areas_df %>%
inner_join(
aa$stats, by = join_by(class)
) %>%
separate(
class,
c("class_p1", "class_p2"),
Expand All @@ -322,7 +506,13 @@ areas_df %>%
) %>%
mutate(change = class_p1 != class_p2) %>%
mutate(
class = reorder(class, area_est_ha)
class = reorder(
sprintf(
"%s\n(n = %s; ua = %s; pa = %s)",
class, n_points, round(ua, 2), round(pa, 2)
),
area_est_ha
)
) %>%
ggplot() +
geom_pointrange(
Expand Down

0 comments on commit d21fb8f

Please sign in to comment.