Merge pull request #377 from ecmerkle/master

add option allow.empty.cell
yrosseel · Aug 5, 2024 · 56b8a11 · 56b8a11
2 parents 6606f50 + fa2e74a
commit 56b8a11
Show file tree

Hide file tree

Showing 7 changed files with 71 additions and 15 deletions.
diff --git a/R/lav_data.R b/R/lav_data.R
@@ -45,6 +45,9 @@ lavData <- function(data = NULL, # data.frame
     level.label <- character(0L)
   }
 
+  # allow empty categories of ordinal variable
+  allow.empty.cell <- lavoptions$allow.empty.cell
+
   # block.labels
   block.label <- character(0L)
   if (length(group.label) > 0L && length(level.label) == 0L) {
@@ -147,7 +150,8 @@ lavData <- function(data = NULL, # data.frame
       ov.names.l = ov.names.l,
       std.ov = std.ov,
       missing = missing,
-      allow.single.case = allow.single.case
+      allow.single.case = allow.single.case,
+      allow.empty.cell = allow.empty.cell
     )
     sample.cov <- NULL # not needed, but just in case
   }
@@ -504,7 +508,8 @@ lav_data_full <- function(data = NULL, # data.frame
                           ov.names.l = list(), # var per level
                           std.ov = FALSE, # standardize ov's?
                           missing = "listwise", # remove missings?
-                          allow.single.case = FALSE # allow single case?
+                          allow.single.case = FALSE, # allow single case?
+                          allow.empty.cell = FALSE
 ) {
   # number of groups and group labels
   if (!is.null(group) && length(group) > 0L) {
@@ -552,6 +557,9 @@ lav_data_full <- function(data = NULL, # data.frame
     group <- character(0L)
   }
 
+  # ensure allow.empty.cell is logical
+  if (is.null(allow.empty.cell)) allow.empty.cell <- FALSE
+
   # sampling weights
   if (!is.null(sampling.weights)) {
     if (is.character(sampling.weights)) {
@@ -708,7 +716,7 @@ lav_data_full <- function(data = NULL, # data.frame
   ov <- lav_dataframe_vartable(
     frame = data, ov.names = ov.names,
     ov.names.x = ov.names.x, ordered = ordered,
-    as.data.frame. = FALSE
+    as.data.frame. = FALSE, allow.empty.cell = allow.empty.cell
   )
 
   # do some checking
@@ -829,6 +837,7 @@ lav_data_full <- function(data = NULL, # data.frame
   weights <- vector("list", length = ngroups)
 
   # collect information per upper-level group
+  datam <- data.matrix(data)
   for (g in 1:ngroups) {
     # extract variables in correct order
     if (nlevels > 1L) {
@@ -895,7 +904,7 @@ lav_data_full <- function(data = NULL, # data.frame
     }
 
     # extract data
-    X[[g]] <- data.matrix(data[case.idx[[g]], ov.idx, drop = FALSE])
+    X[[g]] <- datam[case.idx[[g]], ov.idx, drop = FALSE]
     dimnames(X[[g]]) <- NULL ### copy?
 
     # sampling weights (but no normalization yet)
@@ -924,7 +933,7 @@ lav_data_full <- function(data = NULL, # data.frame
     if (length(user.ordered.idx) > 0L) {
       for (i in user.ordered.idx) {
         X[[g]][, i][is.na(X[[g]][, i])] <- NA # change NaN to NA
-        X[[g]][, i] <- as.numeric(as.factor(X[[g]][, i]))
+        if (!allow.empty.cell) X[[g]][, i] <- as.numeric(as.factor(X[[g]][, i]))
         # possible alternative to the previous two lines:
         # X[[g]][,i] <- as.numeric(factor(X[[g]][,i], exclude = c(NA, NaN)))
       }

diff --git a/R/lav_dataframe.R b/R/lav_dataframe.R
@@ -12,7 +12,8 @@ lav_dataframe_vartable <- function(frame = NULL, ov.names = NULL,
                                    ov.names.x = NULL,
                                    ordered = NULL,
                                    factor = NULL,
-                                   as.data.frame. = FALSE) {
+                                   as.data.frame. = FALSE,
+                                   allow.empty.cell = FALSE) {
   if (missing(ov.names)) {
     var.names <- names(frame)
   } else {
@@ -58,9 +59,18 @@ lav_dataframe_vartable <- function(frame = NULL, ov.names = NULL,
     # handle ordered/factor
     if (!is.null(ordered) && var.names[i] %in% ordered) {
       type.x <- "ordered"
-      lev <- sort(unique(x)) # we assume integers!
-      nlev[i] <- length(lev)
-      lnam[i] <- paste(lev, collapse = "|")
+      if (allow.empty.cell) {
+        nlev[i] <- max(as.numeric(x))
+        if (inherits(x, 'factor')) {
+          lnam[i] <- paste(levels(x), collapse = "|")
+        } else {
+          lnam[i] <- paste(1:nlev[i], collapse = "|")
+        }
+      } else {
+        lev <- sort(unique(x)) # we assume integers!
+        nlev[i] <- length(lev)
+        lnam[i] <- paste(lev, collapse = "|")
+      }
       user[i] <- 1L
     } else if (!is.null(factor) && var.names[i] %in% factor) {
       type.x <- "factor"

diff --git a/R/lav_muthen1984.R b/R/lav_muthen1984.R
@@ -24,6 +24,7 @@ muthen1984 <- function(Data = NULL,
                        zero.keep.margins = TRUE,
                        zero.cell.warn = FALSE,
                        zero.cell.tables = TRUE,
+                       allow.empty.cell = TRUE,
                        group = 1L) { # group only for error messages
 
   # just in case Data is a vector
@@ -58,7 +59,7 @@ muthen1984 <- function(Data = NULL,
   step1 <- lav_samplestats_step1(
     Y = Data, wt = wt, ov.names = ov.names,
     ov.types = ov.types, ov.levels = ov.levels, ov.names.x = ov.names.x,
-    eXo = eXo, scores.flag = WLS.W, group = group
+    eXo = eXo, scores.flag = WLS.W, allow.empty.cell = allow.empty.cell, group = group
   )
 
   FIT <- step1$FIT

diff --git a/R/lav_options_default.R b/R/lav_options_default.R
@@ -417,6 +417,7 @@ lav_options_default <- function() {
       nm = "[0, 1]", oklen = c(1L, -2L))
   elm("zero.keep.margins", "default", chr = "default", bl = TRUE)
   elm("zero.cell.warn", FALSE, bl = TRUE) # since 0.6-1
+  elm("allow.empty.cell", FALSE, bl = TRUE) # since 0.6-19
   elm("cat.wls.w", TRUE, bl = TRUE) # since 0.6-18
 
   # starting values (char values checked in lav_options_set())

diff --git a/R/lav_samplestats.R b/R/lav_samplestats.R
@@ -27,6 +27,7 @@ lav_samplestats_from_data <- function(lavdata = NULL,
   zero.add <- lavoptions$zero.add
   zero.keep.margins <- lavoptions$zero.keep.margins
   zero.cell.warn <- lavoptions$zero.cell.warn
+  allow.empty.cell <- lavoptions$allow.empty.cell
   dls.a <- lavoptions$estimator.args$dls.a
   dls.GammaNT <- lavoptions$estimator.args$dls.GammaNT
 
@@ -299,7 +300,8 @@ lav_samplestats_from_data <- function(lavdata = NULL,
           zero.add = zero.add,
           zero.keep.margins = zero.keep.margins,
           zero.cell.warn = FALSE,
-          zero.cell.tables = TRUE
+          zero.cell.tables = TRUE,
+          allow.empty.cell = allow.empty.cell
         )
       } else {
         CAT <- muthen1984(
@@ -315,7 +317,8 @@ lav_samplestats_from_data <- function(lavdata = NULL,
           zero.add = zero.add,
           zero.keep.margins = zero.keep.margins,
           zero.cell.warn = FALSE,
-          zero.cell.tables = TRUE
+          zero.cell.tables = TRUE,
+          allow.empty.cell = allow.empty.cell
         )
       }
       lav_verbose(current.verbose)

diff --git a/R/lav_samplestats_step1.R b/R/lav_samplestats_step1.R
@@ -6,6 +6,7 @@ lav_samplestats_step1 <- function(Y,
                                   ov.names.x = character(0L),
                                   eXo = NULL,
                                   scores.flag = TRUE, # scores?
+                                  allow.empty.cell = TRUE, # allow empty categories?
                                   group = 1L) { # for error message
 
 
@@ -73,13 +74,13 @@ lav_samplestats_step1 <- function(Y,
       # check if we have enough categories in this group
       # FIXME: should we more tolerant here???
       y.freq <- tabulate(Y[, i], nbins = ov.levels[i])
-      if (length(y.freq) != ov.levels[i]) {
+      if (length(y.freq) != ov.levels[i] & !allow.empty.cell) {
         lav_msg_stop(gettextf(
           "variable %1$s has fewer categories (%2$s) than
           expected (%3$s) in group %4$s", ov.names[i],
           length(y.freq), ov.levels[i], group))
       }
-      if (any(y.freq == 0L)) {
+      if (any(y.freq == 0L) & !allow.empty.cell) {
         lav_msg_stop(gettextf(
           "some categories of variable `%1$s' are empty in group %2$s;
           frequencies are [%3$s]", ov.names[i], group,
@@ -93,7 +94,35 @@ lav_samplestats_step1 <- function(Y,
       }
       FIT[[i]] <- fit
       TH[[i]] <- fit$theta[fit$th.idx]
-      TH.NOX[[i]] <- lav_uvord_th(y = Y[, i], wt = wt)
+      fit.nox <- lav_uvord_th(y = Y[, i], wt = wt)
+      if (allow.empty.cell) {
+        if (any(y.freq == 0L)) {
+          nzidx <- y.freq != 0L
+          exidx <- (nzidx[1:(ov.levels[i] - 1)] * nzidx[2:ov.levels[i]]) == 1
+          misidx <- (nzidx[1:(ov.levels[i] - 1)] * nzidx[2:ov.levels[i]]) == 0
+          TH[[i]] <- TH.NOX[[i]] <- rep(0, ov.levels[i] - 1)
+          TH[[i]][exidx] <- fit$theta[fit$th.idx]
+          TH.NOX[[i]][exidx] <- fit.nox[exidx]
+
+          for (k in which(misidx)) {
+            if (k == 1) {
+              TH[[i]][k] <- -4
+              TH.NOX[[i]][k] <- -4
+            } else if (k == (ov.levels[i] - 1)) {
+              TH[[i]][k] <- 4
+              TH.NOX[[i]][k] <- 4
+            } else {
+              TH[[i]][k] <- TH[[i]][(k - 1)] + .01
+              TH.NOX[[i]][k] <- TH.NOX[[i]][(k - 1)] + .01
+            }
+          }
+        } else if (length(y.freq) != ov.levels[i]) {
+          nz <- ov.levels[i] - length(y.freq)
+          TH[[i]] <- c(TH[[i]], TH[[i]][length(y.freq)] + (1:nz) * .01)
+        }
+      } else {
+        TH.NOX[[i]] <- fit.nox
+      }
       if (scores.flag) {
         scores <- lav_uvord_scores(y = Y[, i], X = eXo, wt = wt)
         SC.TH[, th.idx] <- scores[, fit$th.idx, drop = FALSE]

diff --git a/man/lavOptions.Rd b/man/lavOptions.Rd
@@ -458,6 +458,9 @@ Categorical estimation options:
     \item{\code{zero.cell.warn}:}{Logical. Only used if some observed 
       endogenous variables are categorical. If \code{TRUE}, give a warning if 
       one or more cells of a bivariate frequency table are empty.}
+    \item{\code{allow.empty.cell}:}{Logical. If \code{TRUE}, ignore
+      situations where an ordinal variable has fewer categories than
+      expected, or where a category is empty in a specific group.}
 }
 
 Starting values options: