From 48587275921671c4d296a778d967b12c88de299e Mon Sep 17 00:00:00 2001 From: Ben Goodrich Date: Sat, 16 Jul 2016 15:22:04 -0400 Subject: [PATCH 01/12] introduce stan_lmList function this is based on lme4::lmList but is restricted to linear models and does partial pooling rather than no pooling --- NAMESPACE | 1 + R/loo.R | 11 ++- R/misc.R | 7 ++ R/posterior_predict.R | 9 ++- R/pp_data.R | 27 +++++-- R/priors.R | 6 +- R/rstanarm-package.R | 17 ++-- R/stan_biglm.R | 22 +++++- R/stan_biglm.fit.R | 89 +++++++++++++++------ R/stan_lm.fit.R | 1 + R/stan_lmList.R | 107 ++++++++++++++++++++++++++ R/stanreg-methods.R | 41 +++++++++- cleanup | 2 +- cleanup.win | 2 +- exec/lm.stan | 84 +++++++++++++++++++- man-roxygen/args-kappa_mean.R | 6 ++ tests/testthat/test_stan_functions.R | 43 +++++++++++ tests/testthat/test_stan_lm.R | 5 ++ vignettes/children/SETTINGS-rstan.txt | 1 + vignettes/lm.Rmd | 105 +++++++++++++++++++++---- 20 files changed, 518 insertions(+), 68 deletions(-) create mode 100644 R/stan_lmList.R create mode 100644 man-roxygen/args-kappa_mean.R diff --git a/NAMESPACE b/NAMESPACE index 9b471cc6a..ecfd4b26c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -76,6 +76,7 @@ export(stan_glmer.nb) export(stan_lm) export(stan_lm.fit) export(stan_lm.wfit) +export(stan_lmList) export(stan_lmer) export(stan_polr) export(stan_polr.fit) diff --git a/R/loo.R b/R/loo.R index 72783b3e5..38ca32b8e 100644 --- a/R/loo.R +++ b/R/loo.R @@ -362,7 +362,7 @@ ll_fun <- function(x) { f <- family(x) if (!is(f, "family") || is_scobit(x)) return(.ll_polr_i) - + if (is.lmList(x)) return(.ll_gaussian_lmList_i) get(paste0(".ll_", f$family, "_i")) } @@ -390,7 +390,7 @@ ll_args <- function(object, newdata = NULL) { if (is(f, "family") && !is_scobit(object)) { fname <- f$family if (!is.binomial(fname)) { - data <- data.frame(y, x) + data <- data.frame(y, as.matrix(x)) } else { if (NCOL(y) == 2L) { trials <- rowSums(y) @@ -405,7 +405,9 @@ ll_args <- function(object, newdata = NULL) { } draws$beta <- stanmat[, seq_len(ncol(x)), drop = FALSE] if (is.gaussian(fname)) - draws$sigma <- stanmat[, "sigma"] + draws$sigma <- stanmat[, grep("^sigma", colnames(stanmat))] + if (is.lmList(object)) + draws$groups <- as.integer(object$groups) if (is.gamma(fname)) draws$shape <- stanmat[, "shape"] if (is.ig(fname)) @@ -492,6 +494,9 @@ ll_args <- function(object, newdata = NULL) { val <- dnorm(data$y, mean = .mu(data,draws), sd = draws$sigma, log = TRUE) .weighted(val, data$weights) } +.ll_gaussian_lmList_i <- function(i, data, draws) { + val <- dnorm(data$y, mean = .mu(data,draws), sd = draws$sigma[,draws$group[i]], log = TRUE) +} .ll_binomial_i <- function(i, data, draws) { val <- dbinom(data$y, size = data$trials, prob = .mu(data,draws), log = TRUE) .weighted(val, data$weights) diff --git a/R/misc.R b/R/misc.R index bfb10d332..6e2bf6ff1 100644 --- a/R/misc.R +++ b/R/misc.R @@ -138,6 +138,13 @@ is.mer <- function(x) { isTRUE(check1 && check2) } +# Test if stanreg object used stan_lmList +# +# @param x A stanreg object. +is.lmList <- function(x) { + inherits(x, "lmList") +} + # Consistent error message to use when something is only available for # models fit using MCMC # diff --git a/R/posterior_predict.R b/R/posterior_predict.R index 81446d0cb..b66a46cf6 100644 --- a/R/posterior_predict.R +++ b/R/posterior_predict.R @@ -159,6 +159,7 @@ posterior_predict <- function(object, newdata = NULL, draws = NULL, else ppargs <- pp_args(object, data = pp_eta(object, dat, draws)) if (!is(object, "polr") && is.binomial(family(object)$family)) ppargs$trials <- pp_binomial_trials(object, newdata) + if (inherits(object, "lmList")) ppargs$group <- object$group ppfun <- pp_fun(object) ytilde <- do.call(ppfun, ppargs) @@ -175,6 +176,7 @@ posterior_predict <- function(object, newdata = NULL, draws = NULL, # functions to draw from the various posterior predictive distributions pp_fun <- function(object) { suffix <- if (is(object, "polr")) "polr" else family(object)$family + if (inherits(object, "lmList")) suffix <- paste0(suffix, "_grouped") get(paste0(".pp_", suffix), mode = "function") } @@ -183,6 +185,11 @@ pp_fun <- function(object) { rnorm(ncol(mu), mu[s,], sigma[s]) })) } +.pp_gaussian_grouped <- function(mu, sigma, group) { + t(sapply(1:nrow(mu), function(s) { + rnorm(ncol(mu), mu[s,], sigma[s,group]) + })) +} .pp_binomial <- function(mu, trials) { t(sapply(1:nrow(mu), function(s) { rbinom(ncol(mu), size = trials, prob = mu[s, ]) @@ -260,7 +267,7 @@ pp_args <- function(object, data) { args <- list(mu = inverse_link(eta)) famname <- family(object)$family if (is.gaussian(famname)) { - args$sigma <- stanmat[, "sigma"] + args$sigma <- stanmat[, grep("^sigma", colnames(stanmat))] } else if (is.gamma(famname)) { args$shape <- stanmat[, "shape"] } else if (is.ig(famname)) { diff --git a/R/pp_data.R b/R/pp_data.R index 7c80cf6ea..e70ca89db 100644 --- a/R/pp_data.R +++ b/R/pp_data.R @@ -31,13 +31,26 @@ pp_data <- function(object, newdata = NULL, re.form = NULL, ...) { return(nlist(x, offset)) } tt <- terms(object) - Terms <- delete.response(tt) - m <- model.frame(Terms, newdata, xlev = object$xlevels) - if (!is.null(cl <- attr(Terms, "dataClasses"))) - .checkMFClasses(cl, m) - x <- model.matrix(Terms, m, contrasts.arg = object$contrasts) - if (is(object, "polr") && !is_scobit(object)) - x <- x[,colnames(x) != "(Intercept)", drop = FALSE] + if (inherits(object, "lmList")) { + f <- as.character(object$formula) + f <- as.formula(paste(f[2], "~", "-1 + (", f[3], ")")) + g <- glFormula(f, newdata) + modelframe <- g$fr + x <- t(g$reTrms$Zt) + group_names <- levels(object$groups) + K <- NCOL(x) / length(group_names) + colnames(x) <- c(sapply(group_names, FUN = function(g) paste0(1:K, ":", g))) + x <- x[,sort(colnames(x))] + } + else { + Terms <- delete.response(tt) + m <- model.frame(Terms, newdata, xlev = object$xlevels) + if (!is.null(cl <- attr(Terms, "dataClasses"))) + .checkMFClasses(cl, m) + x <- model.matrix(Terms, m, contrasts.arg = object$contrasts) + if (is(object, "polr") && !is_scobit(object)) + x <- x[,colnames(x) != "(Intercept)", drop = FALSE] + } offset <- rep(0, nrow(x)) if (!is.null(off.num <- attr(tt, "offset"))) { for (i in off.num) { diff --git a/R/priors.R b/R/priors.R index edc5cc142..def4d49c0 100644 --- a/R/priors.R +++ b/R/priors.R @@ -400,12 +400,14 @@ make_eta <- function(location, what = c("mode", "mean", "median", "log"), K) { half_K <- K / 2 if (what == "mode") { stopifnot(location > 0, location <= 1) - if (K <= 2) - stop(paste("R2 prior error.", + if (K <= 2) { + if (location == 0.5 && K == 2) what <- "mean" + else stop(paste("R2 prior error.", "The mode of the beta distribution does not exist", "with fewer than three predictors.", "Specify 'what' as 'mean', 'median', or 'log' instead."), call. = FALSE) + } eta <- (half_K - 1 - location * half_K + location * 2) / location } else if (what == "mean") { stopifnot(location > 0, location < 1) diff --git a/R/rstanarm-package.R b/R/rstanarm-package.R index 9456f9cc5..a4e380e57 100644 --- a/R/rstanarm-package.R +++ b/R/rstanarm-package.R @@ -116,11 +116,14 @@ #' overview: #' #' \describe{ -#' \item{\code{\link{stan_lm}}, \code{stan_aov}, \code{stan_biglm}}{ -#' Similar to \code{\link[stats]{lm}} or \code{\link[stats]{aov}} but with -#' novel regularizing priors on the model parameters that are driven by prior -#' beliefs about \eqn{R^2}, the proportion of variance in the outcome -#' attributable to the predictors in a linear model. +#' \item{\code{\link{stan_lm}}, \code{stan_aov}, \code{stan_biglm}, \code{stan_lmList}}{ +#' Similar to \code{\link[stats]{lm}} but with novel regularizing priors on the model +#' parameters that are driven by prior beliefs about \eqn{R^2}, the proportion of +#' variance in the outcome attributable to the predictors in a linear model. The +#' \code{stan_aov} variant estimates ANOVA models, the \code{stan_biglm} variant +#' estimates linear models when the design matrix is too large to fit into memory, and +#' the \code{stan_lmList} variant allows all the parameters to vary by group but utilizes +#' hyperpriors to shrink them toward common parameters, much like \code{stan_lmer}. #' } #' \item{\code{\link{stan_glm}}, \code{stan_glm.nb}}{ #' Similar to \code{\link[stats]{glm}} but with Gaussian, Student t, Cauchy @@ -139,7 +142,9 @@ #' a highly-structured but unknown covariance matrix (for which \pkg{rstanarm} #' introduces an innovative prior distribution). MCMC provides more #' appropriate estimates of uncertainty for models that consist of a mix of -#' common and group-specific parameters. +#' common and group-specific parameters. The same line or argument applies even +#' more in the case of \code{stan_lmList} mentioned in the first stanza of this +#' section. #' } #' \item{\code{\link{stan_gamm4}}}{ #' Similar to \code{\link[gamm4]{gamm4}} in the \pkg{gamm4} package, which diff --git a/R/stan_biglm.R b/R/stan_biglm.R index 9b9f9547a..0b5a2dd66 100644 --- a/R/stan_biglm.R +++ b/R/stan_biglm.R @@ -27,11 +27,17 @@ #' intercept and must utilize \emph{centered} but not \emph{standardized} #' predictors. See the Details section or the Example. #' @param xbar A numeric vector of means in the implicit design matrix for -#' the observations included in the model +#' the observations included in the model or --- in the case of +#' \code{stan_biglm.fit} only --- a list of such vectors with one list element +#' for each group #' @param ybar A numeric scalar indicating the same mean of the outcome for -#' the observations included in the model +#' the observations included in the model or --- in the case of +#' \code{stan_biglm.fit} only --- a numeric vector of such means with one +#' element for each group #' @param s_y A numeric scalar indicating the unbiased sample standard deviation -#' of the outcome for the observations included in the model +#' of the outcome for the observations included in the model or --- in the case +#' of \code{stan_biglm.fit} only --- a numeric vector of such standard deviations +#' with one element for each group #' @param has_intercept A logical scalar indicating whether to add an intercept #' to the model when estimating it #' @template args-dots @@ -69,6 +75,14 @@ #' The sample mean and sample standard deviation of the outcome must also #' be passed. #' +#' The \code{stan_biglm} function calls \code{stan_biglm.fit}, although the +#' latter can be called directly. The first seven arguments to the +#' \code{stan_biglm.fit} may be lists of the same length where the list length +#' is equal to the number of mutually exclusive and exhaustive groups that +#' the data have been stratified by. The \code{\link{stan_lmList}} function +#' provides a more conventional interface for a stratified linear regression +#' model and calls \code{stan_biglm.fit} internally. +#' #' @return The output of both \code{stan_biglm} and \code{stan_biglm.fit} is an object of #' \code{\link[rstan]{stanfit-class}} rather than \code{\link{stanreg-objects}}, #' which is more limited and less convenient but necessitated by the fact that @@ -98,7 +112,7 @@ stan_biglm <- function(biglm, xbar, ybar, s_y, has_intercept = TRUE, ..., R <- sqrt(biglm$qr$D) * R return(stan_biglm.fit(b, R, SSR = biglm$qr$ss, N = biglm$n, xbar, ybar, s_y, has_intercept, ..., - prior = prior, prior_intercept = prior_intercept, + prior = prior, prior_intercept = prior_intercept, kappa_mean = 0, prior_PD = prior_PD, algorithm = algorithm, adapt_delta = adapt_delta)) } diff --git a/R/stan_biglm.fit.R b/R/stan_biglm.fit.R index 655e42bb2..bfe327c5e 100644 --- a/R/stan_biglm.fit.R +++ b/R/stan_biglm.fit.R @@ -17,10 +17,17 @@ #' @rdname stan_biglm #' @export -#' @param b A numeric vector of OLS coefficients, excluding the intercept -#' @param R A square upper-triangular matrix from the QR decomposition of the design matrix +#' @param b A numeric vector of OLS coefficients --- excluding the intercept --- +#' or a list of such vectors with one list element for each group +#' @param R A square upper-triangular matrix from the QR decomposition of the +#' centered design matrix or a list of such matrices with one list element for +#' each group #' @param SSR A numeric scalar indicating the sum-of-squared residuals for OLS -#' @param N A integer scalar indicating the number of included observations +#' or a numeric vector of sums-of-squared residuals with one element for each +#' group +#' @param N A integer scalar indicating the number of included observations or +#' an integer array of sample sizes with one element for each group +#' @template args-kappa_mean #' @examples #' # create inputs #' ols <- lm(mpg ~ wt + qsec + am - 1, # next line is critical for centering @@ -39,23 +46,41 @@ #' cbind(lm = b, stan_lm = rstan::get_posterior_mean(post)[14:16]) # shrunk stan_biglm.fit <- function(b, R, SSR, N, xbar, ybar, s_y, has_intercept = TRUE, ..., prior = R2(stop("'location' must be specified")), - prior_intercept = NULL, prior_PD = FALSE, + prior_intercept = NULL, kappa_mean = 1, prior_PD = FALSE, algorithm = c("sampling", "meanfield", "fullrank"), adapt_delta = NULL) { - - J <- 1L - N <- array(N, c(J)) - K <- ncol(R) - cn <- names(xbar) - if (is.null(cn)) cn <- names(b) - R_inv <- backsolve(R, diag(K)) - JK <- c(J, K) - xbarR_inv <- array(c(xbar %*% R_inv), JK) - Rb <- array(R %*% b, JK) - SSR <- array(SSR, J) - s_Y <- array(s_y, J) - center_y <- if (isTRUE(all.equal(matrix(0, J, K), xbar))) ybar else 0.0 - ybar <- array(ybar, J) + if (is.list(b)) { + J <- length(b) + K <- ncol(R[[1]]) + cn <- names(xbar[[1]]) + if (is.null(cn)) cn <- names(b[[1]]) + I <- diag(K) + R_inv <- sapply(R, simplify = FALSE, FUN = backsolve, x = I) + center_y <- 0.0 + xbarR_inv <- Rb <- vector("list", J) + names(xbarR_inv) <- names(Rb) <- names(b) + for (j in 1:J) { + xbarR_inv[[j]] <- c(xbar[[j]] %*% R_inv[[j]]) + Rb[[j]] <- c(R[[j]] %*% b[[j]]) + } + } + else { + J <- 1L + N <- array(N, c(J)) + K <- ncol(R) + cn <- names(xbar) + if (is.null(cn)) cn <- names(b) + I <- diag(K) + R_inv <- backsolve(R, I) + JK <- c(J, K) + xbarR_inv <- array(c(xbar %*% R_inv), JK) + Rb <- array(R %*% b, JK) + SSR <- array(SSR, J) + s_y <- array(s_y, J) + center_y <- if (isTRUE(all.equal(matrix(0, J, K), xbar))) ybar else 0.0 + ybar <- array(ybar, J) + dim(R_inv) <- c(J, dim(R_inv)) + } if (!length(prior)) { prior_dist <- 0L @@ -77,23 +102,28 @@ stan_biglm.fit <- function(b, R, SSR, N, xbar, ybar, s_y, has_intercept = TRUE, if (is.null(prior_scale_for_intercept)) prior_scale_for_intercept <- 0 } - dim(R_inv) <- c(J, dim(R_inv)) # initial values - R2 <- array(1 - SSR[1] / ((N - 1) * s_Y^2), J) + if (J == 1) R2 <- array(1 - SSR[1] / ((N - 1) * s_y^2), J) + else R2 <- 1 - SSR / ((N - 1) * s_y^2) log_omega <- array(0, ifelse(prior_PD == 0, J, 0)) init_fun <- function(chain_id) { out <- list(R2 = R2, log_omega = log_omega) if (has_intercept == 0L) out$z_alpha <- double() + if (J == 1) { + out$SMC <- double() + out$mu <- array(0, c(0, K)) + out$kappa <- double() + } return(out) } stanfit <- stanmodels$lm standata <- nlist(K, has_intercept, prior_dist, prior_dist_for_intercept, prior_mean_for_intercept, - prior_scale_for_intercept, + prior_scale_for_intercept, kappa_mean, prior_PD, eta, J, N, xbarR_inv, - ybar, center_y, s_Y, Rb, SSR, R_inv) + ybar, center_y, s_Y = s_y, Rb, SSR, R_inv) pars <- c(if (has_intercept) "alpha", "beta", "sigma", if (prior_PD == 0) "log_omega", "R2", "mean_PPD") algorithm <- match.arg(algorithm) @@ -109,9 +139,18 @@ stan_biglm.fit <- function(b, R, SSR, N, xbar, ybar, s_y, has_intercept = TRUE, init = init_fun, data = standata, pars = pars, show_messages = FALSE) stanfit <- do.call(sampling, sampling_args) } - new_names <- c(if (has_intercept) "(Intercept)", cn, "sigma", - if (prior_PD == 0) "log-fit_ratio", - "R2", "mean_PPD", "log-posterior") + if (J == 1) new_names <- c(if (has_intercept) "(Intercept)", cn, "sigma", + if (prior_PD == 0) "log-fit_ratio", + "R2", "mean_PPD", "log-posterior") + else { + group_names <- names(b) + new_names <- c(if (has_intercept) paste0("(Intercept):", group_names), + t(sapply(group_names, FUN = function(g) paste0(cn, ":", g))), + paste0("sigma:", group_names), + if (prior_PD == 0) paste0("log-fit_ratio:", group_names), + paste0("R2:", group_names), paste0("mean_PPD:", group_names), + "log-posterior") + } stanfit@sim$fnames_oi <- new_names return(stanfit) } diff --git a/R/stan_lm.fit.R b/R/stan_lm.fit.R index 6eaff7df4..366071f14 100644 --- a/R/stan_lm.fit.R +++ b/R/stan_lm.fit.R @@ -62,6 +62,7 @@ stan_lm.wfit <- function(x, y, w, offset = NULL, singular.ok = TRUE, ..., N = nrow(x), xbar = xbar, ybar = ybar, s_y = sd(y), has_intercept = has_intercept, ..., prior = prior, prior_intercept = prior_intercept, + kappa_mean = 0, prior_PD = prior_PD, algorithm = algorithm, adapt_delta = adapt_delta)) diff --git a/R/stan_lmList.R b/R/stan_lmList.R new file mode 100644 index 000000000..49176f0a1 --- /dev/null +++ b/R/stan_lmList.R @@ -0,0 +1,107 @@ +# Part of the rstanarm package for estimating model parameters +# Copyright (C) 2016 Trustees of Columbia University +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 3 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +#' Bayesian regularized linear but stratified models via Stan +#' +#' This model uses the same likelihood and syntax as the \code{\link[lme4]{lmList}} +#' function in the \pkg{lme4} package, which allows the parameters of a linear +#' regression model to vary according to a mutually exclusive and exhaustive grouping +#' factor, but differs fundamentally in that group-level parameters have hyperpriors +#' that shrink --- to some estimated extent --- the group-level parameters toward +#' common parameters. Thus, \code{stan_lmList} is more like a \code{\link[lme4]{lmer}} +#' model with a sinking grouping variable that pertains too all the regression +#' coefficients and the intercept (and, unlike \code{\link[lme4]{lmer}}, the +#' standard deviation of the errors). +#' +#' @export +#' @param formula,data,family Same as in \code{\link[lme4]{lmList}} but +#' \code{family} must be omitted or \code{\link[stats]{gaussian}} +#' @param subset,weights,na.action,offset Same as in \code{\link[lme4]{lmList}} +#' but rarely specified +#' @template args-dots +#' @param prior,prior_intercept Same as in \code{\link{stan_lm}} +#' @template args-kappa_mean +#' @template args-prior_PD +#' @template args-algorithm +#' @template args-adapt_delta +#' +#' @examples +#' \dontrun{ +#' post <- stan_lmList(mpg ~ disp + wt + hp | cyl, data = mtcars, +#' prior = R2(0.25)) +#' coef(post) # posterior medians stratified by number of cylinders +#' } +stan_lmList <- function(formula, data, family, subset, weights, na.action, offset, + ..., prior = R2(stop("'location' must be specified")), + prior_intercept = NULL, kappa_mean = 1, prior_PD = FALSE, + algorithm = c("sampling", "meanfield", "fullrank"), + adapt_delta = NULL) { + f <- as.character(formula) + f <- as.formula(paste(f[2], "~", "-1 + (", f[3], ")")) + isGLM <- !missing(family) + if (isGLM) stop("GLMs are not currently supported so omit the 'family' argument") + mCall <- match.call(expand.dots = FALSE) + mCall$prior <- mCall$prior_intercept <- mCall$prior_PD <- mCall$algorithm <- + mCall$adapt_delta <- mCall$`...` <- NULL + mCall$pool <- FALSE + mCall$y <- TRUE + mCall[[1L]] <- quote(lme4::lmList) + lms <- eval.parent(mCall) + group_names <- levels(lms@groups) + has_intercept <- identical(attr(lms@.Data[[1L]]$terms, "intercept"), 1L) + if (has_intercept) { + b <- lapply(lms@.Data, FUN = function(ols) coef(ols)[-1L]) + X <- lapply(lms@.Data, FUN = function(ols) model.matrix(ols)[,-1L, drop = FALSE]) + } + else { + b <- lapply(lms@.Data, FUN = coef) + X <- lapply(lms@.Data, FUN = model.matrix) + } + names(b) <- names(X) <- group_names + R <- sapply(X, simplify = FALSE, FUN = function(x) { + qr.R(qr(sweep(x, MARGIN = 2, STATS = colMeans(x)))) + }) + SSR <- sapply(lms@.Data, FUN = function(ols) crossprod(residuals(ols))[1]) + names(SSR) <- group_names + N <- sapply(X, FUN = NROW) + xbar <- sapply(X, simplify = FALSE, FUN = colMeans) + ybar <- sapply(lms@.Data, FUN = function(ols) mean(ols$y)) + s_y <- sapply(lms@.Data, FUN = function(ols) sd(ols$y)) + names(ybar) <- names(s_y) <- group_names + algorithm <- match.arg(algorithm) + stanfit <- stan_biglm.fit(b, R, SSR, N, xbar, ybar, s_y, has_intercept, ..., + prior = prior, prior_intercept = prior_intercept, + kappa_mean = kappa_mean, prior_PD = prior_PD, + algorithm = algorithm, adapt_delta = adapt_delta) + g <- glFormula(f, data) + modelframe <- g$fr + Z <- t(g$reTrms$Zt) + K <- NCOL(X[[1]]) + has_intercept + colnames(Z) <- c(sapply(group_names, FUN = function(g) paste0(1:K, ":", g))) + Z <- Z[,sort(colnames(Z))] + fit <- nlist(stanfit, family = gaussian(), formula, offset = NULL, weights = NULL, + x = Z, y = modelframe[,1], data = if (missing("data")) environment(formula) else data, + prior.info = prior, + algorithm, call = match.call(), terms = attr(g$fr, "terms"), + model = NULL, + na.action = attr(modelframe, "na.action"), + contrasts = attr(modelframe, "contrasts")) + out <- stanreg(fit) + out$groups <- lms@groups + class(out) <- c("stanreg", "lmList") + return(out) +} diff --git a/R/stanreg-methods.R b/R/stanreg-methods.R index 577407ca8..298e89b83 100644 --- a/R/stanreg-methods.R +++ b/R/stanreg-methods.R @@ -85,7 +85,16 @@ NULL coef.stanreg <- function(object, ...) { if (is.mer(object)) return(coef_mer(object, ...)) - + if (is.lmList(object)) { + J <- nlevels(object$groups) + beta <- object$coefficients + mark <- grep("(Intercept)", names(beta), fixed = TRUE) + alpha <- beta[mark] + mat <- cbind(alpha, matrix(beta[-mark], nrow = J, byrow = TRUE)) + rownames(mat) <- levels(object$groups) + colnames(mat) <- unique(gsub(":.*$", "", names(beta))) + return(mat) + } object$coefficients } @@ -272,8 +281,17 @@ coef_mer <- function(object, ...) { #' @export #' @export fixef #' @importFrom lme4 fixef -#' fixef.stanreg <- function(object, ...) { + if (is.lmList(object)) { + means <- get_posterior_mean(object$stanfit) + means <- means[,ncol(means)] + nms <- names(means) + J <- nlevels(object$groups) + FE <- colMeans(cbind(means[grep("^alpha", nms)], + matrix(means[grep("^beta", nms)], nrow = J, byrow = TRUE))) + names(FE) <- colnames(coef(object)) + return(FE) + } coefs <- object$coefficients coefs[b_names(names(coefs), invert = TRUE)] } @@ -293,6 +311,20 @@ ngrps.stanreg <- function(object, ...) { #' @importFrom lme4 ranef #' ranef.stanreg <- function(object, ...) { + if (is.lmList(object)) { + means <- get_posterior_mean(object$stanfit) + means <- means[,ncol(means)] + nms <- names(means) + J <- nlevels(object$groups) + effs <- cbind(means[grep("^alpha", nms)], + matrix(means[grep("^beta", nms)], nrow = J, byrow = TRUE)) + colnames(effs) <- colnames(coef(object)) + effs <- as.data.frame(sweep(effs, MARGIN = 2, STATS = colMeans(effs))) + attr(effs, "label") <- "Random effects" + attr(effs, "standardized") <- FALSE + class(effs) <- "ranef.lmList" + return(effs) + } all_names <- if (used.optimizing(object)) rownames(object$stan_summary) else object$stanfit@sim$fnames_oi sel <- b_names(all_names) @@ -327,6 +359,11 @@ ranef.stanreg <- function(object, ...) { #' importFrom(lme4,sigma) #' sigma.stanreg <- function(object, ...) { + if (is.lmList(object)) { + message("this could take a little while to simulate") + return(sqrt(mean( (posterior_predict(object) - + as.matrix(posterior_linpred(object)))^2 ))) + } if (!("sigma" %in% rownames(object$stan_summary))) return(1) diff --git a/cleanup b/cleanup index bf4430d39..e94582df8 100755 --- a/cleanup +++ b/cleanup @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/sh -e # Note to Windows users: This is not actually platform specific. mkdir -p src/include diff --git a/cleanup.win b/cleanup.win index bf4430d39..e94582df8 100755 --- a/cleanup.win +++ b/cleanup.win @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/sh -e # Note to Windows users: This is not actually platform specific. mkdir -p src/include diff --git a/exec/lm.stan b/exec/lm.stan index 1868363ab..59da1d8c2 100644 --- a/exec/lm.stan +++ b/exec/lm.stan @@ -23,6 +23,72 @@ functions { N * (log(sigma) + 0.91893853320467267); return target(); } + + /** + * Modded Bessel function of the first kind in log units + * @param x Real non-negative scalar + * @param v Real non-negative order but may be integer + */ + real log_besselI(real x, real v) { + real log_half_x; + real lfac; + real lgam; + real lcons; + real biggest; + real piece; + real smallest; + real summand; + int m; + if (x < 0) reject("x is assumed to be non-negative") + if (v < 0) reject("v is assumed to be non-negative"); + log_half_x = log(0.5 * x); + lfac = 0; + lgam = lgamma(v + 1); + lcons = v * log_half_x; + biggest = -lgam + lcons; + piece = positive_infinity(); + smallest = -745.13321910194122211; // exp(smallest) > 0 minimally + summand = 0.0; + m = 1; + while (piece > smallest) { + lfac = lfac + log(m); + lgam = lgam + log(v + m); + lcons = lcons + 2 * log_half_x; + piece = -lfac - lgam + lcons - biggest; + summand = summand + exp(piece); + m = m + 1; + } + return biggest + log1p(summand); + } + + /** + * von Mises-Fisher distribution in log units + * @param u J-array of unit vectors + * @param mu Unit vector that is the expectation for u + * @param kappa Non-negative concentration parameter + */ + real VMF(vector[] u, vector mu, real kappa) { + int p; + real half_p; + real half_pm1; + int J; + real out; + p = rows(mu); + if (p < 2) reject("p must be >= 2"); + half_p = 0.5 * p; + J = size(u); + if (kappa == 0) return -J * half_p * 1.8378770664093453391; + half_pm1 = half_p - 1; + out = 0; + for (j in 1:J) out = out + dot_product(mu, u[j]); + out = out * kappa; + if (p == 2) + return J * (-1.8378770664093453391 - log(modified_bessel_first_kind(0, kappa))) + out; + if (p == 3) + return J * (log(kappa) - log(12.566370614359172464 * sinh(kappa))) + out; + return J * ( half_pm1 * log(kappa) - half_p * 1.8378770664093453391 + - log_besselI(kappa, half_pm1) ) + out; + } } data { int has_intercept; // 0 = no, 1 = yes @@ -30,6 +96,7 @@ data { real prior_scale_for_intercept; // 0 = by CLT real prior_mean_for_intercept; // expected value for alpha int prior_dist; // 0 = uniform for R^2, 1 = Beta(K/2,eta) + real kappa_mean; // prior expectation of concentration parameter int prior_PD; // 0 = no, 1 = yes to drawing from the prior real eta; // shape hyperparameter @@ -59,6 +126,9 @@ parameters { // must not call with init="0" unit_vector[K] u[J]; // primitives for coefficients real z_alpha[J * has_intercept]; // primitives for intercepts real R2[J]; // proportions of variance explained + real SMC[J > 1]; // proportion of variance explained overall + unit_vector[K] mu[J > 1]; // global location parameter + real kappa[J > 1]; // unscaled concentration parameter vector[J * (1 - prior_PD)] log_omega; // under/overfitting factors } transformed parameters { @@ -97,12 +167,22 @@ model { has_intercept == 1 ? alpha[j] + shift : shift, ybar[j], SSR[j], sigma[j], N[j]); } - // implicit: u[j] is uniform on the surface of a hypersphere } if (has_intercept == 1 && prior_dist_for_intercept > 0) target += normal_lpdf(z_alpha | 0, 1); - if (prior_dist == 1) target += beta_lpdf(R2 | half_K, eta); + if (J == 1) { + if (prior_dist == 1) target += beta_lpdf(R2 | half_K, eta); + // implicit: u[1] is uniform on the surface of a hypersphere + } + else { + target += beta_lpdf(R2 | half_K, half_K * (inv(SMC[1]) - 1)); + if (prior_dist == 1) target += beta_lpdf(SMC[1] | half_K, eta); + target += VMF(u, mu[1], kappa_mean * kappa[1]); + // implicit: mu is uniform on the surface of a hyperspere + target += exponential_lpdf(kappa | 1); + } // implicit: log_omega is uniform over the real line for all j + } generated quantities { real mean_PPD[J]; diff --git a/man-roxygen/args-kappa_mean.R b/man-roxygen/args-kappa_mean.R new file mode 100644 index 000000000..b9ed69dbd --- /dev/null +++ b/man-roxygen/args-kappa_mean.R @@ -0,0 +1,6 @@ +#' @param kappa_mean A positive scalar indicating the expectation of the concentration +#' parameter in a von Mises-Fisher distribution where the random variable in question +#' is each group's unit vector underlying its regression coefficients. See the vignette +#' for \code{\link{stan_lm}} for more details. Defaults to \eqn{1} but is ignored if +#' there is only one group. A higher value produces more shrinkage toward the common unit +#' vector while a lower value makes the groups closer to independent. diff --git a/tests/testthat/test_stan_functions.R b/tests/testthat/test_stan_functions.R index f59493bd0..b0d157fda 100644 --- a/tests/testthat/test_stan_functions.R +++ b/tests/testthat/test_stan_functions.R @@ -296,6 +296,49 @@ test_that("ll_mvn_ols_qr_lp returns expected results", { SSR, sigma, N))) }) +context("lm") +test_that("log_besselI returns expected results", { + x <- exp(1) + v <- 10 * pi + expect_equal(log(besselI(x, v)), log_besselI(x, v)) +}) + +context("lm") +test_that("VMF returns expected results", { + dVMF <- function(u, mu, kappa) { + p <- length(mu) + if (is.list(u)) J <- length(u) + else J <- 1 + log_C <- (0.5 * p - 1) * log(kappa) - 0.5 * p * log(2 * pi) - log(besselI(kappa, 0.5 * p - 1)) + if (J > 1) return(log_C * J + kappa * sum(sapply(u, `%*%`, x = mu))) + else return(log_C + kappa * c(mu %*% u)) + } + kappa <- rexp(1) + mu <- rnorm(4) + mu <- mu / sqrt(crossprod(mu)[1]) + J <- 3 + u <- lapply(1:J, FUN = function(j) { + u <- rnorm(length(mu)) + u <- u / sqrt(crossprod(u)[1]) + return(u) + }) + expect_equal(dVMF(u, mu, kappa), VMF(u, mu, kappa)) + mu <- rnorm(3) + mu <- mu / sqrt(crossprod(mu)[1]) + J <- 4 + u <- lapply(1:J, FUN = function(j) { + u <- rnorm(length(mu)) + u <- u / sqrt(crossprod(u)[1]) + return(u) + }) + expect_equal(dVMF(u, mu, kappa), VMF(u, mu, kappa)) + mu <- rnorm(2) + mu <- mu / sqrt(crossprod(mu)[1]) + u <- rnorm(length(mu)) + u <- u / sqrt(crossprod(u)[1]) + expect_equal(dVMF(u, mu, kappa), VMF(list(u), mu, kappa)) +}) + # polr links <- c("logistic", "probit", "loglog", "cloglog", "cauchit") context("polr") diff --git a/tests/testthat/test_stan_lm.R b/tests/testthat/test_stan_lm.R index dcd13e797..9971a370b 100644 --- a/tests/testthat/test_stan_lm.R +++ b/tests/testthat/test_stan_lm.R @@ -92,6 +92,11 @@ test_that("stan_lm throws error if glmer syntax used", { regexp = "model formula not allowed") }) +test_that("stan_lmList does not throw an error", { + stan_lmList(mpg ~ disp + wt + hp | cyl, data = mtcars, + prior = R2(0.5, "mean"), chains = 1, iter = 1) +}) + context("stan_aov") test_that("stan_aov returns expected result for npk example", { fit <- stan_aov(yield ~ block + N*P*K, data = npk, contrasts = "contr.poly", diff --git a/vignettes/children/SETTINGS-rstan.txt b/vignettes/children/SETTINGS-rstan.txt index 607cd5771..2a450e1f5 100644 --- a/vignettes/children/SETTINGS-rstan.txt +++ b/vignettes/children/SETTINGS-rstan.txt @@ -3,4 +3,5 @@ ITER <- 500L CHAINS <- 2L CORES <- 2L SEED <- 12345 +set.seed(SEED) ``` diff --git a/vignettes/lm.Rmd b/vignettes/lm.Rmd index 3112f5b43..a35628c50 100644 --- a/vignettes/lm.Rmd +++ b/vignettes/lm.Rmd @@ -282,35 +282,112 @@ We can compare the two approaches using an approximation to Leave-One-Out __loo__ package. ```{r lm-clouds-loo, warning=TRUE} -(loo_post <- loo(post)) -(loo(simple)) +(loo_post <- loo(post, k_threshold = 0.7)) +(loo_simple <- loo(simple, k_threshold = 0.7)) ``` The results indicate that the first approach is expected to produce better -out-of-sample predictions but the Warning messages are at least as important. -Many of the estimated shape parameters for the Generalized Pareto distribution -are above $0.5$ in the model with Cauchy priors, which indicates that these +out-of-sample predictions. We specified `k_threshold = 0.7` because $7$ of the +original estimated shape parameters for the Generalized Pareto distribution +were above $0.7$ in the model with Cauchy priors, which indicates that these estimates are only going to converge slowly to the true out-of-sample deviance measures. Thus, with only $24$ observations, they should not be considered -reliable. The more complicated prior derived above is stronger --- as -evidenced by the fact that the effective number of parameters is about half -of that in the simpler approach and $12$ for the maximum likelihood estimator ---- and only has $4$ out of $24$ shape estimates in the "danger zone". We -might want to reexamine these observations - +reliable. By specifying `k_threshold = 0.7`, the model is refit $7$ times, +leaving out each of the problematic observations in turn, and the prediction +of the left-out observation is incorporated into the `elpd_loo`, `p_loo`, and +`looic` estimates. + +The more complicated prior derived above is stronger --- as evidenced by the +fact that the effective number of parameters is about $6.3$ as compared to +$10.3$ for the simpler approach and $12$ for the maximum likelihood estimator. +A plot reveals that there are a couple of observation whose estimated shape +parameter for the Generalized Pareto distribution but these can be ignored. ```{r lm-clouds-plot-loo} plot(loo_post, label_points = TRUE) ``` -because the posterior is sensitive to them but, overall, the results seem -tolerable. - In general, we would expect the joint prior derived here to work better when there are many predictors relative to the number of observations. Placing independent, heavy-tailed priors on the coefficients neither reflects the beliefs of the researcher nor conveys enough information to stabilize all the computations. +# Hierarchical Approach + +Another approach would be to allow _all_ the parameters to vary by group. +In this case, the natural grouping is by treatment and control but the +`stan_lmList` function --- like the `lmList` function in the __lme4__ +package it is based on --- permits any number of levels on a single grouping +factor. However, `stan_lmList` differs fundamentally from `lmList` because +the latter estimates Ordinary Least Squares for each stratum of the data +defined by the levels of the grouping variable whereas `stan_lmList` estimates +a model with the data from all groups simultaneously but puts a hierarchical +prior on the group-specific unknowns. + +Recall that one of the primitive parameters in `stan_lm` is $\mathbf{u}$, which +was assumed to be uniformally distributed on the surface of a hypersphere. For +`stan_lmList`, we index $\mathbf{u}_j$ by group and place a von Mises-Fisher +prior on it, whose PDF is +$$f\left(\mathbf{u}_j | \boldsymbol{\mu}, \kappa \right) = +\frac{\kappa^{\frac{K}{2} - 1}}{\left(2 \pi\right)^{\frac{K}{2}} I_{\frac{K}{2} - 1}\left(\kappa\right)} +e^{\kappa \boldsymbol{\mu}^\top \mathbf{u}_j}$$ +where $\mathbf{\mu}$ is the prior mean on the surface of a hypersphere +$\kappa \geq 0$ is a concentration parameter, and +$I_{\frac{K}{2} - 1}\left(\kappa\right)$ is the modified Bessel function of the +first kind. If $\kappa = 0$, then this density is constant and the greater is +$\kappa$, the more each $\mathbf{u}_j$ is shrunk toward $\mathbf{\mu}$. Of course, +$\mathbf{\mu}$ and $\kappa$ are unknown, need to be estimated, and require +priors. The prior for $\mathbf{\mu}$ is uniform on the surface of a hypersphere +and cannot be changed in `stan_lmList`, while the prior for $\kappa$ is +exponential whose mean is specified by the `kappa_mean` argument to `stan_lmList`, +which by default is $1$. + +We also specify a hierarchical prior on $R_j^2$, which is the proportion of +variance in the outcome for the $j$-th group that is explained by the model. Recall +that in the `stan_lm` case, we could calculate the second shape parameter, $\eta$, +from the user's prior expectation about $R^2$. In the `stan_lmList` case, we +calculate $\eta$ from $\lambda = \mathbb{E}\left[R_j^2\right]$ and estimate $\lambda$. +Consequently, in the `stan_lmList` function, the user needs to express prior +beliefs about the location of $\lambda$, which can be interpreted as the average +$R^2$ across groups. + +Since $R_j^2$, $\ln \omega_j$, and $s_{Y_j}$ all differ by group, $\sigma_j$ +differs by group. Also, the intercept, $\alpha_j$, differs by group. Thus, when +the user prints the results of a model estimated by `stan_lmList`, all of the +parameters are suffixed by `:j` where `j` can be an integer or preferably a +string indicating the group. + +The `formula` argument to the `stan_lmList` function is somewhat like that for +the `stan_lmer` function, which builds off the `lmer` function in the __lme4__ +package and specifies the grouping variable to the right of a `|`. There is +more information on this syntax in the [vignette](glmer.html) for generalized +linear models with group-specific terms. However, for `stan_lmList` only one +grouping term can (and must) be specified. In the cloud-hacking example, it +would look like +```{r, lm-clouds-grouped, results="hide"} +grouped <- stan_lmList(rainfall ~ sne + cloudcover + prewetness + echomotion + + time | seeding, data = clouds, + prior = R2(location = 0.2), + chains = CHAINS, cores = CORES, seed = SEED) +grouped +``` +```{r, print-lm-clouds-grouped, echo = FALSE} +grouped +``` +Notice now that $\sigma$ for the treated group differs (somewhat) from +$\sigma$ for the control group (at least at the medians) whereas in the +previous two models $\sigma$ was constrained to be equal for both groups. +We can use the `compare` function to compare these models using the LOO +criterion +```{r, loo-3} +compare(loo_post, loo_simple, loo(grouped)) +``` +As can be seen, the model estimated by `stan_lmList` has fewer effective +parameters than the model estimated by `stan_glm` but more effective +parameters than the model estimated by `stan_lm`. However, the fit to the +data is worse with `stan_lmList` in this case so there is no reason to +prefer it over `stan_lm`. + # Conclusion This vignette has discussed the prior distribution utilized in the `stan_lm` From a79f29fd1508760e22d2c81eaead3a0e10d58263 Mon Sep 17 00:00:00 2001 From: Ben Goodrich Date: Sat, 16 Jul 2016 15:39:43 -0400 Subject: [PATCH 02/12] stan_lmList.R: add details --- R/stan_lmList.R | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/R/stan_lmList.R b/R/stan_lmList.R index 49176f0a1..725363498 100644 --- a/R/stan_lmList.R +++ b/R/stan_lmList.R @@ -38,7 +38,23 @@ #' @template args-prior_PD #' @template args-algorithm #' @template args-adapt_delta +#' @details +#' The options for the priors are essentially the same as in \code{\link{stan_lm}}. +#' However, in this case the \code{prior} argument pertains to the unknown expected +#' \eqn{R^2} across groups where each group is allowed to have its own \eqn{R^2} +#' along with every other parameter. The coefficients are partially pooled using +#' a von Mises-Fisher prior on the group-specific unit vectors. The expecation of +#' this von Mises-Fisher distribution is uniformally distributed on the surface of +#' a hypersphere, but the concentration parameter, \code{kappa}, has an exponential +#' prior with mean specified by the \code{kappa_mean} argument. Higher values for +#' \code{kappa_mean} induce more pooling and lower values induce less pooling. #' +#' The posterior distribution for each group's \eqn{R^2} is a \emph{within-group} +#' quantity, i.e. the proportion of variance in that group that is explained by the +#' predictors. If the posterior distribution of the intercepts differs from group +#' to group than the proportion of variance in the \emph{entire sample} that is +#' explained by the model is not estimated but will be larger than the average +#' \emph{within-group} \eqn{R^2}. #' @examples #' \dontrun{ #' post <- stan_lmList(mpg ~ disp + wt + hp | cyl, data = mtcars, From e24497bb751aeefe0bbb5784bd68627837b76959 Mon Sep 17 00:00:00 2001 From: Ben Goodrich Date: Sat, 16 Jul 2016 15:54:19 -0400 Subject: [PATCH 03/12] lm.Rmd: clarify effective parameters count --- vignettes/lm.Rmd | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vignettes/lm.Rmd b/vignettes/lm.Rmd index a35628c50..4f9db56fc 100644 --- a/vignettes/lm.Rmd +++ b/vignettes/lm.Rmd @@ -384,9 +384,10 @@ compare(loo_post, loo_simple, loo(grouped)) ``` As can be seen, the model estimated by `stan_lmList` has fewer effective parameters than the model estimated by `stan_glm` but more effective -parameters than the model estimated by `stan_lm`. However, the fit to the -data is worse with `stan_lmList` in this case so there is no reason to -prefer it over `stan_lm`. +parameters than the model estimated by `stan_lm` due to the need to +estimate $\mathbf{\mu}$ and $\kappa$ plus additional $R^2$ parameters. +However, the fit to the data is worse with `stan_lmList` in this case so +there is no reason to prefer it over `stan_lm`. # Conclusion From 0a464c4eb3d6df04bf8398324f9ec8af84d044b1 Mon Sep 17 00:00:00 2001 From: Ben Goodrich Date: Sat, 16 Jul 2016 16:00:31 -0400 Subject: [PATCH 04/12] SETTINGS-rstan.txt: use only 1 core again --- vignettes/children/SETTINGS-rstan.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/children/SETTINGS-rstan.txt b/vignettes/children/SETTINGS-rstan.txt index 2a450e1f5..de94c6b3b 100644 --- a/vignettes/children/SETTINGS-rstan.txt +++ b/vignettes/children/SETTINGS-rstan.txt @@ -1,7 +1,7 @@ ```{r, SETTINGS-rstan, include=FALSE} ITER <- 500L CHAINS <- 2L -CORES <- 2L +CORES <- 1L SEED <- 12345 set.seed(SEED) ``` From 667d8f9f8209a5fd4664ced1b802002cb6691baf Mon Sep 17 00:00:00 2001 From: Ben Goodrich Date: Mon, 18 Jul 2016 01:58:32 -0400 Subject: [PATCH 05/12] lm.stan: fix unreasonable memory use --- exec/lm.stan | 47 +++++++++++++++++++++++++++++++++++++++++++---- vignettes/lm.Rmd | 2 +- 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/exec/lm.stan b/exec/lm.stan index 59da1d8c2..b031d92fe 100644 --- a/exec/lm.stan +++ b/exec/lm.stan @@ -31,25 +31,64 @@ functions { */ real log_besselI(real x, real v) { real log_half_x; + real lfac_0; real lfac; real lgam; + real lgam_0; real lcons; + real lcons_0; real biggest; real piece; real smallest; real summand; int m; + int biggest_m; if (x < 0) reject("x is assumed to be non-negative") if (v < 0) reject("v is assumed to be non-negative"); log_half_x = log(0.5 * x); - lfac = 0; - lgam = lgamma(v + 1); - lcons = v * log_half_x; + lfac_0 = 0; + lfac = lfac_0; + lgam_0 = lgamma(v + 1); + lgam = lgam_0; + lcons_0 = v * log_half_x; + lcons = lcons_0; biggest = -lgam + lcons; - piece = positive_infinity(); + piece = biggest; smallest = -745.13321910194122211; // exp(smallest) > 0 minimally summand = 0.0; m = 1; + while (piece >= biggest) { // find maximum for log-sum-exp + biggest = piece; + lfac = lfac + log(m); + lgam = lgam + log(v + m); + lcons = lcons + 2 * log_half_x; + piece = -lfac - lgam + lcons; + m = m + 1; + } + if (m == 2) summand = exp(piece - biggest); + else if (m > 1000) return x - 0.5 * log(2 * pi() * x); + else { // start over with interior biggest + lfac = lfac_0; + lgam = lgam_0; + lcons = lcons_0; + piece = -lgam + lcons - biggest; + summand = exp(piece); + biggest_m = m - 2; + m = 1; + while (m < biggest_m) { + lfac = lfac + log(m); + lgam = lgam + log(v + m); + lcons = lcons + 2 * log_half_x; + piece = -lfac - lgam + lcons - biggest; + if (piece > smallest) summand = summand + exp(piece); + m = m + 1; + } + // skip over biggest + lfac = lfac + log(m); + lgam = lgam + log(v + m); + lcons = lcons + 2 * log_half_x; + m = m + 1; + } while (piece > smallest) { lfac = lfac + log(m); lgam = lgam + log(v + m); diff --git a/vignettes/lm.Rmd b/vignettes/lm.Rmd index 4f9db56fc..fb54255f8 100644 --- a/vignettes/lm.Rmd +++ b/vignettes/lm.Rmd @@ -368,7 +368,7 @@ would look like grouped <- stan_lmList(rainfall ~ sne + cloudcover + prewetness + echomotion + time | seeding, data = clouds, prior = R2(location = 0.2), - chains = CHAINS, cores = CORES, seed = SEED) + chains = CHAINS, cores = CORES, seed = SEED, iter = ITER) grouped ``` ```{r, print-lm-clouds-grouped, echo = FALSE} From c4d5f6e96f704488c8dbafbdb566789a96ebf5ac Mon Sep 17 00:00:00 2001 From: Ben Goodrich Date: Mon, 18 Jul 2016 10:46:05 -0400 Subject: [PATCH 06/12] SETTINGS-rstan.txt: use 2 cores again --- vignettes/children/SETTINGS-rstan.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/children/SETTINGS-rstan.txt b/vignettes/children/SETTINGS-rstan.txt index de94c6b3b..2a450e1f5 100644 --- a/vignettes/children/SETTINGS-rstan.txt +++ b/vignettes/children/SETTINGS-rstan.txt @@ -1,7 +1,7 @@ ```{r, SETTINGS-rstan, include=FALSE} ITER <- 500L CHAINS <- 2L -CORES <- 1L +CORES <- 2L SEED <- 12345 set.seed(SEED) ``` From 0e4d6a71c15fb22cffcb82e8834d1ed18d213a12 Mon Sep 17 00:00:00 2001 From: Ben Goodrich Date: Mon, 18 Jul 2016 10:46:24 -0400 Subject: [PATCH 07/12] .travis.yml: try imposing virtual memory limits --- .travis.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.travis.yml b/.travis.yml index c31f66e26..bdc70bcb2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -19,6 +19,9 @@ before_install: - sed -i.bak "s/ g++/ ${CXX}${CLANG_EXTRA_ARG}/" ~/.R/Makevars - sed -i.bak "s/O[0-3]/O$CXX_OLEVEL/" ~/.R/Makevars +before_script: + - ulimit -v 800000 + after_script: - tar -ztvf rstanarm_*.tar.gz - echo ${NOT_CRAN} From 3fd8e9b5fbdfc9e23ea637cd3bce87d672ad517e Mon Sep 17 00:00:00 2001 From: Ben Goodrich Date: Mon, 18 Jul 2016 12:17:47 -0400 Subject: [PATCH 08/12] .travis.yml: try installing matrixStats from the develop branch --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index bdc70bcb2..84f948e21 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,7 @@ cache: packages r_github_packages: - jimhester/covr + - HenrikBengtsson/matrixStats@develop env: matrix: From 6aedecc14ae7ca855512c5eb484ab7c792c3396a Mon Sep 17 00:00:00 2001 From: Ben Goodrich Date: Mon, 18 Jul 2016 13:31:31 -0400 Subject: [PATCH 09/12] .travis.yml: try a different ulimit --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 84f948e21..5abe5a50e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,7 +21,7 @@ before_install: - sed -i.bak "s/O[0-3]/O$CXX_OLEVEL/" ~/.R/Makevars before_script: - - ulimit -v 800000 + - ulimit -v 7000000 after_script: - tar -ztvf rstanarm_*.tar.gz From f8369734818e1f910e99b955b09e378584091187 Mon Sep 17 00:00:00 2001 From: Ben Goodrich Date: Mon, 18 Jul 2016 13:53:52 -0400 Subject: [PATCH 10/12] SETTINGS-rstan.txt: try 1 core again --- vignettes/children/SETTINGS-rstan.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/children/SETTINGS-rstan.txt b/vignettes/children/SETTINGS-rstan.txt index 2a450e1f5..de94c6b3b 100644 --- a/vignettes/children/SETTINGS-rstan.txt +++ b/vignettes/children/SETTINGS-rstan.txt @@ -1,7 +1,7 @@ ```{r, SETTINGS-rstan, include=FALSE} ITER <- 500L CHAINS <- 2L -CORES <- 2L +CORES <- 1L SEED <- 12345 set.seed(SEED) ``` From 9a30101b46e62c42ea43041628eb457bc2c08007 Mon Sep 17 00:00:00 2001 From: Ben Goodrich Date: Mon, 18 Jul 2016 17:23:21 -0400 Subject: [PATCH 11/12] lm.stan: Reimplement log_besselI() again to reduce memory consumption --- .travis.yml | 3 -- R/stanmodels.R | 2 + exec/lm.stan | 100 ++++++++++++++++++++++++++++++++++++------------- 3 files changed, 75 insertions(+), 30 deletions(-) diff --git a/.travis.yml b/.travis.yml index 5abe5a50e..3e26074de 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,9 +20,6 @@ before_install: - sed -i.bak "s/ g++/ ${CXX}${CLANG_EXTRA_ARG}/" ~/.R/Makevars - sed -i.bak "s/O[0-3]/O$CXX_OLEVEL/" ~/.R/Makevars -before_script: - - ulimit -v 7000000 - after_script: - tar -ztvf rstanarm_*.tar.gz - echo ${NOT_CRAN} diff --git a/R/stanmodels.R b/R/stanmodels.R index 4cd776954..4a1f1895b 100644 --- a/R/stanmodels.R +++ b/R/stanmodels.R @@ -28,7 +28,9 @@ stanmodels <- sapply(stan_files, function(f) { isystem <- file.path("inst", "chunks") if (!file.exists(file.path(isystem, "common_functions.stan"))) isystem <- file.path("..", "inst", "chunks") + sink(tempfile()) stanfit <- rstan::stanc_builder(f, isystem) + sink(NULL) stanfit$model_cpp <- list(model_cppname = stanfit$model_name, model_cppcode = stanfit$cppcode) return(do.call(methods::new, args = c(stanfit[-(1:3)], Class = "stanmodel", diff --git a/exec/lm.stan b/exec/lm.stan index b031d92fe..3831c17ef 100644 --- a/exec/lm.stan +++ b/exec/lm.stan @@ -25,7 +25,65 @@ functions { } /** - * Modded Bessel function of the first kind in log units + * Approximation of lgamma(z); see + * + * https://en.wikipedia.org/wiki/Stirling%27s_approximation#Versions_suitable_for_calculators + * + * @param z Positive scalar + */ + real Nemes(real z) + return 0.5 * log(6.283185307179586232 / z) + z * (log(z + 1 / (12 * z - 0.1 / z)) - 1); + + int signum(real x) + return x < 0 ? -1 : x > 0; + + /** + * Customized version of the bisection method to find when log-sum-exp can terminate; see + * + * https://en.wikipedia.org/wiki/Bisection_method#Algorithm + * + * This is only called from the log_besselI() function below + * + * @param v Order for log_besselI() function + * @param biggest Pivot in the log-sum-exp + * @param biggest_m Integer indicating what iteration reaches the pivot + * @param log_half_x Real number equal to log(0.5 * x) + */ + int bisection(real v, real biggest, int biggest_m, real log_half_x) { + int c; + int a; + int b; + int N; + real smallest; + real val_a; + a = biggest_m; + b = 2147483647; // maximum integer + N = 1; + smallest = -745.13321910194122211; + val_a = -smallest; + while (N <= 100) { + real val; + c = a / 2 + b / 2; + if (a == (b - 1)) return c; + else { + val = -2 * Nemes(c) + (2 * c + v) * log_half_x - biggest - smallest; + if (round(val) == 0) return c; + else if (signum(val) == signum(val_a)) { + a = c; + val_a = val; + } + else b = c; + N = N + 1; + } + } + return c; + } + + /** + * Modded Bessel function of the first kind in log units + * + * This implementation is designed to not overflow or eat too much RAM + * * @param x Real non-negative scalar * @param v Real non-negative order but may be integer */ @@ -40,11 +98,11 @@ functions { real biggest; real piece; real smallest; - real summand; int m; - int biggest_m; + int cutoff; if (x < 0) reject("x is assumed to be non-negative") if (v < 0) reject("v is assumed to be non-negative"); + if (x > 700) return x - 0.5 * log(2 * pi() * x); log_half_x = log(0.5 * x); lfac_0 = 0; lfac = lfac_0; @@ -55,9 +113,8 @@ functions { biggest = -lgam + lcons; piece = biggest; smallest = -745.13321910194122211; // exp(smallest) > 0 minimally - summand = 0.0; m = 1; - while (piece >= biggest) { // find maximum for log-sum-exp + while (piece >= biggest) { // find pivot for log-sum-exp biggest = piece; lfac = lfac + log(m); lgam = lgam + log(v + m); @@ -65,41 +122,30 @@ functions { piece = -lfac - lgam + lcons; m = m + 1; } - if (m == 2) summand = exp(piece - biggest); - else if (m > 1000) return x - 0.5 * log(2 * pi() * x); - else { // start over with interior biggest + // find next m such that exp(...) = 0 + cutoff = bisection(v, biggest, m - 1, log_half_x); + { // start over from m = 0 and fill up summands + vector[cutoff + 1] summands; lfac = lfac_0; lgam = lgam_0; lcons = lcons_0; piece = -lgam + lcons - biggest; - summand = exp(piece); - biggest_m = m - 2; + summands[1] = exp(piece); m = 1; - while (m < biggest_m) { + while (m <= cutoff) { lfac = lfac + log(m); lgam = lgam + log(v + m); lcons = lcons + 2 * log_half_x; piece = -lfac - lgam + lcons - biggest; - if (piece > smallest) summand = summand + exp(piece); m = m + 1; + summands[m] = exp(piece); } - // skip over biggest - lfac = lfac + log(m); - lgam = lgam + log(v + m); - lcons = lcons + 2 * log_half_x; - m = m + 1; - } - while (piece > smallest) { - lfac = lfac + log(m); - lgam = lgam + log(v + m); - lcons = lcons + 2 * log_half_x; - piece = -lfac - lgam + lcons - biggest; - summand = summand + exp(piece); - m = m + 1; + return biggest + log(sum(summands)); } - return biggest + log1p(summand); + reject ("should never reach here"); + return negative_infinity(); } - + /** * von Mises-Fisher distribution in log units * @param u J-array of unit vectors From ea566921712e56fa0300c2cc1a6d24d6009fa42c Mon Sep 17 00:00:00 2001 From: Ben Goodrich Date: Mon, 18 Jul 2016 21:30:58 -0400 Subject: [PATCH 12/12] stanmodels.R: really suppress parser warnings --- R/stanmodels.R | 6 +++--- exec/lm.stan | 6 ++---- vignettes/children/SETTINGS-rstan.txt | 2 +- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/R/stanmodels.R b/R/stanmodels.R index 4a1f1895b..db20d9b67 100644 --- a/R/stanmodels.R +++ b/R/stanmodels.R @@ -28,9 +28,9 @@ stanmodels <- sapply(stan_files, function(f) { isystem <- file.path("inst", "chunks") if (!file.exists(file.path(isystem, "common_functions.stan"))) isystem <- file.path("..", "inst", "chunks") - sink(tempfile()) - stanfit <- rstan::stanc_builder(f, isystem) - sink(NULL) + tf <- tempfile() + capture.output(stanfit <- rstan::stanc_builder(f, isystem), + file = tf, type = "message") stanfit$model_cpp <- list(model_cppname = stanfit$model_name, model_cppcode = stanfit$cppcode) return(do.call(methods::new, args = c(stanfit[-(1:3)], Class = "stanmodel", diff --git a/exec/lm.stan b/exec/lm.stan index 3831c17ef..f59a7ced4 100644 --- a/exec/lm.stan +++ b/exec/lm.stan @@ -25,7 +25,7 @@ functions { } /** - * Approximation of lgamma(z); see + * Approximation of lgamma(z) for not-too-small z; see * * https://en.wikipedia.org/wiki/Stirling%27s_approximation#Versions_suitable_for_calculators * @@ -59,7 +59,7 @@ functions { a = biggest_m; b = 2147483647; // maximum integer N = 1; - smallest = -745.13321910194122211; + smallest = -745.13321910194122211; // exp(smallest) > 0 minimally val_a = -smallest; while (N <= 100) { real val; @@ -97,7 +97,6 @@ functions { real lcons_0; real biggest; real piece; - real smallest; int m; int cutoff; if (x < 0) reject("x is assumed to be non-negative") @@ -112,7 +111,6 @@ functions { lcons = lcons_0; biggest = -lgam + lcons; piece = biggest; - smallest = -745.13321910194122211; // exp(smallest) > 0 minimally m = 1; while (piece >= biggest) { // find pivot for log-sum-exp biggest = piece; diff --git a/vignettes/children/SETTINGS-rstan.txt b/vignettes/children/SETTINGS-rstan.txt index de94c6b3b..2a450e1f5 100644 --- a/vignettes/children/SETTINGS-rstan.txt +++ b/vignettes/children/SETTINGS-rstan.txt @@ -1,7 +1,7 @@ ```{r, SETTINGS-rstan, include=FALSE} ITER <- 500L CHAINS <- 2L -CORES <- 1L +CORES <- 2L SEED <- 12345 set.seed(SEED) ```