From 48587275921671c4d296a778d967b12c88de299e Mon Sep 17 00:00:00 2001
From: Ben Goodrich <goodrich.ben@gmail.com>
Date: Sat, 16 Jul 2016 15:22:04 -0400
Subject: [PATCH 01/12] introduce stan_lmList function

this is based on lme4::lmList but is restricted to linear models and does partial pooling rather than no pooling
---
 NAMESPACE                             |   1 +
 R/loo.R                               |  11 ++-
 R/misc.R                              |   7 ++
 R/posterior_predict.R                 |   9 ++-
 R/pp_data.R                           |  27 +++++--
 R/priors.R                            |   6 +-
 R/rstanarm-package.R                  |  17 ++--
 R/stan_biglm.R                        |  22 +++++-
 R/stan_biglm.fit.R                    |  89 +++++++++++++++------
 R/stan_lm.fit.R                       |   1 +
 R/stan_lmList.R                       | 107 ++++++++++++++++++++++++++
 R/stanreg-methods.R                   |  41 +++++++++-
 cleanup                               |   2 +-
 cleanup.win                           |   2 +-
 exec/lm.stan                          |  84 +++++++++++++++++++-
 man-roxygen/args-kappa_mean.R         |   6 ++
 tests/testthat/test_stan_functions.R  |  43 +++++++++++
 tests/testthat/test_stan_lm.R         |   5 ++
 vignettes/children/SETTINGS-rstan.txt |   1 +
 vignettes/lm.Rmd                      | 105 +++++++++++++++++++++----
 20 files changed, 518 insertions(+), 68 deletions(-)
 create mode 100644 R/stan_lmList.R
 create mode 100644 man-roxygen/args-kappa_mean.R

diff --git a/NAMESPACE b/NAMESPACE
index 9b471cc6a..ecfd4b26c 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -76,6 +76,7 @@ export(stan_glmer.nb)
 export(stan_lm)
 export(stan_lm.fit)
 export(stan_lm.wfit)
+export(stan_lmList)
 export(stan_lmer)
 export(stan_polr)
 export(stan_polr.fit)
diff --git a/R/loo.R b/R/loo.R
index 72783b3e5..38ca32b8e 100644
--- a/R/loo.R
+++ b/R/loo.R
@@ -362,7 +362,7 @@ ll_fun <- function(x) {
   f <- family(x)
   if (!is(f, "family") || is_scobit(x))
     return(.ll_polr_i)
-  
+  if (is.lmList(x)) return(.ll_gaussian_lmList_i)
   get(paste0(".ll_", f$family, "_i"))
 }
 
@@ -390,7 +390,7 @@ ll_args <- function(object, newdata = NULL) {
   if (is(f, "family") && !is_scobit(object)) {
     fname <- f$family
     if (!is.binomial(fname)) {
-      data <- data.frame(y, x)
+      data <- data.frame(y, as.matrix(x))
     } else {
       if (NCOL(y) == 2L) {
         trials <- rowSums(y)
@@ -405,7 +405,9 @@ ll_args <- function(object, newdata = NULL) {
     }
     draws$beta <- stanmat[, seq_len(ncol(x)), drop = FALSE]
     if (is.gaussian(fname)) 
-      draws$sigma <- stanmat[, "sigma"]
+      draws$sigma <- stanmat[, grep("^sigma", colnames(stanmat))]
+    if (is.lmList(object))
+      draws$groups <- as.integer(object$groups)
     if (is.gamma(fname)) 
       draws$shape <- stanmat[, "shape"]
     if (is.ig(fname)) 
@@ -492,6 +494,9 @@ ll_args <- function(object, newdata = NULL) {
   val <- dnorm(data$y, mean = .mu(data,draws), sd = draws$sigma, log = TRUE)
   .weighted(val, data$weights)
 }
+.ll_gaussian_lmList_i <- function(i, data, draws) {
+  val <- dnorm(data$y, mean = .mu(data,draws), sd = draws$sigma[,draws$group[i]], log = TRUE)
+}
 .ll_binomial_i <- function(i, data, draws) {
   val <- dbinom(data$y, size = data$trials, prob = .mu(data,draws), log = TRUE)
   .weighted(val, data$weights)
diff --git a/R/misc.R b/R/misc.R
index bfb10d332..6e2bf6ff1 100644
--- a/R/misc.R
+++ b/R/misc.R
@@ -138,6 +138,13 @@ is.mer <- function(x) {
   isTRUE(check1 && check2)
 }
 
+# Test if stanreg object used stan_lmList
+#
+# @param x A stanreg object.
+is.lmList <- function(x) {
+  inherits(x, "lmList")
+}
+
 # Consistent error message to use when something is only available for 
 # models fit using MCMC
 #
diff --git a/R/posterior_predict.R b/R/posterior_predict.R
index 81446d0cb..b66a46cf6 100644
--- a/R/posterior_predict.R
+++ b/R/posterior_predict.R
@@ -159,6 +159,7 @@ posterior_predict <- function(object, newdata = NULL, draws = NULL,
   else ppargs <- pp_args(object, data = pp_eta(object, dat, draws))
   if (!is(object, "polr") && is.binomial(family(object)$family))
     ppargs$trials <- pp_binomial_trials(object, newdata)
+  if (inherits(object, "lmList")) ppargs$group <- object$group
   
   ppfun <- pp_fun(object)
   ytilde <- do.call(ppfun, ppargs)
@@ -175,6 +176,7 @@ posterior_predict <- function(object, newdata = NULL, draws = NULL,
 # functions to draw from the various posterior predictive distributions
 pp_fun <- function(object) {
   suffix <- if (is(object, "polr")) "polr" else family(object)$family
+  if (inherits(object, "lmList")) suffix <- paste0(suffix, "_grouped")
   get(paste0(".pp_", suffix), mode = "function")
 }
 
@@ -183,6 +185,11 @@ pp_fun <- function(object) {
     rnorm(ncol(mu), mu[s,], sigma[s])
   }))
 }
+.pp_gaussian_grouped <- function(mu, sigma, group) {
+  t(sapply(1:nrow(mu), function(s) {
+    rnorm(ncol(mu), mu[s,], sigma[s,group])
+  }))
+}
 .pp_binomial <- function(mu, trials) {
   t(sapply(1:nrow(mu), function(s) {
     rbinom(ncol(mu), size = trials, prob = mu[s, ])
@@ -260,7 +267,7 @@ pp_args <- function(object, data) {
   args <- list(mu = inverse_link(eta))
   famname <- family(object)$family
   if (is.gaussian(famname)) {
-    args$sigma <- stanmat[, "sigma"]
+    args$sigma <- stanmat[, grep("^sigma", colnames(stanmat))]
   } else if (is.gamma(famname)) {
     args$shape <- stanmat[, "shape"]
   } else if (is.ig(famname)) {
diff --git a/R/pp_data.R b/R/pp_data.R
index 7c80cf6ea..e70ca89db 100644
--- a/R/pp_data.R
+++ b/R/pp_data.R
@@ -31,13 +31,26 @@ pp_data <- function(object, newdata = NULL, re.form = NULL, ...) {
     return(nlist(x, offset))
   }
   tt <- terms(object)
-  Terms <- delete.response(tt)
-  m <- model.frame(Terms, newdata, xlev = object$xlevels)
-  if (!is.null(cl <- attr(Terms, "dataClasses"))) 
-    .checkMFClasses(cl, m)
-  x <- model.matrix(Terms, m, contrasts.arg = object$contrasts)
-  if (is(object, "polr") && !is_scobit(object)) 
-    x <- x[,colnames(x) != "(Intercept)", drop = FALSE]
+  if (inherits(object, "lmList")) {
+    f <- as.character(object$formula)
+    f <- as.formula(paste(f[2], "~", "-1 + (", f[3], ")"))
+    g <- glFormula(f, newdata)
+    modelframe <- g$fr
+    x <- t(g$reTrms$Zt)
+    group_names <- levels(object$groups)
+    K <- NCOL(x) / length(group_names)
+    colnames(x) <- c(sapply(group_names, FUN = function(g) paste0(1:K, ":", g)))
+    x <- x[,sort(colnames(x))]
+  }
+  else {
+    Terms <- delete.response(tt)
+    m <- model.frame(Terms, newdata, xlev = object$xlevels)
+    if (!is.null(cl <- attr(Terms, "dataClasses"))) 
+      .checkMFClasses(cl, m)
+    x <- model.matrix(Terms, m, contrasts.arg = object$contrasts)
+    if (is(object, "polr") && !is_scobit(object)) 
+      x <- x[,colnames(x) != "(Intercept)", drop = FALSE]
+  }
   offset <- rep(0, nrow(x))
   if (!is.null(off.num <- attr(tt, "offset"))) {
     for (i in off.num) {
diff --git a/R/priors.R b/R/priors.R
index edc5cc142..def4d49c0 100644
--- a/R/priors.R
+++ b/R/priors.R
@@ -400,12 +400,14 @@ make_eta <- function(location, what = c("mode", "mean", "median", "log"), K) {
   half_K <- K / 2
   if (what == "mode") {
     stopifnot(location > 0, location <= 1)
-    if (K <= 2)
-      stop(paste("R2 prior error.", 
+    if (K <= 2) {
+      if (location == 0.5 && K == 2) what <- "mean"
+      else stop(paste("R2 prior error.", 
                  "The mode of the beta distribution does not exist",
                  "with fewer than three predictors.", 
                  "Specify 'what' as 'mean', 'median', or 'log' instead."),
            call. = FALSE)
+    }
     eta <- (half_K - 1  - location * half_K + location * 2) / location
   } else if (what == "mean") {
     stopifnot(location > 0, location < 1)
diff --git a/R/rstanarm-package.R b/R/rstanarm-package.R
index 9456f9cc5..a4e380e57 100644
--- a/R/rstanarm-package.R
+++ b/R/rstanarm-package.R
@@ -116,11 +116,14 @@
 #' overview:
 #' 
 #' \describe{
-#'  \item{\code{\link{stan_lm}}, \code{stan_aov}, \code{stan_biglm}}{
-#'   Similar to \code{\link[stats]{lm}} or \code{\link[stats]{aov}} but with 
-#'   novel regularizing priors on the model parameters that are driven by prior 
-#'   beliefs about \eqn{R^2}, the proportion of variance in the outcome 
-#'   attributable to the predictors in a linear model.
+#'  \item{\code{\link{stan_lm}}, \code{stan_aov}, \code{stan_biglm}, \code{stan_lmList}}{
+#'   Similar to \code{\link[stats]{lm}} but with novel regularizing priors on the model 
+#'   parameters that are driven by prior beliefs about \eqn{R^2}, the proportion of 
+#'   variance in the outcome attributable to the predictors in a linear model. The
+#'   \code{stan_aov} variant estimates ANOVA models, the \code{stan_biglm} variant
+#'   estimates linear models when the design matrix is too large to fit into memory, and
+#'   the \code{stan_lmList} variant allows all the parameters to vary by group but utilizes
+#'   hyperpriors to shrink them toward common parameters, much like \code{stan_lmer}.
 #'  }
 #'  \item{\code{\link{stan_glm}}, \code{stan_glm.nb}}{
 #'   Similar to \code{\link[stats]{glm}} but with Gaussian, Student t, Cauchy 
@@ -139,7 +142,9 @@
 #'   a highly-structured but unknown covariance matrix (for which \pkg{rstanarm}
 #'   introduces an innovative prior distribution). MCMC provides more
 #'   appropriate estimates of uncertainty for models that consist of a mix of
-#'   common and group-specific parameters.
+#'   common and group-specific parameters. The same line or argument applies even
+#'   more in the case of \code{stan_lmList} mentioned in the first stanza of this
+#'   section.
 #'  }
 #'  \item{\code{\link{stan_gamm4}}}{
 #'   Similar to \code{\link[gamm4]{gamm4}} in the \pkg{gamm4} package, which 
diff --git a/R/stan_biglm.R b/R/stan_biglm.R
index 9b9f9547a..0b5a2dd66 100644
--- a/R/stan_biglm.R
+++ b/R/stan_biglm.R
@@ -27,11 +27,17 @@
 #'   intercept and must utilize \emph{centered} but not \emph{standardized}
 #'   predictors. See the Details section or the Example.
 #' @param xbar A numeric vector of means in the implicit design matrix for
-#'   the observations included in the model
+#'   the observations included in the model or --- in the case of 
+#'   \code{stan_biglm.fit} only --- a list of such vectors with one list element
+#'   for each group
 #' @param ybar A numeric scalar indicating the same mean of the outcome for
-#'   the observations included in the model
+#'   the observations included in the model or --- in the case of 
+#'   \code{stan_biglm.fit} only --- a numeric vector of such means with one
+#'   element for each group
 #' @param s_y A numeric scalar indicating the unbiased sample standard deviation
-#'   of the outcome for the observations included in the model
+#'   of the outcome for the observations included in the model or --- in the case 
+#'   of \code{stan_biglm.fit} only --- a numeric vector of such standard deviations
+#'   with one element for each group
 #' @param has_intercept A logical scalar indicating whether to add an intercept
 #'   to the model when estimating it
 #' @template args-dots
@@ -69,6 +75,14 @@
 #'   The sample mean and sample standard deviation of the outcome must also
 #'   be passed.
 #'   
+#'   The \code{stan_biglm} function calls \code{stan_biglm.fit}, although the
+#'   latter can be called directly. The first seven arguments to the
+#'   \code{stan_biglm.fit} may be lists of the same length where the list length
+#'   is equal to the number of mutually exclusive and exhaustive groups that
+#'   the data have been stratified by. The \code{\link{stan_lmList}} function
+#'   provides a more conventional interface for a stratified linear regression
+#'   model and calls \code{stan_biglm.fit} internally.
+#'   
 #' @return The output of both \code{stan_biglm} and \code{stan_biglm.fit} is an object of
 #'   \code{\link[rstan]{stanfit-class}} rather than \code{\link{stanreg-objects}}, 
 #'   which is more limited and less convenient but necessitated by the fact that 
@@ -98,7 +112,7 @@ stan_biglm <- function(biglm, xbar, ybar, s_y, has_intercept = TRUE, ...,
   R <- sqrt(biglm$qr$D) * R
   return(stan_biglm.fit(b, R, SSR = biglm$qr$ss, N = biglm$n, xbar, ybar, s_y, has_intercept, 
                         ...,
-                        prior = prior, prior_intercept = prior_intercept,
+                        prior = prior, prior_intercept = prior_intercept, kappa_mean = 0,
                         prior_PD = prior_PD, algorithm = algorithm, 
                         adapt_delta = adapt_delta))
 }
diff --git a/R/stan_biglm.fit.R b/R/stan_biglm.fit.R
index 655e42bb2..bfe327c5e 100644
--- a/R/stan_biglm.fit.R
+++ b/R/stan_biglm.fit.R
@@ -17,10 +17,17 @@
 
 #' @rdname stan_biglm
 #' @export
-#' @param b A numeric vector of OLS coefficients, excluding the intercept
-#' @param R A square upper-triangular matrix from the QR decomposition of the design matrix
+#' @param b A numeric vector of OLS coefficients --- excluding the intercept --- 
+#'   or a list of such vectors with one list element for each group
+#' @param R A square upper-triangular matrix from the QR decomposition of the 
+#'   centered design matrix or a list of such matrices with one list element for
+#'   each group
 #' @param SSR A numeric scalar indicating the sum-of-squared residuals for OLS
-#' @param N A integer scalar indicating the number of included observations
+#'   or a numeric vector of sums-of-squared residuals with one element for each
+#'   group
+#' @param N A integer scalar indicating the number of included observations or
+#'   an integer array of sample sizes with one element for each group
+#' @template args-kappa_mean   
 #' @examples
 #' # create inputs
 #' ols <- lm(mpg ~ wt + qsec + am - 1, # next line is critical for centering
@@ -39,23 +46,41 @@
 #' cbind(lm = b, stan_lm = rstan::get_posterior_mean(post)[14:16]) # shrunk
 stan_biglm.fit <- function(b, R, SSR, N, xbar, ybar, s_y, has_intercept = TRUE, ...,
                            prior = R2(stop("'location' must be specified")), 
-                           prior_intercept = NULL, prior_PD = FALSE, 
+                           prior_intercept = NULL, kappa_mean = 1, prior_PD = FALSE,
                            algorithm = c("sampling", "meanfield", "fullrank"),
                            adapt_delta = NULL) {
-  
-  J <- 1L
-  N <- array(N, c(J))
-  K <- ncol(R)
-  cn <- names(xbar)
-  if (is.null(cn)) cn <- names(b)
-  R_inv <- backsolve(R, diag(K))
-  JK <- c(J, K)
-  xbarR_inv <- array(c(xbar %*% R_inv), JK)
-  Rb <- array(R %*% b, JK)
-  SSR <- array(SSR, J)
-  s_Y <- array(s_y, J)
-  center_y <- if (isTRUE(all.equal(matrix(0, J, K), xbar))) ybar else 0.0
-  ybar <- array(ybar, J)
+  if (is.list(b)) {
+    J <- length(b)
+    K <- ncol(R[[1]])
+    cn <- names(xbar[[1]])
+    if (is.null(cn)) cn <- names(b[[1]])
+    I <- diag(K)
+    R_inv <- sapply(R, simplify = FALSE, FUN = backsolve, x = I)
+    center_y <- 0.0
+    xbarR_inv <- Rb <- vector("list", J)
+    names(xbarR_inv) <- names(Rb) <- names(b)
+    for (j in 1:J) {
+      xbarR_inv[[j]] <- c(xbar[[j]] %*% R_inv[[j]])
+      Rb[[j]] <- c(R[[j]] %*% b[[j]])
+    }
+  }
+  else {
+    J <- 1L
+    N <- array(N, c(J))
+    K <- ncol(R)
+    cn <- names(xbar)
+    if (is.null(cn)) cn <- names(b)
+    I <- diag(K)
+    R_inv <- backsolve(R, I)
+    JK <- c(J, K)
+    xbarR_inv <- array(c(xbar %*% R_inv), JK)
+    Rb <- array(R %*% b, JK)
+    SSR <- array(SSR, J)
+    s_y <- array(s_y, J)
+    center_y <- if (isTRUE(all.equal(matrix(0, J, K), xbar))) ybar else 0.0
+    ybar <- array(ybar, J)
+    dim(R_inv) <- c(J, dim(R_inv))
+  }
   
   if (!length(prior)) {
     prior_dist <- 0L
@@ -77,23 +102,28 @@ stan_biglm.fit <- function(b, R, SSR, N, xbar, ybar, s_y, has_intercept = TRUE,
     if (is.null(prior_scale_for_intercept))
       prior_scale_for_intercept <- 0
   }
-  dim(R_inv) <- c(J, dim(R_inv))
   
   # initial values
-  R2 <- array(1 - SSR[1] / ((N - 1) * s_Y^2), J)
+  if (J == 1) R2 <- array(1 - SSR[1] / ((N - 1) * s_y^2), J)
+  else R2 <- 1 - SSR / ((N - 1) * s_y^2)
   log_omega <- array(0, ifelse(prior_PD == 0, J, 0))
   init_fun <- function(chain_id) {
     out <- list(R2 = R2, log_omega = log_omega)
     if (has_intercept == 0L) out$z_alpha <- double()
+    if (J == 1) {
+      out$SMC <- double()
+      out$mu <- array(0, c(0, K))
+      out$kappa <- double()
+    }
     return(out)
   }
   stanfit <- stanmodels$lm
   standata <- nlist(K, has_intercept, prior_dist,
                     prior_dist_for_intercept, 
                     prior_mean_for_intercept,
-                    prior_scale_for_intercept,
+                    prior_scale_for_intercept, kappa_mean,
                     prior_PD, eta, J, N, xbarR_inv,
-                    ybar, center_y, s_Y, Rb, SSR, R_inv)
+                    ybar, center_y, s_Y = s_y, Rb, SSR, R_inv)
   pars <- c(if (has_intercept) "alpha", "beta", "sigma", 
             if (prior_PD == 0) "log_omega", "R2", "mean_PPD")
   algorithm <- match.arg(algorithm)
@@ -109,9 +139,18 @@ stan_biglm.fit <- function(b, R, SSR, N, xbar, ybar, s_y, has_intercept = TRUE,
       init = init_fun, data = standata, pars = pars, show_messages = FALSE)
     stanfit <- do.call(sampling, sampling_args)
   }
-  new_names <- c(if (has_intercept) "(Intercept)", cn, "sigma", 
-                 if (prior_PD == 0) "log-fit_ratio", 
-                 "R2", "mean_PPD", "log-posterior")
+  if (J == 1) new_names <- c(if (has_intercept) "(Intercept)", cn, "sigma", 
+                             if (prior_PD == 0) "log-fit_ratio", 
+                             "R2", "mean_PPD", "log-posterior")
+  else {
+    group_names <- names(b)
+    new_names <- c(if (has_intercept) paste0("(Intercept):", group_names),
+                   t(sapply(group_names, FUN = function(g) paste0(cn, ":", g))),
+                   paste0("sigma:", group_names),
+                   if (prior_PD == 0) paste0("log-fit_ratio:", group_names),
+                   paste0("R2:", group_names), paste0("mean_PPD:", group_names),
+                   "log-posterior")
+  }
   stanfit@sim$fnames_oi <- new_names
   return(stanfit)
 }
diff --git a/R/stan_lm.fit.R b/R/stan_lm.fit.R
index 6eaff7df4..366071f14 100644
--- a/R/stan_lm.fit.R
+++ b/R/stan_lm.fit.R
@@ -62,6 +62,7 @@ stan_lm.wfit <- function(x, y, w, offset = NULL, singular.ok = TRUE, ...,
                         N = nrow(x), xbar = xbar, ybar = ybar, s_y = sd(y),
                         has_intercept = has_intercept, ...,
                         prior = prior, prior_intercept = prior_intercept,
+                        kappa_mean = 0,
                         prior_PD = prior_PD, algorithm = algorithm, 
                         adapt_delta = adapt_delta))
   
diff --git a/R/stan_lmList.R b/R/stan_lmList.R
new file mode 100644
index 000000000..49176f0a1
--- /dev/null
+++ b/R/stan_lmList.R
@@ -0,0 +1,107 @@
+# Part of the rstanarm package for estimating model parameters
+# Copyright (C) 2016 Trustees of Columbia University
+# 
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 3
+# of the License, or (at your option) any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+
+#' Bayesian regularized linear but stratified models via Stan
+#' 
+#' This model uses the same likelihood and syntax as the  \code{\link[lme4]{lmList}} 
+#' function in the \pkg{lme4} package, which allows the parameters of a linear
+#' regression model to vary according to a mutually exclusive and exhaustive grouping
+#' factor, but differs fundamentally in that group-level parameters have hyperpriors
+#' that shrink --- to some estimated extent --- the group-level parameters toward
+#' common parameters. Thus, \code{stan_lmList} is more like a \code{\link[lme4]{lmer}}
+#' model with a sinking grouping variable that pertains too all the regression
+#' coefficients and the intercept (and, unlike \code{\link[lme4]{lmer}}, the
+#' standard deviation of the errors).
+#' 
+#' @export
+#' @param formula,data,family Same as in \code{\link[lme4]{lmList}} but 
+#'   \code{family} must be omitted or \code{\link[stats]{gaussian}}
+#' @param subset,weights,na.action,offset Same as in \code{\link[lme4]{lmList}}
+#'   but rarely specified
+#' @template args-dots
+#' @param prior,prior_intercept Same as in \code{\link{stan_lm}}
+#' @template args-kappa_mean
+#' @template args-prior_PD
+#' @template args-algorithm
+#' @template args-adapt_delta
+#' 
+#' @examples 
+#' \dontrun{
+#' post <- stan_lmList(mpg ~ disp + wt + hp | cyl, data = mtcars, 
+#'                     prior = R2(0.25))
+#' coef(post) # posterior medians stratified by number of cylinders
+#' }
+stan_lmList <- function(formula, data, family, subset, weights, na.action, offset, 
+                        ..., prior = R2(stop("'location' must be specified")), 
+                        prior_intercept = NULL, kappa_mean = 1, prior_PD = FALSE, 
+                        algorithm = c("sampling", "meanfield", "fullrank"),
+                        adapt_delta = NULL) {
+  f <- as.character(formula)
+  f <- as.formula(paste(f[2], "~", "-1 + (", f[3], ")"))
+  isGLM <- !missing(family)
+  if (isGLM) stop("GLMs are not currently supported so omit the 'family' argument")
+  mCall <- match.call(expand.dots = FALSE)
+  mCall$prior <- mCall$prior_intercept <- mCall$prior_PD <- mCall$algorithm <- 
+    mCall$adapt_delta <- mCall$`...` <- NULL
+  mCall$pool <- FALSE
+  mCall$y <- TRUE
+  mCall[[1L]] <- quote(lme4::lmList)
+  lms <- eval.parent(mCall)
+  group_names <- levels(lms@groups)
+  has_intercept <- identical(attr(lms@.Data[[1L]]$terms, "intercept"), 1L)
+  if (has_intercept) {
+    b <- lapply(lms@.Data, FUN = function(ols) coef(ols)[-1L])
+    X <- lapply(lms@.Data, FUN = function(ols) model.matrix(ols)[,-1L, drop = FALSE])
+  }
+  else {
+    b <- lapply(lms@.Data, FUN = coef)
+    X <- lapply(lms@.Data, FUN = model.matrix)
+  }
+  names(b) <- names(X) <- group_names
+  R <- sapply(X, simplify = FALSE, FUN = function(x) {
+    qr.R(qr(sweep(x, MARGIN = 2, STATS = colMeans(x))))
+  })
+  SSR <- sapply(lms@.Data, FUN = function(ols) crossprod(residuals(ols))[1])
+  names(SSR) <- group_names
+  N <- sapply(X, FUN = NROW)
+  xbar <- sapply(X, simplify = FALSE, FUN = colMeans)
+  ybar <- sapply(lms@.Data, FUN = function(ols) mean(ols$y))
+  s_y  <- sapply(lms@.Data, FUN = function(ols)   sd(ols$y))
+  names(ybar) <- names(s_y) <- group_names
+  algorithm <- match.arg(algorithm)
+  stanfit <- stan_biglm.fit(b, R, SSR, N, xbar, ybar, s_y, has_intercept, ..., 
+                            prior = prior, prior_intercept = prior_intercept,
+                            kappa_mean = kappa_mean, prior_PD = prior_PD, 
+                            algorithm = algorithm, adapt_delta = adapt_delta)
+  g <- glFormula(f, data)
+  modelframe <- g$fr
+  Z <- t(g$reTrms$Zt)
+  K <- NCOL(X[[1]]) + has_intercept
+  colnames(Z) <- c(sapply(group_names, FUN = function(g) paste0(1:K, ":", g)))
+  Z <- Z[,sort(colnames(Z))]
+  fit <- nlist(stanfit, family = gaussian(), formula, offset = NULL, weights = NULL,
+               x = Z, y = modelframe[,1], data = if (missing("data")) environment(formula) else data,
+               prior.info = prior, 
+               algorithm, call = match.call(), terms = attr(g$fr, "terms"),
+               model = NULL,
+               na.action = attr(modelframe, "na.action"),
+               contrasts = attr(modelframe, "contrasts"))
+  out <- stanreg(fit)
+  out$groups <- lms@groups
+  class(out) <- c("stanreg", "lmList")
+  return(out)
+}
diff --git a/R/stanreg-methods.R b/R/stanreg-methods.R
index 577407ca8..298e89b83 100644
--- a/R/stanreg-methods.R
+++ b/R/stanreg-methods.R
@@ -85,7 +85,16 @@ NULL
 coef.stanreg <- function(object, ...) {
   if (is.mer(object)) 
     return(coef_mer(object, ...))
-  
+  if (is.lmList(object)) {
+    J <- nlevels(object$groups)
+    beta <- object$coefficients
+    mark <- grep("(Intercept)", names(beta), fixed = TRUE)
+    alpha <- beta[mark]
+    mat <- cbind(alpha, matrix(beta[-mark], nrow = J, byrow = TRUE))
+    rownames(mat) <- levels(object$groups)
+    colnames(mat) <- unique(gsub(":.*$", "", names(beta)))
+    return(mat)
+  }
   object$coefficients
 }
 
@@ -272,8 +281,17 @@ coef_mer <- function(object, ...) {
 #' @export
 #' @export fixef
 #' @importFrom lme4 fixef
-#' 
 fixef.stanreg <- function(object, ...) {
+  if (is.lmList(object)) {
+    means <- get_posterior_mean(object$stanfit)
+    means <- means[,ncol(means)]
+    nms <- names(means)
+    J <- nlevels(object$groups)
+    FE <- colMeans(cbind(means[grep("^alpha", nms)],
+                   matrix(means[grep("^beta", nms)], nrow = J, byrow = TRUE)))
+    names(FE) <- colnames(coef(object))
+    return(FE)
+  }
   coefs <- object$coefficients
   coefs[b_names(names(coefs), invert = TRUE)]
 }
@@ -293,6 +311,20 @@ ngrps.stanreg <- function(object, ...) {
 #' @importFrom lme4 ranef
 #' 
 ranef.stanreg <- function(object, ...) {
+  if (is.lmList(object)) {
+    means <- get_posterior_mean(object$stanfit)
+    means <- means[,ncol(means)]
+    nms <- names(means)
+    J <- nlevels(object$groups)
+    effs <- cbind(means[grep("^alpha", nms)],
+                  matrix(means[grep("^beta", nms)], nrow = J, byrow = TRUE))
+    colnames(effs) <- colnames(coef(object))
+    effs <- as.data.frame(sweep(effs, MARGIN = 2, STATS = colMeans(effs)))
+    attr(effs, "label") <- "Random effects"
+    attr(effs, "standardized") <- FALSE
+    class(effs) <- "ranef.lmList"
+    return(effs)
+  }
   all_names <- if (used.optimizing(object))
     rownames(object$stan_summary) else object$stanfit@sim$fnames_oi
   sel <- b_names(all_names)
@@ -327,6 +359,11 @@ ranef.stanreg <- function(object, ...) {
 #'   importFrom(lme4,sigma)
 #'
 sigma.stanreg <- function(object, ...) {
+  if (is.lmList(object)) {
+    message("this could take a little while to simulate")
+    return(sqrt(mean( (posterior_predict(object) - 
+                       as.matrix(posterior_linpred(object)))^2 )))
+  }
   if (!("sigma" %in% rownames(object$stan_summary))) 
     return(1)
   
diff --git a/cleanup b/cleanup
index bf4430d39..e94582df8 100755
--- a/cleanup
+++ b/cleanup
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/sh -e
 
 # Note to Windows users: This is not actually platform specific.
 mkdir -p src/include
diff --git a/cleanup.win b/cleanup.win
index bf4430d39..e94582df8 100755
--- a/cleanup.win
+++ b/cleanup.win
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/sh -e
 
 # Note to Windows users: This is not actually platform specific.
 mkdir -p src/include
diff --git a/exec/lm.stan b/exec/lm.stan
index 1868363ab..59da1d8c2 100644
--- a/exec/lm.stan
+++ b/exec/lm.stan
@@ -23,6 +23,72 @@ functions {
       N * (log(sigma) + 0.91893853320467267);
     return target();
   }
+
+  /**
+   * Modded Bessel function of the first kind in log units 
+   * @param x Real non-negative scalar
+   * @param v Real non-negative order but may be integer
+   */
+  real log_besselI(real x, real v) {
+    real log_half_x;
+    real lfac;
+    real lgam;
+    real lcons;
+    real biggest;
+    real piece;
+    real smallest;
+    real summand;
+    int m;
+    if (x < 0) reject("x is assumed to be non-negative")
+    if (v < 0) reject("v is assumed to be non-negative");
+    log_half_x = log(0.5 * x);
+    lfac = 0;
+    lgam = lgamma(v + 1);
+    lcons = v * log_half_x;
+    biggest = -lgam + lcons;
+    piece = positive_infinity();
+    smallest = -745.13321910194122211; // exp(smallest) > 0 minimally
+    summand = 0.0;
+    m = 1;
+    while (piece > smallest) {
+      lfac = lfac + log(m);
+      lgam = lgam + log(v + m);
+      lcons = lcons + 2 * log_half_x;
+      piece = -lfac - lgam + lcons - biggest;
+      summand = summand + exp(piece);
+      m = m + 1;
+    }
+    return biggest + log1p(summand);
+  }
+
+  /**
+   * von Mises-Fisher distribution in log units 
+   * @param u J-array of unit vectors
+   * @param mu Unit vector that is the expectation for u
+   * @param kappa Non-negative concentration parameter
+   */
+  real VMF(vector[] u, vector mu, real kappa) {
+    int p;
+    real half_p;
+    real half_pm1;
+    int J;
+    real out;
+    p = rows(mu);
+    if (p < 2) reject("p must be >= 2");
+    half_p = 0.5 * p;
+    J = size(u);
+    if (kappa == 0) return -J * half_p * 1.8378770664093453391; 
+    half_pm1 = half_p - 1;
+    out = 0;
+    for (j in 1:J) out = out + dot_product(mu, u[j]);
+    out = out * kappa;
+    if (p == 2) 
+      return J * (-1.8378770664093453391 - log(modified_bessel_first_kind(0, kappa))) + out;
+    if (p == 3) 
+      return J * (log(kappa) - log(12.566370614359172464 * sinh(kappa))) + out;
+    return J * ( half_pm1 * log(kappa) - half_p * 1.8378770664093453391 
+                  - log_besselI(kappa, half_pm1) ) + out;
+  }
 }
 data {
   int<lower=0,upper=1> has_intercept; // 0 = no, 1 = yes
@@ -30,6 +96,7 @@ data {
   real<lower=0> prior_scale_for_intercept;       // 0 = by CLT
   real prior_mean_for_intercept;      // expected value for alpha
   int<lower=0,upper=1> prior_dist;    // 0 = uniform for R^2, 1 = Beta(K/2,eta)
+  real<lower=0> kappa_mean;           // prior expectation of concentration parameter
   int<lower=0,upper=1> prior_PD;      // 0 = no, 1 = yes to drawing from the prior
   real<lower=0> eta;                  // shape hyperparameter
   
@@ -59,6 +126,9 @@ parameters { // must not call with init="0"
   unit_vector[K] u[J];                  // primitives for coefficients
   real z_alpha[J * has_intercept];      // primitives for intercepts
   real<lower=0,upper=1> R2[J];          // proportions of variance explained
+  real<lower=0,upper=1> SMC[J > 1];     // proportion  of variance explained overall
+  unit_vector[K] mu[J > 1];             // global location parameter
+  real<lower=0> kappa[J > 1];           // unscaled concentration parameter
   vector[J * (1 - prior_PD)] log_omega; // under/overfitting factors
 }
 transformed parameters {
@@ -97,12 +167,22 @@ model {
                                has_intercept == 1 ? alpha[j] + shift : shift,
                                ybar[j], SSR[j], sigma[j], N[j]);
     }
-    // implicit: u[j] is uniform on the surface of a hypersphere
   }
   if (has_intercept == 1 && prior_dist_for_intercept > 0) 
     target += normal_lpdf(z_alpha | 0, 1);
-  if (prior_dist == 1) target += beta_lpdf(R2 | half_K, eta);
+  if (J == 1) {
+    if (prior_dist == 1) target += beta_lpdf(R2 | half_K, eta);
+    // implicit: u[1] is uniform on the surface of a hypersphere
+  }
+  else {
+    target += beta_lpdf(R2 | half_K, half_K * (inv(SMC[1]) - 1));
+    if (prior_dist == 1) target += beta_lpdf(SMC[1] | half_K, eta);
+    target += VMF(u, mu[1], kappa_mean * kappa[1]);
+    // implicit: mu is uniform on the surface of a hyperspere
+    target += exponential_lpdf(kappa | 1);
+  }
   // implicit: log_omega is uniform over the real line for all j
+  
 }
 generated quantities {
   real mean_PPD[J];
diff --git a/man-roxygen/args-kappa_mean.R b/man-roxygen/args-kappa_mean.R
new file mode 100644
index 000000000..b9ed69dbd
--- /dev/null
+++ b/man-roxygen/args-kappa_mean.R
@@ -0,0 +1,6 @@
+#' @param kappa_mean A positive scalar indicating the expectation of the concentration
+#'   parameter in a von Mises-Fisher distribution where the random variable in question
+#'   is each group's unit vector underlying its regression coefficients. See the vignette
+#'   for \code{\link{stan_lm}} for more details. Defaults to \eqn{1} but is ignored if
+#'   there is only one group. A higher value produces more shrinkage toward the common unit
+#'   vector while a lower value makes the groups closer to independent.
diff --git a/tests/testthat/test_stan_functions.R b/tests/testthat/test_stan_functions.R
index f59493bd0..b0d157fda 100644
--- a/tests/testthat/test_stan_functions.R
+++ b/tests/testthat/test_stan_functions.R
@@ -296,6 +296,49 @@ test_that("ll_mvn_ols_qr_lp returns expected results", {
                                              SSR, sigma, N)))
 })
 
+context("lm")
+test_that("log_besselI returns expected results", {
+  x <- exp(1)
+  v <- 10 * pi
+  expect_equal(log(besselI(x, v)), log_besselI(x, v))
+})
+
+context("lm")
+test_that("VMF returns expected results", {
+  dVMF <- function(u, mu, kappa) {
+    p <- length(mu)
+    if (is.list(u)) J <- length(u)
+    else J <- 1
+    log_C <- (0.5 * p - 1) * log(kappa) - 0.5 * p * log(2 * pi) - log(besselI(kappa, 0.5 * p - 1))
+    if (J > 1) return(log_C * J + kappa * sum(sapply(u, `%*%`, x = mu)))
+    else return(log_C + kappa * c(mu %*% u))
+  }
+  kappa <- rexp(1)
+  mu <- rnorm(4)
+  mu <- mu / sqrt(crossprod(mu)[1])
+  J <- 3
+  u <- lapply(1:J, FUN = function(j) {
+    u <- rnorm(length(mu))
+    u <- u / sqrt(crossprod(u)[1])
+    return(u)
+  })
+  expect_equal(dVMF(u, mu, kappa), VMF(u, mu, kappa))
+  mu <- rnorm(3)
+  mu <- mu / sqrt(crossprod(mu)[1])
+  J <- 4
+  u <- lapply(1:J, FUN = function(j) {
+    u <- rnorm(length(mu))
+    u <- u / sqrt(crossprod(u)[1])
+    return(u)
+  })
+  expect_equal(dVMF(u, mu, kappa), VMF(u, mu, kappa))
+  mu <- rnorm(2)
+  mu <- mu / sqrt(crossprod(mu)[1])
+  u <- rnorm(length(mu))
+  u <- u / sqrt(crossprod(u)[1])
+  expect_equal(dVMF(u, mu, kappa), VMF(list(u), mu, kappa))
+})
+
 # polr
 links <- c("logistic", "probit", "loglog", "cloglog", "cauchit")
 context("polr")
diff --git a/tests/testthat/test_stan_lm.R b/tests/testthat/test_stan_lm.R
index dcd13e797..9971a370b 100644
--- a/tests/testthat/test_stan_lm.R
+++ b/tests/testthat/test_stan_lm.R
@@ -92,6 +92,11 @@ test_that("stan_lm throws error if glmer syntax used", {
                regexp = "model formula not allowed")
 })
 
+test_that("stan_lmList does not throw an error", {
+  stan_lmList(mpg ~ disp + wt + hp | cyl, data = mtcars, 
+                       prior = R2(0.5, "mean"), chains = 1, iter = 1)
+})
+
 context("stan_aov")
 test_that("stan_aov returns expected result for npk example", {
   fit <- stan_aov(yield ~ block + N*P*K, data = npk, contrasts = "contr.poly",
diff --git a/vignettes/children/SETTINGS-rstan.txt b/vignettes/children/SETTINGS-rstan.txt
index 607cd5771..2a450e1f5 100644
--- a/vignettes/children/SETTINGS-rstan.txt
+++ b/vignettes/children/SETTINGS-rstan.txt
@@ -3,4 +3,5 @@ ITER <- 500L
 CHAINS <- 2L
 CORES <- 2L
 SEED <- 12345
+set.seed(SEED)
 ```
diff --git a/vignettes/lm.Rmd b/vignettes/lm.Rmd
index 3112f5b43..a35628c50 100644
--- a/vignettes/lm.Rmd
+++ b/vignettes/lm.Rmd
@@ -282,35 +282,112 @@ We can compare the two approaches using an approximation to Leave-One-Out
 __loo__ package.
 
 ```{r lm-clouds-loo, warning=TRUE}
-(loo_post <- loo(post))
-(loo(simple))
+(loo_post <- loo(post, k_threshold = 0.7))
+(loo_simple <- loo(simple, k_threshold = 0.7))
 ```
 
 The results indicate that the first approach is expected to produce better
-out-of-sample predictions but the Warning messages are at least as important. 
-Many of the estimated shape parameters for the Generalized Pareto distribution 
-are above $0.5$ in the model with Cauchy priors, which indicates that these 
+out-of-sample predictions. We specified `k_threshold = 0.7` because $7$ of the
+original estimated shape parameters for the Generalized Pareto distribution 
+were above $0.7$ in the model with Cauchy priors, which indicates that these 
 estimates are only going to converge slowly to the true out-of-sample deviance 
 measures. Thus, with only $24$ observations, they should not be considered 
-reliable. The more complicated prior derived above is stronger --- as 
-evidenced by the fact that the effective number of parameters is about half
-of that in the simpler approach and $12$ for the maximum likelihood estimator 
---- and only has $4$ out of $24$ shape estimates in the "danger zone". We 
-might want to reexamine these observations
-
+reliable. By specifying `k_threshold = 0.7`, the model is refit $7$ times, 
+leaving out each of the problematic observations in turn, and the prediction
+of the left-out observation is incorporated into the `elpd_loo`, `p_loo`, and
+`looic` estimates.
+
+The more complicated prior derived above is stronger --- as evidenced by the 
+fact that the effective number of parameters is about $6.3$ as compared to
+$10.3$ for the simpler approach and $12$ for the maximum likelihood estimator.
+A plot reveals that there are a couple of observation whose estimated shape
+parameter for the Generalized Pareto distribution but these can be ignored.
 ```{r lm-clouds-plot-loo}
 plot(loo_post, label_points = TRUE)
 ```
 
-because the posterior is sensitive to them but, overall, the results seem 
-tolerable.
-
 In general, we would expect the joint prior derived here to work better 
 when there are many predictors relative to the number of observations.
 Placing independent, heavy-tailed priors on the coefficients neither
 reflects the beliefs of the researcher nor conveys enough information to
 stabilize all the computations.
 
+# Hierarchical Approach
+
+Another approach would be to allow _all_ the parameters to vary by group.
+In this case, the natural grouping is by treatment and control but the
+`stan_lmList` function --- like the `lmList` function in the __lme4__
+package it is based on --- permits any number of levels on a single grouping
+factor. However, `stan_lmList` differs fundamentally from `lmList` because
+the latter estimates Ordinary Least Squares for each stratum of the data
+defined by the levels of the grouping variable whereas `stan_lmList` estimates
+a model with the data from all groups simultaneously but puts a hierarchical
+prior on the group-specific unknowns.
+
+Recall that one of the primitive parameters in `stan_lm` is $\mathbf{u}$, which
+was assumed to be uniformally distributed on the surface of a hypersphere. For
+`stan_lmList`, we index $\mathbf{u}_j$ by group and place a von Mises-Fisher
+prior on it, whose PDF is
+$$f\left(\mathbf{u}_j | \boldsymbol{\mu}, \kappa \right) = 
+\frac{\kappa^{\frac{K}{2} - 1}}{\left(2 \pi\right)^{\frac{K}{2}} I_{\frac{K}{2} - 1}\left(\kappa\right)}
+e^{\kappa \boldsymbol{\mu}^\top \mathbf{u}_j}$$
+where $\mathbf{\mu}$ is the prior mean on the surface of a hypersphere
+$\kappa \geq 0$ is a concentration parameter, and 
+$I_{\frac{K}{2} - 1}\left(\kappa\right)$ is the modified Bessel function of the
+first kind. If $\kappa = 0$, then this density is constant and the greater is
+$\kappa$, the more each $\mathbf{u}_j$ is shrunk toward $\mathbf{\mu}$. Of course,
+$\mathbf{\mu}$ and $\kappa$ are unknown, need to be estimated, and require 
+priors. The prior for $\mathbf{\mu}$ is uniform on the surface of a hypersphere
+and cannot be changed in `stan_lmList`, while the prior for $\kappa$ is 
+exponential whose mean is specified by the `kappa_mean` argument to `stan_lmList`, 
+which by default is $1$.
+
+We also specify a hierarchical prior on $R_j^2$, which is the proportion of 
+variance in the outcome for the $j$-th group that is explained by the model. Recall
+that in the `stan_lm` case, we could calculate the second shape parameter, $\eta$,
+from the user's prior expectation about $R^2$. In the `stan_lmList` case, we
+calculate $\eta$ from $\lambda = \mathbb{E}\left[R_j^2\right]$ and estimate $\lambda$.
+Consequently, in the `stan_lmList` function, the user needs to express prior
+beliefs about the location of $\lambda$, which can be interpreted as the average
+$R^2$ across groups.
+
+Since $R_j^2$, $\ln \omega_j$, and $s_{Y_j}$ all differ by group, $\sigma_j$
+differs by group. Also, the intercept, $\alpha_j$, differs by group. Thus, when
+the user prints the results of a model estimated by `stan_lmList`, all of the
+parameters are suffixed by `:j` where `j` can be an integer or preferably a
+string indicating the group.
+
+The `formula` argument to the `stan_lmList` function is somewhat like that for
+the `stan_lmer` function, which builds off the `lmer` function in the __lme4__
+package and specifies the grouping variable to the right of a `|`. There is
+more information on this syntax in the [vignette](glmer.html) for generalized
+linear models with group-specific terms. However, for `stan_lmList` only one
+grouping term can (and must) be specified. In the cloud-hacking example, it
+would look like
+```{r, lm-clouds-grouped, results="hide"}
+grouped <- stan_lmList(rainfall ~ sne + cloudcover + prewetness + echomotion
+                       + time | seeding, data = clouds, 
+                       prior = R2(location = 0.2),
+                       chains = CHAINS, cores = CORES, seed = SEED)
+grouped
+```
+```{r, print-lm-clouds-grouped, echo = FALSE}
+grouped
+```
+Notice now that $\sigma$ for the treated group differs (somewhat) from
+$\sigma$ for the control group (at least at the medians) whereas in the 
+previous two models $\sigma$ was constrained to be equal for both groups.
+We can use the `compare` function to compare these models using the LOO
+criterion
+```{r, loo-3}
+compare(loo_post, loo_simple, loo(grouped))
+```
+As can be seen, the model estimated by `stan_lmList` has fewer effective
+parameters than the model estimated by `stan_glm` but more effective
+parameters than the model estimated by `stan_lm`. However, the fit to the
+data is worse with `stan_lmList` in this case so there is no reason to 
+prefer it over `stan_lm`.
+
 # Conclusion
 
 This vignette has discussed the prior distribution utilized in the `stan_lm`

From a79f29fd1508760e22d2c81eaead3a0e10d58263 Mon Sep 17 00:00:00 2001
From: Ben Goodrich <goodrich.ben@gmail.com>
Date: Sat, 16 Jul 2016 15:39:43 -0400
Subject: [PATCH 02/12] stan_lmList.R: add details

---
 R/stan_lmList.R | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/R/stan_lmList.R b/R/stan_lmList.R
index 49176f0a1..725363498 100644
--- a/R/stan_lmList.R
+++ b/R/stan_lmList.R
@@ -38,7 +38,23 @@
 #' @template args-prior_PD
 #' @template args-algorithm
 #' @template args-adapt_delta
+#' @details 
+#' The options for the priors are essentially the same as in \code{\link{stan_lm}}.
+#' However, in this case the \code{prior} argument pertains to the unknown expected
+#' \eqn{R^2} across groups where each group is allowed to have its own \eqn{R^2}
+#' along with every other parameter. The coefficients are partially pooled using
+#' a von Mises-Fisher prior on the group-specific unit vectors. The expecation of
+#' this von Mises-Fisher distribution is uniformally distributed on the surface of
+#' a hypersphere, but the concentration parameter, \code{kappa}, has an exponential
+#' prior with mean specified by the \code{kappa_mean} argument. Higher values for
+#' \code{kappa_mean} induce more pooling and lower values induce less pooling.
 #' 
+#' The posterior distribution for each group's \eqn{R^2} is a \emph{within-group}
+#' quantity, i.e. the proportion of variance in that group that is explained by the
+#' predictors. If the posterior distribution of the intercepts differs from group
+#' to group than the proportion of variance in the \emph{entire sample} that is
+#' explained by the model is not estimated but will be larger than the average
+#' \emph{within-group} \eqn{R^2}.
 #' @examples 
 #' \dontrun{
 #' post <- stan_lmList(mpg ~ disp + wt + hp | cyl, data = mtcars, 

From e24497bb751aeefe0bbb5784bd68627837b76959 Mon Sep 17 00:00:00 2001
From: Ben Goodrich <goodrich.ben@gmail.com>
Date: Sat, 16 Jul 2016 15:54:19 -0400
Subject: [PATCH 03/12] lm.Rmd: clarify effective parameters count

---
 vignettes/lm.Rmd | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/vignettes/lm.Rmd b/vignettes/lm.Rmd
index a35628c50..4f9db56fc 100644
--- a/vignettes/lm.Rmd
+++ b/vignettes/lm.Rmd
@@ -384,9 +384,10 @@ compare(loo_post, loo_simple, loo(grouped))
 ```
 As can be seen, the model estimated by `stan_lmList` has fewer effective
 parameters than the model estimated by `stan_glm` but more effective
-parameters than the model estimated by `stan_lm`. However, the fit to the
-data is worse with `stan_lmList` in this case so there is no reason to 
-prefer it over `stan_lm`.
+parameters than the model estimated by `stan_lm` due to the need to
+estimate $\mathbf{\mu}$ and $\kappa$ plus additional $R^2$ parameters. 
+However, the fit to the data is worse with `stan_lmList` in this case so 
+there is no reason to prefer it over `stan_lm`.
 
 # Conclusion
 

From 0a464c4eb3d6df04bf8398324f9ec8af84d044b1 Mon Sep 17 00:00:00 2001
From: Ben Goodrich <goodrich.ben@gmail.com>
Date: Sat, 16 Jul 2016 16:00:31 -0400
Subject: [PATCH 04/12] SETTINGS-rstan.txt: use only 1 core again

---
 vignettes/children/SETTINGS-rstan.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vignettes/children/SETTINGS-rstan.txt b/vignettes/children/SETTINGS-rstan.txt
index 2a450e1f5..de94c6b3b 100644
--- a/vignettes/children/SETTINGS-rstan.txt
+++ b/vignettes/children/SETTINGS-rstan.txt
@@ -1,7 +1,7 @@
 ```{r, SETTINGS-rstan, include=FALSE}
 ITER <- 500L
 CHAINS <- 2L
-CORES <- 2L
+CORES <- 1L
 SEED <- 12345
 set.seed(SEED)
 ```

From 667d8f9f8209a5fd4664ced1b802002cb6691baf Mon Sep 17 00:00:00 2001
From: Ben Goodrich <goodrich.ben@gmail.com>
Date: Mon, 18 Jul 2016 01:58:32 -0400
Subject: [PATCH 05/12] lm.stan: fix unreasonable memory use

---
 exec/lm.stan     | 47 +++++++++++++++++++++++++++++++++++++++++++----
 vignettes/lm.Rmd |  2 +-
 2 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/exec/lm.stan b/exec/lm.stan
index 59da1d8c2..b031d92fe 100644
--- a/exec/lm.stan
+++ b/exec/lm.stan
@@ -31,25 +31,64 @@ functions {
    */
   real log_besselI(real x, real v) {
     real log_half_x;
+    real lfac_0;
     real lfac;
     real lgam;
+    real lgam_0;
     real lcons;
+    real lcons_0;
     real biggest;
     real piece;
     real smallest;
     real summand;
     int m;
+    int biggest_m;
     if (x < 0) reject("x is assumed to be non-negative")
     if (v < 0) reject("v is assumed to be non-negative");
     log_half_x = log(0.5 * x);
-    lfac = 0;
-    lgam = lgamma(v + 1);
-    lcons = v * log_half_x;
+    lfac_0 = 0;
+    lfac = lfac_0;
+    lgam_0 = lgamma(v + 1);
+    lgam = lgam_0;
+    lcons_0 = v * log_half_x;
+    lcons = lcons_0;
     biggest = -lgam + lcons;
-    piece = positive_infinity();
+    piece = biggest;
     smallest = -745.13321910194122211; // exp(smallest) > 0 minimally
     summand = 0.0;
     m = 1;
+    while (piece >= biggest) { // find maximum for log-sum-exp
+      biggest = piece;
+      lfac = lfac + log(m);
+      lgam = lgam + log(v + m);
+      lcons = lcons + 2 * log_half_x;
+      piece = -lfac - lgam + lcons;
+      m = m + 1;
+    }
+    if (m == 2) summand = exp(piece - biggest);
+    else if (m > 1000) return x - 0.5 * log(2 * pi() * x);
+    else { // start over with interior biggest
+      lfac = lfac_0;
+      lgam = lgam_0;
+      lcons = lcons_0;
+      piece = -lgam + lcons - biggest;
+      summand = exp(piece);
+      biggest_m = m - 2;
+      m = 1;
+      while (m < biggest_m) {
+        lfac = lfac + log(m);
+        lgam = lgam + log(v + m);
+        lcons = lcons + 2 * log_half_x;
+        piece = -lfac - lgam + lcons - biggest;
+        if (piece > smallest) summand = summand + exp(piece);
+        m = m + 1;
+      }
+      // skip over biggest
+      lfac = lfac + log(m);
+      lgam = lgam + log(v + m);
+      lcons = lcons + 2 * log_half_x;
+      m = m + 1;
+    }
     while (piece > smallest) {
       lfac = lfac + log(m);
       lgam = lgam + log(v + m);
diff --git a/vignettes/lm.Rmd b/vignettes/lm.Rmd
index 4f9db56fc..fb54255f8 100644
--- a/vignettes/lm.Rmd
+++ b/vignettes/lm.Rmd
@@ -368,7 +368,7 @@ would look like
 grouped <- stan_lmList(rainfall ~ sne + cloudcover + prewetness + echomotion
                        + time | seeding, data = clouds, 
                        prior = R2(location = 0.2),
-                       chains = CHAINS, cores = CORES, seed = SEED)
+                       chains = CHAINS, cores = CORES, seed = SEED, iter = ITER)
 grouped
 ```
 ```{r, print-lm-clouds-grouped, echo = FALSE}

From c4d5f6e96f704488c8dbafbdb566789a96ebf5ac Mon Sep 17 00:00:00 2001
From: Ben Goodrich <goodrich.ben@gmail.com>
Date: Mon, 18 Jul 2016 10:46:05 -0400
Subject: [PATCH 06/12] SETTINGS-rstan.txt: use 2 cores again

---
 vignettes/children/SETTINGS-rstan.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vignettes/children/SETTINGS-rstan.txt b/vignettes/children/SETTINGS-rstan.txt
index de94c6b3b..2a450e1f5 100644
--- a/vignettes/children/SETTINGS-rstan.txt
+++ b/vignettes/children/SETTINGS-rstan.txt
@@ -1,7 +1,7 @@
 ```{r, SETTINGS-rstan, include=FALSE}
 ITER <- 500L
 CHAINS <- 2L
-CORES <- 1L
+CORES <- 2L
 SEED <- 12345
 set.seed(SEED)
 ```

From 0e4d6a71c15fb22cffcb82e8834d1ed18d213a12 Mon Sep 17 00:00:00 2001
From: Ben Goodrich <goodrich.ben@gmail.com>
Date: Mon, 18 Jul 2016 10:46:24 -0400
Subject: [PATCH 07/12] .travis.yml: try imposing virtual memory limits

---
 .travis.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.travis.yml b/.travis.yml
index c31f66e26..bdc70bcb2 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -19,6 +19,9 @@ before_install:
   - sed -i.bak "s/ g++/ ${CXX}${CLANG_EXTRA_ARG}/" ~/.R/Makevars
   - sed -i.bak "s/O[0-3]/O$CXX_OLEVEL/" ~/.R/Makevars
 
+before_script:
+  - ulimit -v 800000
+
 after_script:
   - tar -ztvf rstanarm_*.tar.gz
   - echo ${NOT_CRAN}

From 3fd8e9b5fbdfc9e23ea637cd3bce87d672ad517e Mon Sep 17 00:00:00 2001
From: Ben Goodrich <goodrich.ben@gmail.com>
Date: Mon, 18 Jul 2016 12:17:47 -0400
Subject: [PATCH 08/12] .travis.yml: try installing matrixStats from the
 develop branch

---
 .travis.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.travis.yml b/.travis.yml
index bdc70bcb2..84f948e21 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -5,6 +5,7 @@ cache: packages
 
 r_github_packages:
   - jimhester/covr
+  - HenrikBengtsson/matrixStats@develop
 
 env:
   matrix: 

From 6aedecc14ae7ca855512c5eb484ab7c792c3396a Mon Sep 17 00:00:00 2001
From: Ben Goodrich <goodrich.ben@gmail.com>
Date: Mon, 18 Jul 2016 13:31:31 -0400
Subject: [PATCH 09/12] .travis.yml: try a different ulimit

---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 84f948e21..5abe5a50e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -21,7 +21,7 @@ before_install:
   - sed -i.bak "s/O[0-3]/O$CXX_OLEVEL/" ~/.R/Makevars
 
 before_script:
-  - ulimit -v 800000
+  - ulimit -v 7000000
 
 after_script:
   - tar -ztvf rstanarm_*.tar.gz

From f8369734818e1f910e99b955b09e378584091187 Mon Sep 17 00:00:00 2001
From: Ben Goodrich <goodrich.ben@gmail.com>
Date: Mon, 18 Jul 2016 13:53:52 -0400
Subject: [PATCH 10/12] SETTINGS-rstan.txt: try 1 core again

---
 vignettes/children/SETTINGS-rstan.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vignettes/children/SETTINGS-rstan.txt b/vignettes/children/SETTINGS-rstan.txt
index 2a450e1f5..de94c6b3b 100644
--- a/vignettes/children/SETTINGS-rstan.txt
+++ b/vignettes/children/SETTINGS-rstan.txt
@@ -1,7 +1,7 @@
 ```{r, SETTINGS-rstan, include=FALSE}
 ITER <- 500L
 CHAINS <- 2L
-CORES <- 2L
+CORES <- 1L
 SEED <- 12345
 set.seed(SEED)
 ```

From 9a30101b46e62c42ea43041628eb457bc2c08007 Mon Sep 17 00:00:00 2001
From: Ben Goodrich <goodrich.ben@gmail.com>
Date: Mon, 18 Jul 2016 17:23:21 -0400
Subject: [PATCH 11/12] lm.stan: Reimplement log_besselI() again to reduce
 memory consumption

---
 .travis.yml    |   3 --
 R/stanmodels.R |   2 +
 exec/lm.stan   | 100 ++++++++++++++++++++++++++++++++++++-------------
 3 files changed, 75 insertions(+), 30 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 5abe5a50e..3e26074de 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -20,9 +20,6 @@ before_install:
   - sed -i.bak "s/ g++/ ${CXX}${CLANG_EXTRA_ARG}/" ~/.R/Makevars
   - sed -i.bak "s/O[0-3]/O$CXX_OLEVEL/" ~/.R/Makevars
 
-before_script:
-  - ulimit -v 7000000
-
 after_script:
   - tar -ztvf rstanarm_*.tar.gz
   - echo ${NOT_CRAN}
diff --git a/R/stanmodels.R b/R/stanmodels.R
index 4cd776954..4a1f1895b 100644
--- a/R/stanmodels.R
+++ b/R/stanmodels.R
@@ -28,7 +28,9 @@ stanmodels <- sapply(stan_files, function(f) {
     isystem <- file.path("inst", "chunks")
   if (!file.exists(file.path(isystem, "common_functions.stan")))
     isystem <- file.path("..", "inst", "chunks")
+  sink(tempfile())
   stanfit <- rstan::stanc_builder(f, isystem)
+  sink(NULL)
   stanfit$model_cpp <- list(model_cppname = stanfit$model_name, 
                             model_cppcode = stanfit$cppcode)
   return(do.call(methods::new, args = c(stanfit[-(1:3)], Class = "stanmodel", 
diff --git a/exec/lm.stan b/exec/lm.stan
index b031d92fe..3831c17ef 100644
--- a/exec/lm.stan
+++ b/exec/lm.stan
@@ -25,7 +25,65 @@ functions {
   }
 
   /**
-   * Modded Bessel function of the first kind in log units 
+   * Approximation of lgamma(z); see
+   *
+   * https://en.wikipedia.org/wiki/Stirling%27s_approximation#Versions_suitable_for_calculators
+   *
+   * @param z Positive scalar
+   */
+  real Nemes(real z)
+    return 0.5 * log(6.283185307179586232 / z) + z * (log(z + 1 / (12 * z - 0.1 / z)) - 1);
+      
+  int signum(real x)
+    return x < 0 ? -1 : x > 0;
+
+  /**
+   * Customized version of the bisection method to find when log-sum-exp can terminate; see
+   *
+   * https://en.wikipedia.org/wiki/Bisection_method#Algorithm
+   *
+   * This is only called from the log_besselI() function below
+   *
+   * @param v Order for log_besselI() function
+   * @param biggest Pivot in the log-sum-exp
+   * @param biggest_m Integer indicating what iteration reaches the pivot
+   * @param log_half_x Real number equal to log(0.5 * x)
+   */
+  int bisection(real v, real biggest, int biggest_m, real log_half_x) {
+    int c;
+    int a;
+    int b;
+    int N;
+    real smallest;
+    real val_a;
+    a = biggest_m;
+    b = 2147483647; // maximum integer
+    N = 1;
+    smallest = -745.13321910194122211;
+    val_a = -smallest;
+    while (N <= 100) {
+      real val;
+      c = a / 2 + b / 2;
+      if (a == (b - 1)) return c;
+      else {
+        val = -2 * Nemes(c) + (2 * c + v) * log_half_x - biggest - smallest;
+        if (round(val) == 0) return c;
+        else if (signum(val) == signum(val_a)) {
+          a = c;
+          val_a = val;
+        }
+        else b = c;
+        N = N + 1;
+      }
+    }
+    return c;
+  }
+  
+  /**
+   * Modded Bessel function of the first kind in log units
+   *
+   * This implementation is designed to not overflow or eat too much RAM
+   *
    * @param x Real non-negative scalar
    * @param v Real non-negative order but may be integer
    */
@@ -40,11 +98,11 @@ functions {
     real biggest;
     real piece;
     real smallest;
-    real summand;
     int m;
-    int biggest_m;
+    int cutoff;
     if (x < 0) reject("x is assumed to be non-negative")
     if (v < 0) reject("v is assumed to be non-negative");
+    if (x > 700) return x - 0.5 * log(2 * pi() * x);
     log_half_x = log(0.5 * x);
     lfac_0 = 0;
     lfac = lfac_0;
@@ -55,9 +113,8 @@ functions {
     biggest = -lgam + lcons;
     piece = biggest;
     smallest = -745.13321910194122211; // exp(smallest) > 0 minimally
-    summand = 0.0;
     m = 1;
-    while (piece >= biggest) { // find maximum for log-sum-exp
+    while (piece >= biggest) { // find pivot for log-sum-exp
       biggest = piece;
       lfac = lfac + log(m);
       lgam = lgam + log(v + m);
@@ -65,41 +122,30 @@ functions {
       piece = -lfac - lgam + lcons;
       m = m + 1;
     }
-    if (m == 2) summand = exp(piece - biggest);
-    else if (m > 1000) return x - 0.5 * log(2 * pi() * x);
-    else { // start over with interior biggest
+    // find next m such that exp(...) = 0
+    cutoff = bisection(v, biggest, m - 1, log_half_x);
+    { // start over from m = 0 and fill up summands
+      vector[cutoff + 1] summands;
       lfac = lfac_0;
       lgam = lgam_0;
       lcons = lcons_0;
       piece = -lgam + lcons - biggest;
-      summand = exp(piece);
-      biggest_m = m - 2;
+      summands[1] = exp(piece);
       m = 1;
-      while (m < biggest_m) {
+      while (m <= cutoff) {
         lfac = lfac + log(m);
         lgam = lgam + log(v + m);
         lcons = lcons + 2 * log_half_x;
         piece = -lfac - lgam + lcons - biggest;
-        if (piece > smallest) summand = summand + exp(piece);
         m = m + 1;
+        summands[m] = exp(piece);
       }
-      // skip over biggest
-      lfac = lfac + log(m);
-      lgam = lgam + log(v + m);
-      lcons = lcons + 2 * log_half_x;
-      m = m + 1;
-    }
-    while (piece > smallest) {
-      lfac = lfac + log(m);
-      lgam = lgam + log(v + m);
-      lcons = lcons + 2 * log_half_x;
-      piece = -lfac - lgam + lcons - biggest;
-      summand = summand + exp(piece);
-      m = m + 1;
+      return biggest + log(sum(summands));
     }
-    return biggest + log1p(summand);
+    reject ("should never reach here");
+    return negative_infinity();
   }
-
+  
   /**
    * von Mises-Fisher distribution in log units 
    * @param u J-array of unit vectors

From ea566921712e56fa0300c2cc1a6d24d6009fa42c Mon Sep 17 00:00:00 2001
From: Ben Goodrich <goodrich.ben@gmail.com>
Date: Mon, 18 Jul 2016 21:30:58 -0400
Subject: [PATCH 12/12] stanmodels.R: really suppress parser warnings

---
 R/stanmodels.R                        | 6 +++---
 exec/lm.stan                          | 6 ++----
 vignettes/children/SETTINGS-rstan.txt | 2 +-
 3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/R/stanmodels.R b/R/stanmodels.R
index 4a1f1895b..db20d9b67 100644
--- a/R/stanmodels.R
+++ b/R/stanmodels.R
@@ -28,9 +28,9 @@ stanmodels <- sapply(stan_files, function(f) {
     isystem <- file.path("inst", "chunks")
   if (!file.exists(file.path(isystem, "common_functions.stan")))
     isystem <- file.path("..", "inst", "chunks")
-  sink(tempfile())
-  stanfit <- rstan::stanc_builder(f, isystem)
-  sink(NULL)
+  tf <- tempfile()
+  capture.output(stanfit <- rstan::stanc_builder(f, isystem), 
+                 file = tf, type = "message")
   stanfit$model_cpp <- list(model_cppname = stanfit$model_name, 
                             model_cppcode = stanfit$cppcode)
   return(do.call(methods::new, args = c(stanfit[-(1:3)], Class = "stanmodel", 
diff --git a/exec/lm.stan b/exec/lm.stan
index 3831c17ef..f59a7ced4 100644
--- a/exec/lm.stan
+++ b/exec/lm.stan
@@ -25,7 +25,7 @@ functions {
   }
 
   /**
-   * Approximation of lgamma(z); see
+   * Approximation of lgamma(z) for not-too-small z; see
    *
    * https://en.wikipedia.org/wiki/Stirling%27s_approximation#Versions_suitable_for_calculators
    *
@@ -59,7 +59,7 @@ functions {
     a = biggest_m;
     b = 2147483647; // maximum integer
     N = 1;
-    smallest = -745.13321910194122211;
+    smallest = -745.13321910194122211; // exp(smallest) > 0 minimally
     val_a = -smallest;
     while (N <= 100) {
       real val;
@@ -97,7 +97,6 @@ functions {
     real lcons_0;
     real biggest;
     real piece;
-    real smallest;
     int m;
     int cutoff;
     if (x < 0) reject("x is assumed to be non-negative")
@@ -112,7 +111,6 @@ functions {
     lcons = lcons_0;
     biggest = -lgam + lcons;
     piece = biggest;
-    smallest = -745.13321910194122211; // exp(smallest) > 0 minimally
     m = 1;
     while (piece >= biggest) { // find pivot for log-sum-exp
       biggest = piece;
diff --git a/vignettes/children/SETTINGS-rstan.txt b/vignettes/children/SETTINGS-rstan.txt
index de94c6b3b..2a450e1f5 100644
--- a/vignettes/children/SETTINGS-rstan.txt
+++ b/vignettes/children/SETTINGS-rstan.txt
@@ -1,7 +1,7 @@
 ```{r, SETTINGS-rstan, include=FALSE}
 ITER <- 500L
 CHAINS <- 2L
-CORES <- 1L
+CORES <- 2L
 SEED <- 12345
 set.seed(SEED)
 ```