Imputing categorical data by predictive mean matching #576

Merged — 5 commits merged on Aug 8, 2023
1 change: 1 addition & 0 deletions NAMESPACE
@@ -216,6 +216,7 @@ importFrom(stats,C)
importFrom(stats,aggregate)
importFrom(stats,as.formula)
importFrom(stats,binomial)
importFrom(stats,cancor)
importFrom(stats,coef)
importFrom(stats,complete.cases)
importFrom(stats,confint)
15 changes: 15 additions & 0 deletions NEWS.md
@@ -1,3 +1,18 @@
**Imputing categorical data by predictive mean matching**. Predictive mean matching (PMM) is the default method of `mice()` for imputing numerical variables, but it has long been possible to impute factors. This enhancement introduces better support for working with categorical variables in PMM. The **former system** translated factors into integers by `ynum <- as.integer(f)`. However, the order of the integers in `ynum` may have no sensible interpretation for an unordered factor. The **new system** quantifies the categories and may yield better results because of a higher $R^2$. The method calculates the canonical correlation between `y` (expressed as a dummy matrix) and a linear combination of the imputation model predictors `x`. The algorithm then replaces each category of `y` by a single number taken from the first canonical variate. After this step, the imputation model is fitted, and the predicted values from that model serve as the similarity measure for the matching step.

The method works for both ordered and unordered factors. No special precautions are taken to ensure monotonicity between the category numbers and the quantifications, so the method should be able to preserve quadratic and other non-monotone relations with the predicted metric. It may be beneficial to remove very sparsely filled categories, for which there is a new `trim` argument.

All you have to do to use the new technique is specify `mice(..., method = "pmm", ...)`. Both numerical and categorical variables will then be imputed by PMM.

Potential advantages are:

- Simpler and faster than fitting a generalised linear model, e.g., logistic regression or the proportional odds model;
- Should be insensitive to the order of categories;
- No need to solve problems with perfect prediction;
- Should inherit the good statistical properties of predictive mean matching.

Note that we still lack solid evidence for these claims (#576). Contributed by @stefvanbuuren
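The quantification step can be sketched in a few lines of base R. This is an illustrative toy example of the idea, not the internal `mice` code; the data and variable names are made up. As in the new code, `x` carries an explicit intercept column and centering is switched off, so the first canonical pair is absorbed by the constant and the second column of `xcoef` holds the informative variate:

```r
# Toy data: an unordered factor y and a predictor matrix x with intercept
set.seed(1)
n <- 100
x <- cbind(1, age = rnorm(n), hgt = rnorm(n))
y <- factor(sample(c("a", "b", "c"), n, replace = TRUE))

# Dummy-code y and compute the canonical correlation with x
yd <- model.matrix(~ 0 + y)
cca <- cancor(yd, x, xcenter = FALSE, ycenter = FALSE)

# Replace each category by its score on the first informative variate;
# these quantifications then feed into the usual PMM regression
ynum <- as.vector(scale(yd %*% cca$xcoef[, 2L]))
tapply(ynum, y, unique)  # one number per category
```

Every observation in the same category receives exactly the same quantification, so the factor is reduced to a single numeric column before the usual `.norm.draw()` step.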

# mice 3.16.3

* **New system-independent method for pooling**: This version introduces a new function `pool.table()` that takes a tidy table of parameter estimates stemming from `m` repeated analyses. The input data must consist of three columns (parameter name, estimate, standard error) and a specification of the degrees of freedom of the model fitted to the complete data. The `pool.table()` function outputs 14 pooled statistics in a tidy form. The primary use of `pool.table()` is to support parameter pooling for techniques that have no `tidy()` or `glance()` methods, either within `R` or outside `R`. The `pool.table()` function also allows for novel workflows that 1) break apart the traditional `pool()` function into a data-wrangling part and a parameter-reducing part, and 2) do not necessarily depend on classed R objects. (#574). Contributed by @stefvanbuuren
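The reduction that such a pooling step performs on a three-column table follows Rubin's rules. A hand-rolled sketch on made-up numbers for a single parameter (illustrative only, not the actual `pool.table()` implementation):

```r
# Estimates and standard errors of one parameter from m = 3 imputed analyses
est <- c(1.10, 0.95, 1.03)
se  <- c(0.21, 0.24, 0.22)
m   <- length(est)

qbar <- mean(est)                 # pooled point estimate
ubar <- mean(se^2)                # within-imputation variance
b    <- var(est)                  # between-imputation variance
tot  <- ubar + (1 + 1 / m) * b    # total variance (Rubin's rules)
c(estimate = qbar, std.error = sqrt(tot))
```

The extra `(1 + 1/m) * b` term is what inflates the pooled standard error relative to a single complete-data analysis.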
2 changes: 1 addition & 1 deletion R/imports.R
@@ -11,7 +11,7 @@
#' @importFrom Rcpp evalCpp
#' @importFrom rlang .data syms
#' @importFrom rpart rpart rpart.control
#' @importFrom stats C aggregate as.formula binomial coef
#' @importFrom stats C aggregate as.formula binomial cancor coef
#' complete.cases confint
#' contr.treatment cor df.residual fitted
#' formula gaussian getCall
99 changes: 80 additions & 19 deletions R/mice.impute.pmm.R
@@ -8,7 +8,16 @@
#' (\code{TRUE}) and missing values (\code{FALSE}) in \code{y}.
#' @param x Numeric design matrix with \code{length(y)} rows with predictors for
#' \code{y}. Matrix \code{x} may have no missing values.
#' @param exclude Value or vector of values to exclude from the imputation donor pool in \code{y}
#' @param exclude Dependent values to exclude from the imputation model
#' and the collection of donor values
#' @param quantify Logical. If \code{TRUE}, factor levels are replaced
#' by the first canonical variate before fitting the imputation model.
#' If \code{FALSE}, the procedure reverts to the old behaviour and takes the
#' integer codes (which may lack a sensible interpretation).
#' Relevant only if \code{y} is a factor.
#' @param trim Scalar integer. Minimum number of observations required in a
#' category in order to be considered as a potential donor value.
#' Relevant only if \code{y} is a factor.
#' @param wy Logical vector of length \code{length(y)}. A \code{TRUE} value
#' indicates locations in \code{y} for which imputations are created.
#' @param donors The size of the donor pool among which a draw is made.
@@ -116,33 +125,72 @@
#' imp <- mice(boys, method = "pmm", print = FALSE, blots = blots, seed=123)
#' blots$hgt$exclude %in% unlist(c(imp$imp$hgt)) # MUST be all FALSE
#' blots$tv$exclude %in% unlist(c(imp$imp$tv)) # MUST be all FALSE
#'
#' # Factor quantification
#' xname <- c("age", "hgt", "wgt")
#' br <- boys[c(1:10, 101:110, 501:510, 601:620, 701:710), ]
#' r <- stats::complete.cases(br[, xname])
#' x <- br[r, xname]
#' y <- factor(br[r, "tv"])
#' ry <- !is.na(y)
#' table(y)
#'
#' # impute factor by optimizing canonical correlation y, x
#' mice.impute.pmm(y, ry, x)
#'
#' # only categories with at least 2 cases can be donor
#' mice.impute.pmm(y, ry, x, trim = 2L)
#'
#' # in addition, eliminate category 20
#' mice.impute.pmm(y, ry, x, trim = 2L, exclude = 20)
#'
#' # to get the old behaviour: as.integer(y)
#' mice.impute.pmm(y, ry, x, quantify = FALSE)
#' @export
mice.impute.pmm <- function(y, ry, x, wy = NULL, donors = 5L,
                            matchtype = 1L, exclude = NULL,
                            quantify = TRUE, trim = 1L,
                            ridge = 1e-05, use.matcher = FALSE, ...) {
  if (is.null(wy)) {
    wy <- !ry
  }

# Reformulate the imputation problem such that
# 1. the imputation model disregards records with excluded y-values
# 2. the donor set does not contain excluded y-values

# Keep sparse categories out of the imputation model
if (is.factor(y)) {
active <- !ry | y %in% (levels(y)[table(y) >= trim])
y <- y[active]
ry <- ry[active]
x <- x[active, , drop = FALSE]
wy <- wy[active]
}
# Keep excluded values out of the imputation model
if (!is.null(exclude)) {
active <- !ry | !y %in% exclude
y <- y[active]
ry <- ry[active]
x <- x[active, , drop = FALSE]
wy <- wy[active]
}

x <- cbind(1, as.matrix(x))

# quantify categories for factors
ynum <- y
if (is.factor(y)) {
ynum <- as.integer(y)
if (quantify) {
ynum <- quantify(y, ry, x)
} else {
ynum <- as.integer(y)
}
}

# parameter estimation
parm <- .norm.draw(ynum, ry, x, ridge = ridge, ...)

if (matchtype == 0L) {
yhatobs <- x[ry, , drop = FALSE] %*% parm$coef
yhatmis <- x[wy, , drop = FALSE] %*% parm$coef
@@ -160,6 +208,7 @@ mice.impute.pmm <- function(y, ry, x, wy = NULL, donors = 5L,
} else {
idx <- matchindex(yhatobs, yhatmis, donors)
}

return(y[ry][idx])
}

@@ -211,3 +260,15 @@ mice.impute.pmm <- function(y, ry, x, wy = NULL, donors = 5L,
m <- sample(y[d <= ds[donors]], 1)
return(m)
}

quantify <- function(y, ry, x) {
  # Replace the (reduced set of) observed categories by an optimal scaling:
  # score each level of y on a canonical variate of the dummy-coded y
  # and the predictor matrix x
  yf <- factor(y[ry], exclude = NULL)
  yd <- model.matrix(~ 0 + yf)
  xd <- x[ry, , drop = FALSE]
  cca <- cancor(yd, xd, xcenter = FALSE, ycenter = FALSE)
  # take the second column: with centering switched off, the first
  # canonical pair is absorbed by the intercept column of x
  ynum <- as.integer(y)
  ynum[ry] <- scale(as.vector(yd %*% cca$xcoef[, 2L]))
  # plot(y[ry], ynum[ry])
  return(ynum)
}
38 changes: 36 additions & 2 deletions man/mice.impute.pmm.Rd


83 changes: 83 additions & 0 deletions tests/testthat/test-mice.impute.pmm.R
@@ -29,3 +29,86 @@ imp2 <- mice(nhanes, printFlag = FALSE, seed = 123, exclude = c(-1, 1032))
test_that("excluding unobserved values does not impact pmm", {
expect_identical(imp1$imp, imp2$imp)
})

context("optimal scaling")

# Factor quantification
y <- factor(br[r, "tv"])

# impute factor by optimizing canonical correlation y, x
data1 <- data.frame(y, x)
test_that("cancor proceeds normally", {
expect_silent(imp1 <- mice(data1, method = "pmm", remove.collinear = FALSE, eps = 0,
maxit = 1, m = 1, print = FALSE, seed = 1))
})

# > cca$xcoef[, 2L]
# yf6 yf8 yf10 yf12 yf13 yf15 yf16 yf20 yf25
# 0.8458 -0.0469 -0.3182 -0.3458 0.0825 0.0799 0.0383 -0.0517 -0.0460

# include duplicate x
data2 <- data1
for (i in 2:25) {
  data2[[paste0("age", i)]] <- data2$age
}

# impute factor by optimizing canonical correlation y, x
test_that("cancor proceeds normally with many duplicates", {
expect_warning(imp2 <- mice(data2, method = "pmm", remove.collinear = FALSE, eps = 0,
maxit = 1, m = 1, seed = 1, print = FALSE))
})

# add junk variables
data3 <- data1
for (i in 1:25) {
  data3[[paste0("j", i)]] <- rnorm(nrow(data3))
}


test_that("cancor with many junk variables does not crash", {
expect_warning(imp3 <- mice(data3, method = "pmm", remove.collinear = FALSE, eps = 0,
maxit = 1, m = 1, seed = 1, print = FALSE))
})