Merge pull request #525 from SebKrantz/development

Development
SebKrantz · Jan 11, 2024 · 6558a31 · 6558a31
2 parents 2239d2f + a0bcc61
commit 6558a31
Show file tree

Hide file tree

Showing 8 changed files with 36 additions and 24 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: collapse
 Title: Advanced and Fast Data Transformation
 Version: 2.0.9
-Date: 2024-01-10
+Date: 2024-01-11
 Authors@R: c(
            person("Sebastian", "Krantz", role = c("aut", "cre"), 
                   email = "[email protected]", 

diff --git a/NAMESPACE b/NAMESPACE
@@ -345,6 +345,8 @@ importFrom("stats", "as.formula", "complete.cases", "cor", "cov", "var", "pt",
  export(allNA)
  export(missing_cases)
  export(na_rm)
+ export(na_locf)
+ export(na_focb)
  export(na_omit)
  export(na_insert)
  export(massign)

diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,6 @@
 # collapse 2.0.9
 
-* `replace_na()` now has a `type` argument which supports options `"locf"` and `"focb"` (default `"const"`), similar to `data.table::nafill`. The `replace_na()` implementation also supports character data and list-columns (`NULL/empty` elements). Thanks @BenoitLondon for suggesting (#489). 
+* Added functions `na_locf()` and `na_focb()`, fast basic C implementations of last-observation-carried-forward and first-observation-carried-back (optionally by reference). `replace_na()` now also has a `type` argument which supports options `"locf"` and `"focb"` (default `"const"`), similar to `data.table::nafill`. The implementation also supports character data and list-columns (`NULL/empty` elements). Thanks @BenoitLondon for suggesting (#489). Note that `na_locf()` also exists in some other packages (such as *imputeTS*), where it is implemented in R and has additional options. Users should utilize the flexible namespace, i.e., `set_collapse(remove = "na_locf")`, to deal with this. 
 
 * Fixed a bug in weighted quantile estimation (`fquantile()`) that could lead to wrong/out-of-range estimates in some cases. Thanks @zander-prinsloo for reporting (#523). 
 

diff --git a/R/recode_replace.R b/R/recode_replace.R
@@ -239,10 +239,14 @@ recode_char <- function(X, ..., default = NULL, missing = NULL, regex = FALSE,
 }
 
 
-na_locf <- function(x, ph1, ph2, set = FALSE) .Call(C_na_locf_focb, x, 1L, set)
-na_focb <- function(x, ph1, ph2, set = FALSE) .Call(C_na_locf_focb, x, 2L, set)
+na_locf <- function(x, set = FALSE) .Call(C_na_locf, x, set)
+na_focb <- function(x, set = FALSE) .Call(C_na_focb, x, set)
+
+na_locf_ph <- function(x, ph1, ph2, set = FALSE) .Call(C_na_locf, x, set)
+na_focb_ph <- function(x, ph1, ph2, set = FALSE) .Call(C_na_focb, x, set)
+
 replace_na <- function(X, value = 0L, cols = NULL, set = FALSE, type = "const") {
-  FUN <- switch(type, const =, value = scv, locf = na_locf, focb = na_focb,
+  FUN <- switch(type, const =, value = scv, locf = na_locf_ph, focb = na_focb_ph,
                 stop("Unknown type:", type))
   if(set) {
     if(is.list(X)) {

diff --git a/man/efficient-programming.Rd b/man/efficient-programming.Rd
@@ -27,6 +27,8 @@
 \alias{fdim}
 \alias{missing_cases}
 \alias{na_rm}
+\alias{na_locf}
+\alias{na_focb}
 \alias{na_omit}
 \alias{na_insert}
 \alias{seq_row}
@@ -61,8 +63,9 @@ X \%+=\% V                    # Infix for setop(X, "+", V). See also Note (2)
 X \%-=\% V                    # Infix for setop(X, "-", V). See also Note (2)
 X \%*=\% V                    # Infix for setop(X, "*", V). See also Note (2)
 X \%/=\% V                    # Infix for setop(X, "/", V). See also Note (2)
-na_rm(x)                    # Fast: if(anyNA(x)) x[!is.na(x)] else x,
-                            # also removes NULL / empty elements from list
+na_rm(x)                    # Fast: if(anyNA(x)) x[!is.na(x)] else x.
+na_locf(x, set = FALSE)     # Last obs. carried forward / first obs. carried back
+na_focb(x, set = FALSE)     # (optionally by reference). All support lists (NULL/empty)
 na_omit(X, cols = NULL,     # Faster na.omit for matrices and data frames,
         na.attr = FALSE,    # can use selected columns to check, attach indices,
         prop = 0, ...)      # and remove cases with a proportion of values missing
@@ -88,6 +91,7 @@ cinv(x)                     # Choleski (fast) inverse of symmetric PD matrix, e.
   \item{x, v}{a (atomic) vector or matrix (\code{na_rm} also supports lists).}
   \item{value}{a single value of any (atomic) vector type. For \code{whichv} it can also be a \code{length(x)} vector.}
   \item{invert}{logical. \code{TRUE} considers elements \code{x != value}.}
+  \item{set}{logical. \code{TRUE} transforms \code{x} by reference.}
   \item{simplify}{logical. If \code{value} is a length-1 atomic vector, \code{alloc()} with \code{simplify = TRUE} returns a length-n atomic vector. If \code{simplify = FALSE}, the result is always a list.}
   \item{vind1}{logical. If \code{length(v) == 1L}, setting \code{vind1 = TRUE} will interpret \code{v} as an index, rather than a value to search and replace.}
   \item{xlist}{logical. If \code{X} is a list, the default is to treat it like a data frame and replace rows. Setting \code{xlist = TRUE} will treat \code{X} and its replacement \code{R} like 1-dimensional list vectors.}

diff --git a/src/ExportSymbols.c b/src/ExportSymbols.c
@@ -131,7 +131,8 @@ static const R_CallMethodDef CallEntries[] = {
   {"C_pivot_wide", (DL_FUNC) &pivot_wide, 5},
   {"C_sort_merge_join", (DL_FUNC) &sort_merge_join, 4},
   {"C_replace_outliers", (DL_FUNC) &replace_outliers, 5},
-  {"C_na_locf_focb", (DL_FUNC) &na_locf_focb, 3},
+  {"C_na_locf", (DL_FUNC) &na_locf, 2},
+  {"C_na_focb", (DL_FUNC) &na_focb, 2},
   // {"C_aschar", (DL_FUNC) &CasChar, 1},
   {"C_subsetDT", (DL_FUNC) &subsetDT, 4},
   {"C_subsetVector", (DL_FUNC) &subsetVector, 3},

diff --git a/src/collapse_c.h b/src/collapse_c.h
@@ -101,7 +101,8 @@ SEXP pivot_long(SEXP data, SEXP ind, SEXP idcol);
 SEXP pivot_wide(SEXP index, SEXP id, SEXP column, SEXP fill, SEXP Rnthreads);
 SEXP sort_merge_join(SEXP x, SEXP table, SEXP ot, SEXP count);
 SEXP replace_outliers(SEXP x, SEXP limits, SEXP value, SEXP single_limit, SEXP set);
-SEXP na_locf_focb(SEXP x, SEXP Rtype, SEXP Rset);
+SEXP na_locf(SEXP x, SEXP Rset);
+SEXP na_focb(SEXP x, SEXP Rset);
 SEXP multi_match(SEXP m, SEXP g);
 // fnobs rewritten in C:
 SEXP fnobsC(SEXP x, SEXP Rng, SEXP g);

diff --git a/src/programming.c b/src/programming.c
@@ -725,8 +725,11 @@ SEXP replace_outliers(SEXP x, SEXP limits, SEXP value, SEXP single_limit, SEXP s
   return res;
 }
 
-void na_locf(SEXP x) {
-  int n = length(x);
+SEXP na_locf(SEXP x, SEXP Rset) {
+  int n = length(x), copy = asLogical(Rset) == 0;
+  if(isMatrix(x)) warning("na_locf() does not (yet) have explicit support for matrices, i.e., it treats a matrix as a single vector. Use dapply(M, na_locf) if column-wise processing is desired");
+  if(copy) x = PROTECT(shallow_duplicate(x));
+
   switch (TYPEOF(x)) {
   case INTSXP:
   case LGLSXP:
@@ -782,12 +785,17 @@ void na_locf(SEXP x) {
     break;
   }
   default:
-    error("na_locf does not support type '%s'", type2char(TYPEOF(x)));
+    error("na_locf() does not support type '%s'", type2char(TYPEOF(x)));
   }
+  UNPROTECT(copy);
+  return x;
 }
 
-void na_focb(SEXP x) {
-  int n = length(x);
+SEXP na_focb(SEXP x, SEXP Rset) {
+  int n = length(x), copy = asLogical(Rset) == 0;
+  if(isMatrix(x)) warning("na_focb() does not (yet) have explicit support for matrices, i.e., it treats a matrix as a single vector. Use dapply(M, na_focb) if column-wise processing is desired");
+  if(copy) x = PROTECT(shallow_duplicate(x));
+
   switch (TYPEOF(x)) {
   case INTSXP:
   case LGLSXP:
@@ -843,21 +851,13 @@ void na_focb(SEXP x) {
     break;
   }
   default:
-    error("na_focb does not support type '%s'", type2char(TYPEOF(x)));
+    error("na_focb() does not support type '%s'", type2char(TYPEOF(x)));
   }
-}
-
-SEXP na_locf_focb(SEXP x, SEXP Rtype, SEXP Rset) {
-  int copy = asLogical(Rset) == 0, type = asInteger(Rtype);
-  if(isMatrix(x)) warning("na_locf/focb do not yet have explicit support for matrices, i.e., they treat a matrix as a single vector. Use dapply(M, replace_na, type = 'locf') if column-wise processing is desired");
-  if(copy) x = PROTECT(shallow_duplicate(x));
-  if(type == 1) na_locf(x);
-  else if(type == 2) na_focb(x);
-  else error("Internal error, unknown locf/focb 'type': %d", type);
   UNPROTECT(copy);
   return x;
 }
 
+
 SEXP vtypes(SEXP x, SEXP isnum) {
   int tx = TYPEOF(x);
   if(tx != VECSXP) return ScalarInteger(tx);