diff --git a/.Rbuildignore b/.Rbuildignore index e4862de..1e3d35c 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,5 +1,6 @@ ^.*\.Rproj$ ^\.Rproj\.user$ +^\.github$ ^\.travis\.yml$ ^appveyor\.yml$ ^codecov\.yml$ diff --git a/.github/.gitignore b/.github/.gitignore new file mode 100644 index 0000000..2d19fc7 --- /dev/null +++ b/.github/.gitignore @@ -0,0 +1 @@ +*.html diff --git a/.github/workflows/R-CMD-check.yml b/.github/workflows/R-CMD-check.yml new file mode 100644 index 0000000..5f2d726 --- /dev/null +++ b/.github/workflows/R-CMD-check.yml @@ -0,0 +1,66 @@ +on: + push: + branches: + - master + - devel + pull_request: + branches: + - master + - devel + +name: R-CMD-check + +jobs: + R-CMD-check: + runs-on: ${{ matrix.config.os }} + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + + strategy: + fail-fast: false + matrix: + config: + - {os: ubuntu-latest, r: 'release'} + - {os: macOS-latest, r: 'release'} + - {os: windows-latest, r: 'release'} + + env: + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + + steps: + - name: Checkout repo + uses: actions/checkout@v2 + + - name: Setup R + uses: r-lib/actions/setup-r@master + with: + r-version: ${{ matrix.config.r }} + + - name: Install pandoc + uses: r-lib/actions/setup-pandoc@v1 + + - name: Install tinyTeX + uses: r-lib/actions/setup-tinytex@v1 + + - name: Install system dependencies + if: runner.os == 'Linux' + run: | + sudo apt install -y curl libcurl4-doc libcurl4-openssl-dev + + - name: Install package dependencies + run: | + install.packages(c("remotes", "rcmdcheck", "covr", "sessioninfo")) + remotes::install_deps(dependencies = TRUE) + shell: Rscript {0} + + - name: Check package + run: | + options(crayon.enabled = TRUE) + rcmdcheck::rcmdcheck(args = "--no-manual", error_on = "error") + shell: Rscript {0} + + - name: Upload code coverage + if: runner.os == 'Linux' + run: | + covr::codecov() + shell: Rscript {0} diff --git a/.travis.yml b/.travis.yml deleted file 
mode 100644 index 479fdec..0000000 --- a/.travis.yml +++ /dev/null @@ -1,36 +0,0 @@ -branches: - only: - - master - -env: - global: - - RGL_USE_NULL=TRUE - -language: r -sudo: required -cache: packages -cran: https://cran.rstudio.com -warnings_are_errors: true - -before_install: - Rscript -e 'update.packages(ask = FALSE)' - -r_packages: - - covr - - devtools - -r: - - release - - devel - -r_github_packages: - - r-lib/covr - - r-lib/sessioninfo - -after_success: - - Rscript -e 'covr::codecov()' - -notifications: - email: - on_success: change - on_failure: change diff --git a/DESCRIPTION b/DESCRIPTION index c36ba66..06a3e18 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: origami Title: Generalized Framework for Cross-Validation -Version: 1.0.4 +Version: 1.0.5 Authors@R: c( person("Jeremy", "Coyle", email = "jeremyrcoyle@gmail.com", role = c("aut", "cre", "cph"), @@ -39,8 +39,9 @@ Suggests: rmarkdown, knitr, stringr, + glmnet, forecast, randomForest LazyData: true VignetteBuilder: knitr -RoxygenNote: 7.1.1.9000 +RoxygenNote: 7.1.2 diff --git a/NEWS.md b/NEWS.md index c72a0da..6e90442 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +# origami 1.0.5 +* Addition of `glmnet` to package `Suggests` since used in the vignette. +* Resolve issue stemming from the partial matching of argument names + (https://github.com/tlverse/origami/issues/56) by specifying throughout. + # origami 1.0.4 * Fix the incorrect use of `future.apply` by including `future.seed = TRUE`, as per https://github.com/tlverse/origami/issues/48. @@ -6,10 +11,10 @@ * Sped up all pooled time-series fold functions with `lapply`. * All pooled time-series fold functions (`folds_rolling_origin_pooled`, `folds_rolling_window_pooled`, `folds_vfold_rolling_origin_pooled`, - `folds_vfold_rolling_window_pooled`) now allow for variability in the - number of observations for each independent unit (i.e. subject). 
+ `folds_vfold_rolling_window_pooled`) now allow for variability in the + number of observations for each independent unit (i.e., the subject). * Added test for pooled time-series cross-validation with multi-unit time-series - to test the update above. In this test, the total number of observations + to test the update above. In this test, the total number of observations and the time-points for which there are observations varies across the units. # origami 1.0.3 diff --git a/R/fold_funs.R b/R/fold_funs.R index 67db76c..6399377 100644 --- a/R/fold_funs.R +++ b/R/fold_funs.R @@ -45,7 +45,7 @@ folds_vfold <- function(n, V = 10L) { warning("n <= V so using leave-one-out CV") return(folds_loo(n)) } - folds <- rep(seq_len(V), length = n) + folds <- rep(x = seq_len(V), length.out = n) # shuffle folds folds <- sample(folds) @@ -157,7 +157,7 @@ folds_rolling_origin_pooled <- function(n, t, id = NULL, time = NULL, # make IDs the same length as time if only one ID provided if (length(id) == 1L) { - id <- rep(id, length(time)) + id <- rep(x = id, length.out = length(time)) } # make skeleton dataset with IDs and times @@ -202,7 +202,7 @@ folds_rolling_window_pooled <- function(n, t, id = NULL, time = NULL, # make IDs the same length as time if only one ID provided if (length(id) == 1L) { - id <- rep(id, length(time)) + id <- rep(x = id, length.out = length(time)) } # make skeleton dataset with IDs and times @@ -249,7 +249,7 @@ folds_vfold_rolling_origin_pooled <- function(n, t, id = NULL, time = NULL, # make IDs the same length as time if only one ID provided if (length(id) == 1L) { - id <- rep(id, length(time)) + id <- rep(x = id, length.out = length(time)) } # make skeleton dataset with IDs and times @@ -268,7 +268,7 @@ folds_vfold_rolling_origin_pooled <- function(n, t, id = NULL, time = NULL, message(paste("Processing", length(ids), "samples with", t, "time points.")) # establish V folds for cross-validating ids - Vfold_allocation <- sample(rep(seq_len(V), length = 
length(ids))) + Vfold_allocation <- sample(rep(x = seq_len(V), length.out = length(ids))) Vfolds_skeleton <- lapply(seq_len(V), fold_from_foldvec, Vfold_allocation) # establish rolling origin forecast for time-series cross-validation @@ -316,7 +316,7 @@ folds_vfold_rolling_window_pooled <- function(n, t, id = NULL, time = NULL, # make IDs the same length as time if only one ID provided if (length(id) == 1) { - id <- rep(id, length(time)) + id <- rep(x = id, length.out = length(time)) } # make skeleton dataset with IDs and times @@ -335,7 +335,7 @@ folds_vfold_rolling_window_pooled <- function(n, t, id = NULL, time = NULL, message(paste("Processing", length(ids), "samples with", t, "time points.")) # establish V folds for cross-validating ids - Vfold_allocation <- sample(rep(seq_len(V), length = length(ids))) + Vfold_allocation <- sample(rep(x = seq_len(V), length.out = length(ids))) Vfolds_skeleton <- lapply(seq_len(V), fold_from_foldvec, Vfold_allocation) # establish rolling origin forecast for time-series cross-validation diff --git a/README.Rmd b/README.Rmd index 7de218c..353c802 100644 --- a/README.Rmd +++ b/README.Rmd @@ -14,8 +14,7 @@ knitr::opts_chunk$set( # R/`origami` -[![Travis-CI Build Status](https://travis-ci.org/tlverse/origami.svg?branch=master)](https://travis-ci.org/tlverse/origami) -[![Build status](https://ci.appveyor.com/api/projects/status/bfe2jd9a065jhql7?svg=true)](https://ci.appveyor.com/project/tlverse/origami) +[![R-CMD-check](https://github.com/tlverse/origami/workflows/R-CMD-check/badge.svg)](https://github.com/tlverse/origami/actions) [![Coverage Status](https://codecov.io/gh/tlverse/origami/branch/master/graph/badge.svg)](https://codecov.io/gh/tlverse/origami) [![CRAN](http://www.r-pkg.org/badges/version/origami)](http://www.r-pkg.org/pkg/origami) [![CRAN downloads](https://cranlogs.r-pkg.org/badges/origami)](https://CRAN.R-project.org/package=origami) @@ -34,11 +33,11 @@ Phillips](https://github.com/rachaelvphillips) --- -## 
Description +## What's `origami`? -`origami` is an R package that provides a general framework for the application -of cross-validation schemes to particular functions. By allowing arbitrary lists -of results, `origami` accommodates a range of cross-validation applications. +The `origami` R package provides a general framework for the application of +cross-validation schemes to particular functions. By allowing arbitrary lists of +results, `origami` accommodates a range of cross-validation applications. --- @@ -124,10 +123,8 @@ issue](https://github.com/tlverse/origami/issues). ## Contributions -It is our hope that `origami` will grow to be adopted as a backend for most any -procedure requiring cross-validation, including its integration into larger -machine learning frameworks. To that end, contributions are very welcome, though -we ask that interested contributors consult our [contribution +Contributions are very welcome. Interested contributors should consult our +[contribution guidelines](https://github.com/tlverse/origami/blob/master/CONTRIBUTING.md) prior to submitting a pull request. @@ -154,7 +151,7 @@ After using the `origami` R package, please cite it: ## License -© 2017-2020 [Jeremy R. Coyle](https://github.com/jeremyrcoyle) +© 2017-2021 [Jeremy R. Coyle](https://github.com/jeremyrcoyle) The contents of this repository are distributed under the GPL-3 license. See file `LICENSE` for details. 
diff --git a/README.md b/README.md index ed1cfbf..7024a1f 100644 --- a/README.md +++ b/README.md @@ -3,10 +3,7 @@ # R/`origami` -[![Travis-CI Build -Status](https://travis-ci.org/tlverse/origami.svg?branch=master)](https://travis-ci.org/tlverse/origami) -[![Build -status](https://ci.appveyor.com/api/projects/status/bfe2jd9a065jhql7?svg=true)](https://ci.appveyor.com/project/tlverse/origami) +[![R-CMD-check](https://github.com/tlverse/origami/workflows/R-CMD-check/badge.svg)](https://github.com/tlverse/origami/actions) [![Coverage Status](https://codecov.io/gh/tlverse/origami/branch/master/graph/badge.svg)](https://codecov.io/gh/tlverse/origami) [![CRAN](http://www.r-pkg.org/badges/version/origami)](http://www.r-pkg.org/pkg/origami) @@ -32,11 +29,11 @@ Phillips](https://github.com/rachaelvphillips) ----- -## Description +## What’s `origami`? -`origami` is an R package that provides a general framework for the -application of cross-validation schemes to particular functions. By -allowing arbitrary lists of results, `origami` accommodates a range of +The `origami` R package provides a general framework for the application +of cross-validation schemes to particular functions. By allowing +arbitrary lists of results, `origami` accommodates a range of cross-validation applications. 
----- @@ -78,7 +75,7 @@ estimate of the mean: ``` r library(stringr) library(origami) -#> origami v1.0.3: Generalized Framework for Cross-Validation +#> origami v1.0.4: Generalized Framework for Cross-Validation set.seed(4795) data(mtcars) @@ -90,7 +87,6 @@ head(mtcars) #> Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 #> Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 #> Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 - # build a cv_fun that wraps around lm cv_lm <- function(fold, data, reg_form) { # get name and index of outcome variable from regression formula @@ -133,11 +129,8 @@ If you encounter any bugs or have any specific feature requests, please ## Contributions -It is our hope that `origami` will grow to be adopted as a backend for -most any procedure requiring cross-validation, including its integration -into larger machine learning frameworks. To that end, contributions are -very welcome, though we ask that interested contributors consult our -[contribution +Contributions are very welcome. Interested contributors should consult +our [contribution guidelines](https://github.com/tlverse/origami/blob/master/CONTRIBUTING.md) prior to submitting a pull request. @@ -166,7 +159,7 @@ After using the `origami` R package, please cite it: ## License -© 2017-2020 [Jeremy R. Coyle](https://github.com/jeremyrcoyle) +© 2017-2021 [Jeremy R. Coyle](https://github.com/jeremyrcoyle) The contents of this repository are distributed under the GPL-3 license. See file `LICENSE` for details. 
diff --git a/appveyor.yml b/appveyor.yml deleted file mode 100644 index 31ba01e..0000000 --- a/appveyor.yml +++ /dev/null @@ -1,69 +0,0 @@ -# DO NOT CHANGE the "init" and "install" sections below - -# Download script file from GitHub -init: - ps: | - $ErrorActionPreference = "Stop" - Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1" - Import-Module '..\appveyor-tool.ps1' -install: - ps: Bootstrap - -cache: - - C:\RLibrary -> appveyor.yml - -# Adapt as necessary starting from here -branches: - only: - - master - -environment: - global: - WARNINGS_ARE_ERRORS: 0 - R_ARCH: x64 - USE_RTOOLS: true - -build_script: - - travis-tool.sh install_deps - - travis-tool.sh install_github r-lib/covr - - travis-tool.sh install_github r-lib/sessioninfo - -test_script: - - travis-tool.sh run_tests - -on_failure: - - 7z a failure.zip *.Rcheck\* - - appveyor PushArtifact failure.zip - -on_success: - - Rscript -e "covr::codecov()" - -artifacts: - - path: '*.Rcheck\**\*.log' - name: Logs - - - path: '*.Rcheck\**\*.out' - name: Logs - - - path: '*.Rcheck\**\*.fail' - name: Logs - - - path: '*.Rcheck\**\*.Rout' - name: Logs - - - path: '\*_*.tar.gz' - name: Bits - - - path: '\*_*.zip' - name: Bits - -notifications: - - provider: Email - to: - - jeremyrcoyle@gmail.com - - nh@nimahejazi.org - subject: 'r-appveyor build {{status}}' - message: "https://ci.appveyor.com/project/tlverse/origami" - on_build_success: false - on_build_failure: true - on_build_status_changed: true diff --git a/cran-comments.md b/cran-comments.md index 323a2b0..fa9eacf 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,8 +1,7 @@ ## Test environments -* latest macOS (local), R 3.6.0 -* ubuntu 14.04 (on travis-ci), R 3.6.1 -* ubuntu 18.04 (local), R 3.6.2 -* windows (on appveyor-ci), R 3.6.1 +* ubuntu 20.04 (local + GitHub Actions), R 4.1.1 +* macOS 10.15 (local + GitHub Actions), R 4.1.1 +* windows 2019 (on GitHub Actions), R 4.1.1 ## R 
CMD check results There were no ERRORs, WARNINGs, or NOTEs diff --git a/docs/404.html b/docs/404.html index 340601d..ed5e6f3 100644 --- a/docs/404.html +++ b/docs/404.html @@ -81,7 +81,7 @@ origami - 1.0.4 + 1.0.5 @@ -143,7 +143,7 @@

Contents

-

Site built with pkgdown 1.5.1.

+

Site built with pkgdown 1.6.1.

diff --git a/docs/CONTRIBUTING.html b/docs/CONTRIBUTING.html index 7a8190e..ba4f732 100644 --- a/docs/CONTRIBUTING.html +++ b/docs/CONTRIBUTING.html @@ -81,7 +81,7 @@ origami - 1.0.4 + 1.0.5 @@ -206,7 +206,7 @@

Contents

-

Site built with pkgdown 1.5.1.

+

Site built with pkgdown 1.6.1.

diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index 9be626c..22777f8 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -81,7 +81,7 @@ origami - 1.0.4 + 1.0.5 @@ -817,7 +817,7 @@

Contents

-

Site built with pkgdown 1.5.1.

+

Site built with pkgdown 1.6.1.

diff --git a/docs/articles/generalizedCV.html b/docs/articles/generalizedCV.html index 627a8bb..11b2944 100644 --- a/docs/articles/generalizedCV.html +++ b/docs/articles/generalizedCV.html @@ -37,7 +37,7 @@ origami - 1.0.4 + 1.0.5 @@ -79,7 +79,7 @@

Generalized Cross-Validation with Origami

Jeremy Coyle & Nima Hejazi

-

2020-07-09

+

2021-09-23

Source: vignettes/generalizedCV.Rmd @@ -103,8 +103,9 @@

Cross-validation with linear regression

We’ll start by examining a fairly simple data set:

-
data(mtcars)
-head(mtcars)
+
+data(mtcars)
+head(mtcars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
 ## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
 ## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
@@ -113,8 +114,9 @@ 

## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 ## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1

One might be interested in examining how the efficiency of a car, as measured by miles-per-gallon (mpg), is explained by various technical aspects of the car, with data across a variety of different models of cars. Linear regression is perhaps the simplest statistical procedure that could be used to make such deductions. Let’s try it out:

-
lm_mod <- lm(mpg ~ ., data = mtcars)
-summary(lm_mod)
+
+lm_mod <- lm(mpg ~ ., data = mtcars)
+summary(lm_mod)
## 
 ## Call:
 ## lm(formula = mpg ~ ., data = mtcars)
@@ -143,45 +145,51 @@ 

## Multiple R-squared: 0.869, Adjusted R-squared: 0.8066 ## F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07

We can assess how well the model fits the data by comparing the predictions of the linear model to the true outcomes observed in the data set. This is the well known (and standard) mean squared error. We can extract that from the lm model object like so:

-
err <- mean(resid(lm_mod)^2)
+
+err <- mean(resid(lm_mod)^2)

The mean squared error is 4.6092009. There is an important problem that arises when we assess the model in this way – that is, we have trained our linear regression model on the full data set and assessed the error on the full data set, using up all of our data. We, of course, are generally not interested in how well the model explains variation in the observed data; rather, we are interested in how the explanation provided by the model generalizes to a target population from which the sample is presumably derived. Having used all of our available data, we cannot honestly evaluate how well the model fits (and thus explains) variation at the population level.

To resolve this issue, cross-validation allows for a particular procedure (e.g., linear regression) to be implemented over subsets of the data, evaluating how well the procedure fits on a testing (“validation”) set, thereby providing an honest evaluation of the error.

We can easily add cross-validation to our linear regression procedure using origami. First, let us define a new function to perform linear regression on a specific partition of the data (called a “fold”):

-
cv_lm <- function(fold, data, reg_form) {
+
+cv_lm <- function(fold, data, reg_form) {
   # get name and index of outcome variable from regression formula
-  out_var <- as.character(unlist(str_split(reg_form, " "))[1])
-  out_var_ind <- as.numeric(which(colnames(data) == out_var))
+  out_var <- as.character(unlist(str_split(reg_form, " "))[1])
+  out_var_ind <- as.numeric(which(colnames(data) == out_var))
 
   # split up data into training and validation sets
-  train_data <- training(data)
-  valid_data <- validation(data)
+  train_data <- training(data)
+  valid_data <- validation(data)
 
   # fit linear model on training set and predict on validation set
-  mod <- lm(as.formula(reg_form), data = train_data)
-  preds <- predict(mod, newdata = valid_data)
+  mod <- lm(as.formula(reg_form), data = train_data)
+  preds <- predict(mod, newdata = valid_data)
 
   # capture results to be returned as output
-  out <- list(coef = data.frame(t(coef(mod))),
-              SE = ((preds - valid_data[, out_var_ind])^2))
-  return(out)
-}
+ out <- list(coef = data.frame(t(coef(mod))), + SE = ((preds - valid_data[, out_var_ind])^2)) + return(out) +}

Our cv_lm function is rather simple: we merely split the available data into training and validation sets, using the eponymous functions provided in origami, fit the linear model on the training set, and evaluate the model on the testing set. This is a simple example of what origami considers to be cv_funs – functions for using cross-validation to perform a particular routine over an input data set. Having defined such a function, we can simply generate a set of partitions using origami’s make_folds function, and apply our cv_lm function over the resultant folds object. Below, we replicate the resubstitution estimate of the error – we did this “by hand” above – using the functions make_folds and cv_lm.

-
library(origami)
-
## origami v1.0.4: Generalized Framework for Cross-Validation
-
library(stringr) # used in defining the cv_lm function above
-
# resubstitution estimate
-resub <- make_folds(mtcars, fold_fun = folds_resubstitution)[[1]]
-resub_results <- cv_lm(fold = resub, data = mtcars, reg_form = "mpg ~ .")
-mean(resub_results$SE)
+
+library(origami)
+
## origami v1.0.5: Generalized Framework for Cross-Validation
+
+library(stringr) # used in defining the cv_lm function above
+
+# resubstitution estimate
+resub <- make_folds(mtcars, fold_fun = folds_resubstitution)[[1]]
+resub_results <- cv_lm(fold = resub, data = mtcars, reg_form = "mpg ~ .")
+mean(resub_results$SE)
## [1] 4.609201

This (very nearly) matches the estimate of the error that we obtained above.

We can more honestly evaluate the error by V-fold cross-validation, which partitions the data into v subsets, fitting the model on \(v - 1\) of the subsets and evaluating on the subset that was held out for testing. This is repeated such that each subset is used for testing. We can easily apply our cv_lm function using origami’s cross_validate (n.b., by default this performs 10-fold cross-validation):

-
# cross-validated estimate
-folds <- make_folds(mtcars)
-cvlm_results <- cross_validate(cv_fun = cv_lm, folds = folds, data = mtcars,
-                               reg_form = "mpg ~ .")
-mean(cvlm_results$SE)
-
## [1] 10.31819
+
+# cross-validated estimate
+folds <- make_folds(mtcars)
+cvlm_results <- cross_validate(cv_fun = cv_lm, folds = folds, data = mtcars,
+                               reg_form = "mpg ~ .")
+mean(cvlm_results$SE)
+
## [1] 15.67379

Having performed 10-fold cross-validation, we quickly notice that our previous estimate of the model error (by resubstitution) was quite optimistic. The honest estimate of the error is several times larger.


@@ -213,34 +221,37 @@

Cross-validation with random forests

To examine origami further, let us return to our example analysis using the mtcars data set. Here, we will write a new cv_fun type object. As an example, we will use L. Breiman’s randomForest:

-
cv_rf <- function(fold, data, reg_form) {
+
+cv_rf <- function(fold, data, reg_form) {
   # get name and index of outcome variable from regression formula
-  out_var <- as.character(unlist(str_split(reg_form, " "))[1])
-  out_var_ind <- as.numeric(which(colnames(data) == out_var))
+  out_var <- as.character(unlist(str_split(reg_form, " "))[1])
+  out_var_ind <- as.numeric(which(colnames(data) == out_var))
 
   # define training and validation sets based on input object of class "folds"
-  train_data <- training(data)
-  valid_data <- validation(data)
+  train_data <- training(data)
+  valid_data <- validation(data)
 
   # fit Random Forest regression on training set and predict on holdout set
-  mod <- randomForest(formula = as.formula(reg_form), data = train_data)
-  preds <- predict(mod, newdata = valid_data)
+  mod <- randomForest(formula = as.formula(reg_form), data = train_data)
+  preds <- predict(mod, newdata = valid_data)
 
   # define output object to be returned as list (for flexibility)
-  out <- list(coef = data.frame(mod$coefs),
-              SE = ((preds - valid_data[, out_var_ind])^2))
-  return(out)
-}
+ out <- list(coef = data.frame(mod$coefs), + SE = ((preds - valid_data[, out_var_ind])^2)) + return(out) +}

Above, in writing our cv_rf function to cross-validate randomForest, we used our previous function cv_lm as an example. For now, individual cv_funs must be written by hand; however, in future releases, a wrapper may be available to support auto-generating cv_funs to be used with origami.

Below, we use cross_validate to apply our new cv_rf function over the folds object generated by make_folds.

-
library(randomForest)
+
+library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
-
folds <- make_folds(mtcars)
-cvrf_results <- cross_validate(cv_fun = cv_rf, folds = folds, data = mtcars,
-                               reg_form = "mpg ~ .")
-mean(cvrf_results$SE)
-
## [1] 6.149974
+
+folds <- make_folds(mtcars)
+cvrf_results <- cross_validate(cv_fun = cv_rf, folds = folds, data = mtcars,
+                               reg_form = "mpg ~ .")
+mean(cvrf_results$SE)
+
## [1] 5.308908

Using 10-fold cross-validation (the default), we obtain an honest estimate of the prediction error of random forests. From this, we gather that the use of origami’s cross_validate procedure can be generalized to arbitrary estimation techniques, given availability of an appropriate cv_fun function.


@@ -248,8 +259,9 @@

Cross-validation with dependence: time series

Cross-validation can also be used for forecast model selection in a time series setting. Here, the partitioning scheme mirrors the application of the forecasting model: We’ll train the model on past observations (either all available or a recent subset), and then use the model to forecast (predict) the next few observations. Consider the AirPassengers dataset, a monthly time series of passenger air traffic in thousands of people.

-
data(AirPassengers)
-print(AirPassengers)
+
+data(AirPassengers)
+print(AirPassengers)
##      Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
 ## 1949 112 118 132 129 121 135 148 148 136 119 104 118
 ## 1950 115 126 141 135 125 149 170 170 158 133 114 140
@@ -264,44 +276,46 @@ 

## 1959 360 342 406 396 420 472 548 559 463 407 362 405 ## 1960 417 391 419 461 472 535 622 606 508 461 390 432

Suppose we want to pick between two forecasting models, stl, and arima (the details of these models are not important for this example). We can do that by evaluating their forecasting performance.

-
library(forecast)
+
+library(forecast)
## Registered S3 method overwritten by 'quantmod':
 ##   method            from
 ##   as.zoo.data.frame zoo
-
folds = make_folds(AirPassengers, fold_fun=folds_rolling_origin,
-                   first_window = 36, validation_size = 24)
-fold = folds[[1]]
+
+folds = make_folds(AirPassengers, fold_fun=folds_rolling_origin,
+                   first_window = 36, validation_size = 24)
+fold = folds[[1]]
 
 # function to calculate cross-validated squared error
-cv_forecasts <- function(fold, data) {
-  train_data <- training(data)
-  valid_data <- validation(data)
-  valid_size <- length(valid_data)
+cv_forecasts <- function(fold, data) {
+  train_data <- training(data)
+  valid_data <- validation(data)
+  valid_size <- length(valid_data)
 
-  train_ts <- ts(log10(train_data), frequency = 12)
+  train_ts <- ts(log10(train_data), frequency = 12)
 
   # borrowed from AirPassengers help
-  arima_fit <- arima(train_ts, c(0, 1, 1),
-                     seasonal = list(order = c(0, 1, 1),
-                                     period = 12))
-  raw_arima_pred <- predict(arima_fit, n.ahead = valid_size)
-  arima_pred <- 10^raw_arima_pred$pred
-  arima_MSE <- mean((arima_pred - valid_data)^2)
+  arima_fit <- arima(train_ts, c(0, 1, 1),
+                     seasonal = list(order = c(0, 1, 1),
+                                     period = 12))
+  raw_arima_pred <- predict(arima_fit, n.ahead = valid_size)
+  arima_pred <- 10^raw_arima_pred$pred
+  arima_MSE <- mean((arima_pred - valid_data)^2)
 
   # stl model
-  stl_fit <- stlm(train_ts, s.window = 12)
-  raw_stl_pred = forecast(stl_fit, h = valid_size)
-  stl_pred <- 10^raw_stl_pred$mean
-  stl_MSE <- mean((stl_pred - valid_data)^2)
+  stl_fit <- stlm(train_ts, s.window = 12)
+  raw_stl_pred = forecast(stl_fit, h = valid_size)
+  stl_pred <- 10^raw_stl_pred$mean
+  stl_MSE <- mean((stl_pred - valid_data)^2)
 
-  out <- list(mse = data.frame(fold = fold_index(),
-                               arima = arima_MSE, stl = stl_MSE))
-  return(out)
-}
+  out <- list(mse = data.frame(fold = fold_index(),
+                               arima = arima_MSE, stl = stl_MSE))
+  return(out)
+}
 
-mses = cross_validate(cv_fun = cv_forecasts, folds = folds,
-                      data = AirPassengers)$mse
-colMeans(mses[, c("arima", "stl")])
+mses = cross_validate(cv_fun = cv_forecasts, folds = folds, + data = AirPassengers)$mse +colMeans(mses[, c("arima", "stl")])
##    arima      stl 
 ## 667.2477 925.7137

@@ -309,13 +323,13 @@

Session Information

-
## R version 3.6.3 (2020-02-29)
+
## R version 4.1.1 (2021-08-10)
 ## Platform: x86_64-pc-linux-gnu (64-bit)
-## Running under: Ubuntu 18.04.4 LTS
+## Running under: Ubuntu 20.04.3 LTS
 ## 
 ## Matrix products: default
-## BLAS:   /usr/lib/x86_64-linux-gnu/openblas/libblas.so.3
-## LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
+## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
+## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
 ## 
 ## locale:
 ##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
@@ -329,26 +343,29 @@ 

## [1] stats graphics grDevices utils datasets methods base ## ## other attached packages: -## [1] forecast_8.12 randomForest_4.6-14 stringr_1.4.0 -## [4] origami_1.0.4 +## [1] forecast_8.15 randomForest_4.6-14 stringr_1.4.0 +## [4] origami_1.0.5 ## ## loaded via a namespace (and not attached): -## [1] zoo_1.8-8 tidyselect_1.1.0 xfun_0.15 urca_1.3-0 -## [5] purrr_0.3.4 listenv_0.8.0 lattice_0.20-41 colorspace_1.4-1 -## [9] vctrs_0.3.1 generics_0.0.2 htmltools_0.5.0 yaml_2.2.1 -## [13] rlang_0.4.6 pkgdown_1.5.1 pillar_1.4.4 glue_1.4.1 -## [17] TTR_0.23-6 lifecycle_0.2.0 quantmod_0.4.17 timeDate_3043.102 -## [21] munsell_0.5.0 gtable_0.3.0 future_1.17.0 codetools_0.2-16 -## [25] memoise_1.1.0 evaluate_0.14 knitr_1.29 tseries_0.10-47 -## [29] lmtest_0.9-37 curl_4.3 parallel_3.6.3 xts_0.12-0 -## [33] Rcpp_1.0.5 scales_1.1.1 backports_1.1.8 desc_1.2.0 -## [37] abind_1.4-5 fs_1.4.2 fracdiff_1.5-1 ggplot2_3.3.2 -## [41] digest_0.6.25 stringi_1.4.6 dplyr_1.0.0 grid_3.6.3 -## [45] rprojroot_1.3-2 quadprog_1.5-8 tools_3.6.3 magrittr_1.5 -## [49] tibble_3.0.1 crayon_1.3.4 future.apply_1.6.0 pkgconfig_2.0.3 -## [53] MASS_7.3-51.6 ellipsis_0.3.1 data.table_1.12.8 assertthat_0.2.1 -## [57] rmarkdown_2.3 R6_2.4.1 globals_0.12.5 nlme_3.1-148 -## [61] nnet_7.3-14 compiler_3.6.3

+## [1] Rcpp_1.0.7 lattice_0.20-45 listenv_0.8.0 zoo_1.8-9 +## [5] assertthat_0.2.1 rprojroot_2.0.2 digest_0.6.28 lmtest_0.9-38 +## [9] utf8_1.2.2 parallelly_1.28.1 R6_2.5.1 evaluate_0.14 +## [13] ggplot2_3.3.5 pillar_1.6.2 rlang_0.4.11 curl_4.3.2 +## [17] data.table_1.14.0 TTR_0.24.2 fracdiff_1.5-1 jquerylib_0.1.4 +## [21] rmarkdown_2.11 pkgdown_1.6.1 textshaping_0.3.5 desc_1.3.0 +## [25] munsell_0.5.0 compiler_4.1.1 xfun_0.26 pkgconfig_2.0.3 +## [29] systemfonts_1.0.2 urca_1.3-0 globals_0.14.0 htmltools_0.5.2 +## [33] nnet_7.3-16 tidyselect_1.1.1 tibble_3.1.4 quadprog_1.5-8 +## [37] codetools_0.2-18 fansi_0.5.0 future_1.22.1 crayon_1.4.1 +## [41] dplyr_1.0.7 grid_4.1.1 nlme_3.1-153 jsonlite_1.7.2 +## [45] gtable_0.3.0 lifecycle_1.0.0 DBI_1.1.1 magrittr_2.0.1 +## [49] scales_1.1.1 quantmod_0.4.18 future.apply_1.8.1 stringi_1.7.4 +## [53] cachem_1.0.6 tseries_0.10-48 fs_1.5.0 timeDate_3043.102 +## [57] bslib_0.3.0 xts_0.12.1 ellipsis_0.3.2 ragg_1.1.3 +## [61] vctrs_0.3.8 generics_0.1.0 tools_4.1.1 glue_1.4.2 +## [65] purrr_0.3.4 abind_1.4-5 parallel_4.1.1 fastmap_1.1.0 +## [69] yaml_2.2.1 colorspace_2.0-2 memoise_2.0.0 knitr_1.34 +## [73] sass_0.4.0

@@ -380,7 +397,7 @@

-

Site built with pkgdown 1.5.1.

+

Site built with pkgdown 1.6.1.

diff --git a/docs/articles/index.html b/docs/articles/index.html index 6e09ea8..43d9c94 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -81,7 +81,7 @@ origami - 1.0.4 + 1.0.5 @@ -142,7 +142,7 @@

All vignettes

-

Site built with pkgdown 1.5.1.

+

Site built with pkgdown 1.6.1.

diff --git a/docs/authors.html b/docs/authors.html index c63e90c..6ce1dab 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -81,7 +81,7 @@ origami - 1.0.4 + 1.0.5 @@ -154,7 +154,7 @@

Authors

-

Site built with pkgdown 1.5.1.

+

Site built with pkgdown 1.6.1.

diff --git a/docs/index.html b/docs/index.html index b12898d..41e29d6 100644 --- a/docs/index.html +++ b/docs/index.html @@ -39,7 +39,7 @@ origami - 1.0.4 + 1.0.5 @@ -77,9 +77,9 @@
-
+
@@ -87,19 +87,21 @@

Authors: Jeremy Coyle, Nima Hejazi, Ivana Malenica, and Rachael Phillips


-
+

-Description

-

origami is an R package that provides a general framework for the application of cross-validation schemes to particular functions. By allowing arbitrary lists of results, origami accommodates a range of cross-validation applications.

+What’s origami? +

The origami R package provides a general framework for the application of cross-validation schemes to particular functions. By allowing arbitrary lists of results, origami accommodates a range of cross-validation applications.


Installation

For standard use, we recommend installing the package from CRAN via

-
install.packages("origami")
+
+install.packages("origami")

You can install a stable release of origami from GitHub via devtools with:

-
devtools::install_github("tlverse/origami")
+
+devtools::install_github("tlverse/origami")

@@ -112,13 +114,14 @@

Example

This minimal example shows how to use origami to apply cross-validation to the computation of a simple descriptive statistic using a sample data set. In particular, we obtain a cross-validated estimate of the mean:

-
library(stringr)
-library(origami)
+
+library(stringr)
+library(origami)
 #> origami v1.0.4: Generalized Framework for Cross-Validation
-set.seed(4795)
+set.seed(4795)
 
-data(mtcars)
-head(mtcars)
+data(mtcars)
+head(mtcars)
 #>                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
 #> Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
 #> Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
@@ -126,32 +129,31 @@ 

#> Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 #> Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 #> Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 - # build a cv_fun that wraps around lm -cv_lm <- function(fold, data, reg_form) { +cv_lm <- function(fold, data, reg_form) { # get name and index of outcome variable from regression formula - out_var <- as.character(unlist(str_split(reg_form, " "))[1]) - out_var_ind <- as.numeric(which(colnames(data) == out_var)) + out_var <- as.character(unlist(str_split(reg_form, " "))[1]) + out_var_ind <- as.numeric(which(colnames(data) == out_var)) # split up data into training and validation sets - train_data <- training(data) - valid_data <- validation(data) + train_data <- training(data) + valid_data <- validation(data) # fit linear model on training set and predict on validation set - mod <- lm(as.formula(reg_form), data = train_data) - preds <- predict(mod, newdata = valid_data) + mod <- lm(as.formula(reg_form), data = train_data) + preds <- predict(mod, newdata = valid_data) # capture results to be returned as output - out <- list(coef = data.frame(t(coef(mod))), - SE = ((preds - valid_data[, out_var_ind])^2)) - return(out) -} + out <- list(coef = data.frame(t(coef(mod))), + SE = ((preds - valid_data[, out_var_ind])^2)) + return(out) +} -folds <- make_folds(mtcars) -results <- cross_validate(cv_fun = cv_lm, folds = folds, data = mtcars, - reg_form = "mpg ~ .") -mean(results$SE) -#> [1] 15.18558

+folds <- make_folds(mtcars) +results <- cross_validate(cv_fun = cv_lm, folds = folds, data = mtcars, + reg_form = "mpg ~ .") +mean(results$SE) +#> [1] 15.18558

For details on how to write wrappers (cv_funs) for use with origami::cross_validate, please consult the documentation and vignettes that accompany the package.


@@ -164,7 +166,7 @@

Contributions

-

It is our hope that origami will grow to be adopted as a backend for most any procedure requiring cross-validation, including its integration into larger machine learning frameworks. To that end, contributions are very welcome, though we ask that interested contributors consult our contribution guidelines prior to submitting a pull request.

+

Contributions are very welcome. Interested contributors should consult our contribution guidelines prior to submitting a pull request.


@@ -188,7 +190,7 @@

License

-

© 2017-2020 Jeremy R. Coyle

+

© 2017-2021 Jeremy R. Coyle

The contents of this repository are distributed under the GPL-3 license. See file LICENSE for details.

@@ -231,11 +233,11 @@

Developers

Dev status

    -
  • Travis-CI Build Status
  • -
  • Build status
  • +
  • R-CMD-check
  • Coverage Status
  • CRAN
  • CRAN downloads
  • +
  • CRAN total downloads
  • Project Status: Active - The project has reached a stable, usable state and is being actively developed.
  • License: GPL v3
  • DOI
  • @@ -251,7 +253,7 @@

    Dev status

-

Site built with pkgdown 1.5.1.

+

Site built with pkgdown 1.6.1.

diff --git a/docs/news/index.html b/docs/news/index.html index 6a9b157..ba154e4 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -81,7 +81,7 @@ origami - 1.0.4 + 1.0.5
@@ -124,45 +124,54 @@

Changelog

Source: NEWS.md
-
+
+

+origami 1.0.5 Unreleased +

+ +
+

-origami 1.0.4 Unreleased +origami 1.0.4 Unreleased

  • Fix the incorrect use of future.apply by including future.seed = TRUE, as per https://github.com/tlverse/origami/issues/48.
  • Fixes to folds_rolling_origin_pooled and folds_rolling_window_pooled, as per https://github.com/tlverse/origami/pull/50.
  • Sped up all pooled time-series fold functions with lapply.
  • -
  • All pooled time-series fold functions (folds_rolling_origin_pooled, folds_rolling_window_pooled, folds_vfold_rolling_origin_pooled, folds_vfold_rolling_window_pooled) now allow for variability in the number of observations for each independent unit (i.e. subject).
  • +
  • All pooled time-series fold functions (folds_rolling_origin_pooled, folds_rolling_window_pooled, folds_vfold_rolling_origin_pooled, folds_vfold_rolling_window_pooled) now allow for variability in the number of observations for each independent unit (i.e., the subject).
  • Added test for pooled time-series cross-validation with multi-unit time-series to test the update above. In this test, the total number of observations and the time-points for which there are observations vary across the units.
-
+

-origami 1.0.3 2020-01-16 +origami 1.0.3 2020-01-16

  • A maintenance release addressing reported issues, including changes to unit tests that relied on Suggested packages dependencies.
-
+

-origami 1.0.2 Unreleased +origami 1.0.2 Unreleased

  • Adds new functionality for cross-validation with time-series, especially functionality for pooling time-series data. This includes the new functions folds_rolling_origin_pooled, folds_rolling_window_pooled, folds_vfold_rolling_origin_pooled, folds_vfold_rolling_window_pooled.
-
+

-origami 1.0.1 2019-05-01 +origami 1.0.1 2019-05-01

  • Adds a simple check of the current R version in the unit tests in the file tests/testthat/test-overall.R to use appropriate hard-coded values that depend on the R version. Note that these differ between R < 3.6.0 and R >= 3.6.0 due to an important change in the default PRNG.
-
+

-origami 1.0.0 2018-03-06 +origami 1.0.0 2018-03-06

  • Adds a new function folds2foldvec for easy conversion between the folds structure used by origami and other packages (e.g., glmnet).
  • @@ -170,9 +179,9 @@

  • Calls future_lapply from the new more modular package future.apply rather than from future, matching the author’s recommendation.
-
+

-origami 0.8.0 2017-06-23 +origami 0.8.0 2017-06-23

  • First CRAN release.
  • @@ -196,7 +205,7 @@

    Contents

-

Site built with pkgdown 1.5.1.

+

Site built with pkgdown 1.6.1.

diff --git a/docs/pkgdown.css b/docs/pkgdown.css index c01e592..1273238 100644 --- a/docs/pkgdown.css +++ b/docs/pkgdown.css @@ -244,14 +244,14 @@ nav[data-toggle='toc'] .nav .nav > .active:focus > a { .ref-index th {font-weight: normal;} -.ref-index td {vertical-align: top;} +.ref-index td {vertical-align: top; min-width: 100px} .ref-index .icon {width: 40px;} .ref-index .alias {width: 40%;} .ref-index-icons .alias {width: calc(40% - 40px);} .ref-index .title {width: 60%;} .ref-arguments th {text-align: right; padding-right: 10px;} -.ref-arguments th, .ref-arguments td {vertical-align: top;} +.ref-arguments th, .ref-arguments td {vertical-align: top; min-width: 100px} .ref-arguments .name {width: 20%;} .ref-arguments .desc {width: 80%;} diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index c1b0ef6..0287669 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -1,9 +1,9 @@ -pandoc: 2.2.1 -pkgdown: 1.5.1 +pandoc: '2.5' +pkgdown: 1.6.1 pkgdown_sha: ~ articles: generalizedCV: generalizedCV.html -last_built: 2020-07-09T19:43Z +last_built: 2021-09-23T20:30Z urls: reference: http://tlverse.org/origami/reference article: http://tlverse.org/origami/articles diff --git a/docs/reference/Rplot001.png b/docs/reference/Rplot001.png new file mode 100644 index 0000000..17a3580 Binary files /dev/null and b/docs/reference/Rplot001.png differ diff --git a/docs/reference/check_id_and_time.html b/docs/reference/check_id_and_time.html index 9e07ba3..4590be1 100644 --- a/docs/reference/check_id_and_time.html +++ b/docs/reference/check_id_and_time.html @@ -82,7 +82,7 @@ origami - 1.0.4 + 1.0.5
@@ -130,7 +130,7 @@

Check ID and Time Compatibility

Check ID and Time Compatibility

-
check_id_and_time(id, time)
+
check_id_and_time(id, time)

Arguments

@@ -163,7 +163,7 @@

Contents

-

Site built with pkgdown 1.5.1.

+

Site built with pkgdown 1.6.1.

diff --git a/docs/reference/combine_results.html b/docs/reference/combine_results.html index d637bda..e9580f7 100644 --- a/docs/reference/combine_results.html +++ b/docs/reference/combine_results.html @@ -83,7 +83,7 @@ origami - 1.0.4 + 1.0.5 @@ -132,7 +132,7 @@

Combine Results from Different Folds

similarly structured results, to a list of such lists.

-
combine_results(results, combiners = NULL, smart_combiners = TRUE)
+
combine_results(results, combiners = NULL, smart_combiners = TRUE)

Arguments

@@ -181,7 +181,7 @@

Contents

-

Site built with pkgdown 1.5.1.

+

Site built with pkgdown 1.6.1.

diff --git a/docs/reference/combiners.html b/docs/reference/combiners.html index 47767c0..a74880c 100644 --- a/docs/reference/combiners.html +++ b/docs/reference/combiners.html @@ -83,7 +83,7 @@ origami - 1.0.4 + 1.0.5 @@ -132,13 +132,13 @@

Combiners

results. These are standard idioms for combining lists of certain data types.

-
combiner_rbind(x)
+    
combiner_rbind(x)
 
-combiner_c(x)
+combiner_c(x)
 
-combiner_factor(x)
+combiner_factor(x)
 
-combiner_array(x)
+combiner_array(x)

Arguments

@@ -168,7 +168,7 @@

Contents

-

Site built with pkgdown 1.5.1.

+

Site built with pkgdown 1.6.1.

diff --git a/docs/reference/cross_validate.html b/docs/reference/cross_validate.html index 8840b72..f0ee5e2 100644 --- a/docs/reference/cross_validate.html +++ b/docs/reference/cross_validate.html @@ -83,7 +83,7 @@ origami - 1.0.4 + 1.0.5 @@ -132,15 +132,15 @@

Main Cross-Validation Function

the results across folds using combine_results.

-
cross_validate(
-  cv_fun,
-  folds,
-  ...,
-  use_future = TRUE,
-  .combine = TRUE,
-  .combine_control = list(),
-  .old_results = NULL
-)
+
cross_validate(
+  cv_fun,
+  folds,
+  ...,
+  use_future = TRUE,
+  .combine = TRUE,
+  .combine_control = list(),
+  .old_results = NULL
+)

Arguments

@@ -150,7 +150,7 @@

Arg

@@ -165,7 +165,7 @@

Arg

@@ -194,95 +194,101 @@

Examp
############################################################################### # This example explains how to use the cross_validate function naively. ############################################################################### -data(mtcars) +data(mtcars) # resubstitution MSE -r <- lm(mpg ~ ., data = mtcars) -mean(resid(r)^2)
#> [1] 4.609201
+r <- lm(mpg ~ ., data = mtcars) +mean(resid(r)^2) +
#> [1] 4.609201
# function to calculate cross-validated squared error -cv_lm <- function(fold, data, reg_form) { +cv_lm <- function(fold, data, reg_form) { # get name and index of outcome variable from regression formula - out_var <- as.character(unlist(stringr::str_split(reg_form, " "))[1]) - out_var_ind <- as.numeric(which(colnames(data) == out_var)) + out_var <- as.character(unlist(stringr::str_split(reg_form, " "))[1]) + out_var_ind <- as.numeric(which(colnames(data) == out_var)) # split up data into training and validation sets - train_data <- training(data) - valid_data <- validation(data) + train_data <- training(data) + valid_data <- validation(data) # fit linear model on training set and predict on validation set - mod <- lm(as.formula(reg_form), data = train_data) - preds <- predict(mod, newdata = valid_data) + mod <- lm(as.formula(reg_form), data = train_data) + preds <- predict(mod, newdata = valid_data) # capture results to be returned as output - out <- list( - coef = data.frame(t(coef(mod))), - SE = ((preds - valid_data[, out_var_ind])^2) - ) - return(out) -} + out <- list( + coef = data.frame(t(coef(mod))), + SE = ((preds - valid_data[, out_var_ind])^2) + ) + return(out) +} # replicate the resubstitution estimate -resub <- make_folds(mtcars, fold_fun = folds_resubstitution)[[1]] -resub_results <- cv_lm(fold = resub, data = mtcars, reg_form = "mpg ~ .") -mean(resub_results$SE)
#> [1] 4.609201
+resub <- make_folds(mtcars, fold_fun = folds_resubstitution)[[1]] +resub_results <- cv_lm(fold = resub, data = mtcars, reg_form = "mpg ~ .") +mean(resub_results$SE) +
#> [1] 4.609201
# cross-validated estimate -folds <- make_folds(mtcars) -cv_results <- cross_validate( - cv_fun = cv_lm, folds = folds, data = mtcars, - reg_form = "mpg ~ ." -) -mean(cv_results$SE)
#> [1] 12.78283
############################################################################### +folds <- make_folds(mtcars) +cv_results <- cross_validate( + cv_fun = cv_lm, folds = folds, data = mtcars, + reg_form = "mpg ~ ." +) +mean(cv_results$SE) +
#> [1] 12.78283
############################################################################### # This example explains how to use the cross_validate function with # parallelization using the framework of the future package. ############################################################################### -suppressMessages(library(data.table)) -library(future) -data(mtcars) -set.seed(1) +suppressMessages(library(data.table)) +library(future) +data(mtcars) +set.seed(1) # make a lot of folds -folds <- make_folds(mtcars, fold_fun = folds_bootstrap, V = 1000) +folds <- make_folds(mtcars, fold_fun = folds_bootstrap, V = 1000) # function to calculate cross-validated squared error for linear regression -cv_lm <- function(fold, data, reg_form) { +cv_lm <- function(fold, data, reg_form) { # get name and index of outcome variable from regression formula - out_var <- as.character(unlist(str_split(reg_form, " "))[1]) - out_var_ind <- as.numeric(which(colnames(data) == out_var)) + out_var <- as.character(unlist(str_split(reg_form, " "))[1]) + out_var_ind <- as.numeric(which(colnames(data) == out_var)) # split up data into training and validation sets - train_data <- training(data) - valid_data <- validation(data) + train_data <- training(data) + valid_data <- validation(data) # fit linear model on training set and predict on validation set - mod <- lm(as.formula(reg_form), data = train_data) - preds <- predict(mod, newdata = valid_data) + mod <- lm(as.formula(reg_form), data = train_data) + preds <- predict(mod, newdata = valid_data) # capture results to be returned as output - out <- list( - coef = data.frame(t(coef(mod))), - SE = ((preds - valid_data[, out_var_ind])^2) - ) - return(out) -} - -plan(sequential) -time_seq <- system.time({ - results_seq <- cross_validate( - cv_fun = cv_lm, folds = folds, data = mtcars, - reg_form = "mpg ~ ." - ) -})
#> Warning: All iterations resulted in errors
-plan(multicore) -time_mc <- system.time({ - results_mc <- cross_validate( - cv_fun = cv_lm, folds = folds, data = mtcars, - reg_form = "mpg ~ ." - ) -})
#> Warning: All iterations resulted in errors
-if (availableCores() > 1) { - time_mc["elapsed"] < 1.2 * time_seq["elapsed"] -}
#> elapsed + out <- list( + coef = data.frame(t(coef(mod))), + SE = ((preds - valid_data[, out_var_ind])^2) + ) + return(out) +} + +plan(sequential) +time_seq <- system.time({ + results_seq <- cross_validate( + cv_fun = cv_lm, folds = folds, data = mtcars, + reg_form = "mpg ~ ." + ) +}) +
#> Warning: All iterations resulted in errors
+plan(multicore) +time_mc <- system.time({ + results_mc <- cross_validate( + cv_fun = cv_lm, folds = folds, data = mtcars, + reg_form = "mpg ~ ." + ) +}) +
#> Warning: All iterations resulted in errors
+if (availableCores() > 1) { + time_mc["elapsed"] < 1.2 * time_seq["elapsed"] +} +
#> elapsed #> FALSE
-

Site built with pkgdown 1.5.1.

+

Site built with pkgdown 1.6.1.

diff --git a/docs/reference/fold_from_foldvec.html b/docs/reference/fold_from_foldvec.html index 9486197..29b6be7 100644 --- a/docs/reference/fold_from_foldvec.html +++ b/docs/reference/fold_from_foldvec.html @@ -83,7 +83,7 @@ origami - 1.0.4 + 1.0.5 @@ -132,7 +132,7 @@

Build a Fold Object from a Fold Vector

IDs) and builds a fold object for fold V.

-
fold_from_foldvec(v, folds)
+
fold_from_foldvec(v, folds)

Arguments

A function that takes a 'fold' as its first argument and returns a list of results from that fold. NOTE: the use of an argument named 'X' is specifically disallowed in any input function for compliance -with the functions future_lapply and +with the functions future_lapply and lapply.

use_future

A logical option for whether to run the main loop -of cross-validation with future_lapply or with +of cross-validation with future_lapply or with lapply.

@@ -172,7 +172,7 @@

Contents

-

Site built with pkgdown 1.5.1.

+

Site built with pkgdown 1.6.1.

diff --git a/docs/reference/fold_funs.html b/docs/reference/fold_funs.html index 84c4e7d..e4bdd82 100644 --- a/docs/reference/fold_funs.html +++ b/docs/reference/fold_funs.html @@ -86,7 +86,7 @@ origami - 1.0.4 + 1.0.5 @@ -138,65 +138,65 @@

Cross-Validation Schemes

remaining arguments (e.g. V or pvalidation) on.

-
folds_vfold(n, V = 10L)
-
-folds_resubstitution(n)
-
-folds_loo(n)
-
-folds_montecarlo(n, V = 1000L, pvalidation = 0.2)
-
-folds_bootstrap(n, V = 1000L)
-
-folds_rolling_origin(n, first_window, validation_size, gap = 0L, batch = 1L)
-
-folds_rolling_window(n, window_size, validation_size, gap = 0L, batch = 1L)
-
-folds_rolling_origin_pooled(
-  n,
-  t,
-  id = NULL,
-  time = NULL,
-  first_window,
-  validation_size,
-  gap = 0L,
-  batch = 1L
-)
-
-folds_rolling_window_pooled(
-  n,
-  t,
-  id = NULL,
-  time = NULL,
-  window_size,
-  validation_size,
-  gap = 0L,
-  batch = 1L
-)
-
-folds_vfold_rolling_origin_pooled(
-  n,
-  t,
-  id = NULL,
-  time = NULL,
-  V = 10L,
-  first_window,
-  validation_size,
-  gap = 0L,
-  batch = 1L
-)
-
-folds_vfold_rolling_window_pooled(
-  n,
-  t,
-  id = NULL,
-  time = NULL,
-  V = 10L,
-  window_size,
-  validation_size,
-  gap = 0L,
-  batch = 1L
-)
+
folds_vfold(n, V = 10L)
+
+folds_resubstitution(n)
+
+folds_loo(n)
+
+folds_montecarlo(n, V = 1000L, pvalidation = 0.2)
+
+folds_bootstrap(n, V = 1000L)
+
+folds_rolling_origin(n, first_window, validation_size, gap = 0L, batch = 1L)
+
+folds_rolling_window(n, window_size, validation_size, gap = 0L, batch = 1L)
+
+folds_rolling_origin_pooled(
+  n,
+  t,
+  id = NULL,
+  time = NULL,
+  first_window,
+  validation_size,
+  gap = 0L,
+  batch = 1L
+)
+
+folds_rolling_window_pooled(
+  n,
+  t,
+  id = NULL,
+  time = NULL,
+  window_size,
+  validation_size,
+  gap = 0L,
+  batch = 1L
+)
+
+folds_vfold_rolling_origin_pooled(
+  n,
+  t,
+  id = NULL,
+  time = NULL,
+  V = 10L,
+  first_window,
+  validation_size,
+  gap = 0L,
+  batch = 1L
+)
+
+folds_vfold_rolling_window_pooled(
+  n,
+  t,
+  id = NULL,
+  time = NULL,
+  V = 10L,
+  window_size,
+  validation_size,
+  gap = 0L,
+  batch = 1L
+)

Arguments

@@ -283,7 +283,7 @@

Contents

-

Site built with pkgdown 1.5.1.

+

Site built with pkgdown 1.6.1.

diff --git a/docs/reference/fold_helpers.html b/docs/reference/fold_helpers.html index 12a5bf1..e528b58 100644 --- a/docs/reference/fold_helpers.html +++ b/docs/reference/fold_helpers.html @@ -82,7 +82,7 @@ origami - 1.0.4 + 1.0.5 @@ -130,11 +130,11 @@

Fold Helpers

Accessors and indexers for the different parts of a fold.

-
training(x = NULL, fold = NULL)
+    
training(x = NULL, fold = NULL)
 
-validation(x = NULL, fold = NULL)
+validation(x = NULL, fold = NULL)
 
-fold_index(x = NULL, fold = NULL)
+fold_index(x = NULL, fold = NULL)

Arguments

@@ -174,7 +174,7 @@

Contents

-

Site built with pkgdown 1.5.1.

+

Site built with pkgdown 1.6.1.

diff --git a/docs/reference/folds2foldvec.html b/docs/reference/folds2foldvec.html index 3ac6adc..804419f 100644 --- a/docs/reference/folds2foldvec.html +++ b/docs/reference/folds2foldvec.html @@ -84,7 +84,7 @@ origami - 1.0.4 + 1.0.5 @@ -131,10 +131,10 @@

Build a Fold Vector from a Fold Object

For V-fold type cross-validation. This takes a fold object and returns a fold vector (containing the validation set IDs) for use with other tools like -cv.glmnet.

+cv.glmnet.

-
folds2foldvec(folds)
+
folds2foldvec(folds)

Arguments

@@ -170,7 +170,7 @@

Contents

-

Site built with pkgdown 1.5.1.

+

Site built with pkgdown 1.6.1.

diff --git a/docs/reference/guess_combiner.html b/docs/reference/guess_combiner.html index 6ff36ef..cf7c968 100644 --- a/docs/reference/guess_combiner.html +++ b/docs/reference/guess_combiner.html @@ -82,7 +82,7 @@ origami - 1.0.4 + 1.0.5 @@ -130,7 +130,7 @@

Flexible Guessing and Mapping for Combining Data Types

Maps data types into standard combiners that should be sensible.

-
guess_combiner(result)
+
guess_combiner(result)

Arguments

@@ -160,7 +160,7 @@

Contents

-

Site built with pkgdown 1.5.1.

+

Site built with pkgdown 1.6.1.

diff --git a/docs/reference/id_folds_to_folds.html b/docs/reference/id_folds_to_folds.html index d40b3e6..55838de 100644 --- a/docs/reference/id_folds_to_folds.html +++ b/docs/reference/id_folds_to_folds.html @@ -83,7 +83,7 @@ origami - 1.0.4 + 1.0.5 @@ -132,7 +132,7 @@

Convert ID Folds to Observation Folds

observations

-
id_folds_to_folds(idfolds, cluster_ids)
+
id_folds_to_folds(idfolds, cluster_ids)

Arguments

@@ -164,7 +164,7 @@

Contents

-

Site built with pkgdown 1.5.1.

+

Site built with pkgdown 1.6.1.

diff --git a/docs/reference/index.html b/docs/reference/index.html index 725d166..4553305 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -81,7 +81,7 @@ origami - 1.0.4 + 1.0.5 @@ -240,7 +240,7 @@

Contents

-

Site built with pkgdown 1.5.1.

+

Site built with pkgdown 1.6.1.

diff --git a/docs/reference/make_fold.html b/docs/reference/make_fold.html index 7a40a2a..2eec985 100644 --- a/docs/reference/make_fold.html +++ b/docs/reference/make_fold.html @@ -82,7 +82,7 @@ origami - 1.0.4 + 1.0.5 @@ -130,7 +130,7 @@

Fold

Functions to make a fold. Current representation is a simple list.

-
make_fold(v, training_set, validation_set)
+
make_fold(v, training_set, validation_set)

Arguments

@@ -173,7 +173,7 @@

Contents

-

Site built with pkgdown 1.5.1.

+

Site built with pkgdown 1.6.1.

diff --git a/docs/reference/make_folds.html b/docs/reference/make_folds.html index 8004f5b..14d2c66 100644 --- a/docs/reference/make_folds.html +++ b/docs/reference/make_folds.html @@ -82,7 +82,7 @@ origami - 1.0.4 + 1.0.5 @@ -130,13 +130,13 @@

Make List of Folds for cross-validation

Generates a list of folds for a variety of cross-validation schemes.

-
make_folds(
-  n = NULL,
-  fold_fun = folds_vfold,
-  cluster_ids = NULL,
-  strata_ids = NULL,
-  ...
-)
+
make_folds(
+  n = NULL,
+  fold_fun = folds_vfold,
+  cluster_ids = NULL,
+  strata_ids = NULL,
+  ...
+)

Arguments

@@ -199,7 +199,7 @@

Contents

-

Site built with pkgdown 1.5.1.

+

Site built with pkgdown 1.6.1.

diff --git a/docs/reference/make_repeated_folds.html b/docs/reference/make_repeated_folds.html index 1ab7643..84ac290 100644 --- a/docs/reference/make_repeated_folds.html +++ b/docs/reference/make_repeated_folds.html @@ -84,7 +84,7 @@ origami - 1.0.4 + 1.0.5 @@ -134,7 +134,7 @@

Repeated Cross-Validation

make_folds and concatenating the results.

-
make_repeated_folds(repeats, ...)
+
make_repeated_folds(repeats, ...)

Arguments

@@ -172,7 +172,7 @@

Contents

-

Site built with pkgdown 1.5.1.

+

Site built with pkgdown 1.6.1.

diff --git a/docs/reference/wrap_in_try.html b/docs/reference/wrap_in_try.html index beb2b16..b04923b 100644 --- a/docs/reference/wrap_in_try.html +++ b/docs/reference/wrap_in_try.html @@ -82,7 +82,7 @@ origami - 1.0.4 + 1.0.5 @@ -130,7 +130,7 @@

Wrap a Function in a Try Statement

Function factory that generates versions of functions wrapped in try.

-
wrap_in_try(fun, ...)
+
wrap_in_try(fun, ...)

Arguments

@@ -161,7 +161,7 @@

Contents

-

Site built with pkgdown 1.5.1.

+

Site built with pkgdown 1.6.1.

diff --git a/tests/testthat/test-overall_timeseries.R b/tests/testthat/test-overall_timeseries.R index 649f2e3..5f574c6 100644 --- a/tests/testthat/test-overall_timeseries.R +++ b/tests/testthat/test-overall_timeseries.R @@ -145,7 +145,6 @@ if (require("forecast")) { folds_id1 <- make_folds(test_data_id, fold_fun = folds_rolling_origin_pooled, - t = 60, first_window = 10, id = 1, time = seq(1:60), validation_size = 5, gap = 0, batch = 20 )