diff --git a/.Rbuildignore b/.Rbuildignore index e4862de..1e3d35c 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,5 +1,6 @@ ^.*\.Rproj$ ^\.Rproj\.user$ +^\.github$ ^\.travis\.yml$ ^appveyor\.yml$ ^codecov\.yml$ diff --git a/.github/.gitignore b/.github/.gitignore new file mode 100644 index 0000000..2d19fc7 --- /dev/null +++ b/.github/.gitignore @@ -0,0 +1 @@ +*.html diff --git a/.github/workflows/R-CMD-check.yml b/.github/workflows/R-CMD-check.yml new file mode 100644 index 0000000..5f2d726 --- /dev/null +++ b/.github/workflows/R-CMD-check.yml @@ -0,0 +1,66 @@ +on: + push: + branches: + - master + - devel + pull_request: + branches: + - master + - devel + +name: R-CMD-check + +jobs: + R-CMD-check: + runs-on: ${{ matrix.config.os }} + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + + strategy: + fail-fast: false + matrix: + config: + - {os: ubuntu-latest, r: 'release'} + - {os: macOS-latest, r: 'release'} + - {os: windows-latest, r: 'release'} + + env: + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + + steps: + - name: Checkout repo + uses: actions/checkout@v2 + + - name: Setup R + uses: r-lib/actions/setup-r@master + with: + r-version: ${{ matrix.config.r }} + + - name: Install pandoc + uses: r-lib/actions/setup-pandoc@v1 + + - name: Install tinyTeX + uses: r-lib/actions/setup-tinytex@v1 + + - name: Install system dependencies + if: runner.os == 'Linux' + run: | + sudo apt install -y curl libcurl4-doc libcurl4-openssl-dev + + - name: Install package dependencies + run: | + install.packages(c("remotes", "rcmdcheck", "covr", "sessioninfo")) + remotes::install_deps(dependencies = TRUE) + shell: Rscript {0} + + - name: Check package + run: | + options(crayon.enabled = TRUE) + rcmdcheck::rcmdcheck(args = "--no-manual", error_on = "error") + shell: Rscript {0} + + - name: Upload code coverage + if: runner.os == 'Linux' + run: | + covr::codecov() + shell: Rscript {0} diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 479fdec..0000000 --- a/.travis.yml +++ /dev/null @@ -1,36 +0,0 @@ -branches: - only: - - master - -env: - global: - - RGL_USE_NULL=TRUE - -language: r -sudo: required -cache: packages -cran: https://cran.rstudio.com -warnings_are_errors: true - -before_install: - Rscript -e 'update.packages(ask = FALSE)' - -r_packages: - - covr - - devtools - -r: - - release - - devel - -r_github_packages: - - r-lib/covr - - r-lib/sessioninfo - -after_success: - - Rscript -e 'covr::codecov()' - -notifications: - email: - on_success: change - on_failure: change diff --git a/DESCRIPTION b/DESCRIPTION index c36ba66..06a3e18 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: origami Title: Generalized Framework for Cross-Validation -Version: 1.0.4 +Version: 1.0.5 Authors@R: c( person("Jeremy", "Coyle", email = "jeremyrcoyle@gmail.com", role = c("aut", "cre", "cph"), @@ -39,8 +39,9 @@ Suggests: rmarkdown, knitr, stringr, + glmnet, forecast, randomForest LazyData: true VignetteBuilder: knitr -RoxygenNote: 7.1.1.9000 +RoxygenNote: 7.1.2 diff --git a/NEWS.md b/NEWS.md index c72a0da..6e90442 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +# origami 1.0.5 +* Addition of `glmnet` to package `Suggests` since used in the vignette. +* Resolve issue stemming from the partial matching of argument names + (https://github.com/tlverse/origami/issues/56) by specifying throughout. + # origami 1.0.4 * Fix the incorrect use of `future.apply` by including `future.seed = TRUE`, as per https://github.com/tlverse/origami/issues/48. 
@@ -6,10 +11,10 @@ * Sped up all pooled time-series fold functions with `lapply`. * All pooled time-series fold functions (`folds_rolling_origin_pooled`, `folds_rolling_window_pooled`, `folds_vfold_rolling_origin_pooled`, - `folds_vfold_rolling_window_pooled`) now allow for variability in the - number of observations for each independent unit (i.e. subject). + `folds_vfold_rolling_window_pooled`) now allow for variability in the + number of observations for each independent unit (i.e., the subject). * Added test for pooled time-series cross-validation with multi-unit time-series - to test the update above. In this test, the total number of observations + to test the update above. In this test, the total number of observations and the time-points for which there are observations varies across the units. # origami 1.0.3 diff --git a/R/fold_funs.R b/R/fold_funs.R index 67db76c..6399377 100644 --- a/R/fold_funs.R +++ b/R/fold_funs.R @@ -45,7 +45,7 @@ folds_vfold <- function(n, V = 10L) { warning("n <= V so using leave-one-out CV") return(folds_loo(n)) } - folds <- rep(seq_len(V), length = n) + folds <- rep(x = seq_len(V), length.out = n) # shuffle folds folds <- sample(folds) @@ -157,7 +157,7 @@ folds_rolling_origin_pooled <- function(n, t, id = NULL, time = NULL, # make IDs the same length as time if only one ID provided if (length(id) == 1L) { - id <- rep(id, length(time)) + id <- rep(x = id, length.out = length(time)) } # make skeleton dataset with IDs and times @@ -202,7 +202,7 @@ folds_rolling_window_pooled <- function(n, t, id = NULL, time = NULL, # make IDs the same length as time if only one ID provided if (length(id) == 1L) { - id <- rep(id, length(time)) + id <- rep(x = id, length.out = length(time)) } # make skeleton dataset with IDs and times @@ -249,7 +249,7 @@ folds_vfold_rolling_origin_pooled <- function(n, t, id = NULL, time = NULL, # make IDs the same length as time if only one ID provided if (length(id) == 1L) { - id <- rep(id, length(time)) + id <- rep(x = id, length.out = length(time)) } # make skeleton dataset with IDs and times @@ -268,7 +268,7 @@ folds_vfold_rolling_origin_pooled <- function(n, t, id = NULL, time = NULL, message(paste("Processing", length(ids), "samples with", t, "time points.")) # establish V folds for cross-validating ids - Vfold_allocation <- sample(rep(seq_len(V), length = length(ids))) + Vfold_allocation <- sample(rep(x = seq_len(V), length.out = length(ids))) Vfolds_skeleton <- lapply(seq_len(V), fold_from_foldvec, Vfold_allocation) # establish rolling origin forecast for time-series cross-validation @@ -316,7 +316,7 @@ folds_vfold_rolling_window_pooled <- function(n, t, id = NULL, time = NULL, # make IDs the same length as time if only one ID provided if (length(id) == 1) { - id <- rep(id, length(time)) + id <- rep(x = id, length.out = length(time)) } # make skeleton dataset with IDs and times @@ -335,7 +335,7 @@ folds_vfold_rolling_window_pooled <- function(n, t, id = NULL, time = NULL, message(paste("Processing", length(ids), "samples with", t, "time points.")) # establish V folds for cross-validating ids - Vfold_allocation <- sample(rep(seq_len(V), length = length(ids))) + Vfold_allocation <- sample(rep(x = seq_len(V), length.out = length(ids))) Vfolds_skeleton <- lapply(seq_len(V), fold_from_foldvec, Vfold_allocation) # establish rolling origin forecast for time-series cross-validation diff --git a/README.Rmd b/README.Rmd index 7de218c..353c802 100644 --- a/README.Rmd +++ b/README.Rmd @@ -14,8 +14,7 @@ knitr::opts_chunk$set( # R/`origami` 
-[![Travis-CI Build Status](https://travis-ci.org/tlverse/origami.svg?branch=master)](https://travis-ci.org/tlverse/origami) -[![Build status](https://ci.appveyor.com/api/projects/status/bfe2jd9a065jhql7?svg=true)](https://ci.appveyor.com/project/tlverse/origami) +[![R-CMD-check](https://github.com/tlverse/origami/workflows/R-CMD-check/badge.svg)](https://github.com/tlverse/origami/actions) [![Coverage Status](https://codecov.io/gh/tlverse/origami/branch/master/graph/badge.svg)](https://codecov.io/gh/tlverse/origami) [![CRAN](http://www.r-pkg.org/badges/version/origami)](http://www.r-pkg.org/pkg/origami) [![CRAN downloads](https://cranlogs.r-pkg.org/badges/origami)](https://CRAN.R-project.org/package=origami) @@ -34,11 +33,11 @@ Phillips](https://github.com/rachaelvphillips) --- -## Description +## What's `origami`? -`origami` is an R package that provides a general framework for the application -of cross-validation schemes to particular functions. By allowing arbitrary lists -of results, `origami` accommodates a range of cross-validation applications. +The `origami` R package provides a general framework for the application of +cross-validation schemes to particular functions. By allowing arbitrary lists of +results, `origami` accommodates a range of cross-validation applications. --- @@ -124,10 +123,8 @@ issue](https://github.com/tlverse/origami/issues). ## Contributions -It is our hope that `origami` will grow to be adopted as a backend for most any -procedure requiring cross-validation, including its integration into larger -machine learning frameworks. To that end, contributions are very welcome, though -we ask that interested contributors consult our [contribution +Contributions are very welcome. Interested contributors should consult our +[contribution guidelines](https://github.com/tlverse/origami/blob/master/CONTRIBUTING.md) prior to submitting a pull request. @@ -154,7 +151,7 @@ After using the `origami` R package, please cite it: ## License -© 2017-2020 [Jeremy R. Coyle](https://github.com/jeremyrcoyle) +© 2017-2021 [Jeremy R. Coyle](https://github.com/jeremyrcoyle) The contents of this repository are distributed under the GPL-3 license. See file `LICENSE` for details. diff --git a/README.md b/README.md index ed1cfbf..7024a1f 100644 --- a/README.md +++ b/README.md @@ -3,10 +3,7 @@ # R/`origami` -[![Travis-CI Build -Status](https://travis-ci.org/tlverse/origami.svg?branch=master)](https://travis-ci.org/tlverse/origami) -[![Build -status](https://ci.appveyor.com/api/projects/status/bfe2jd9a065jhql7?svg=true)](https://ci.appveyor.com/project/tlverse/origami) +[![R-CMD-check](https://github.com/tlverse/origami/workflows/R-CMD-check/badge.svg)](https://github.com/tlverse/origami/actions) [![Coverage Status](https://codecov.io/gh/tlverse/origami/branch/master/graph/badge.svg)](https://codecov.io/gh/tlverse/origami) [![CRAN](http://www.r-pkg.org/badges/version/origami)](http://www.r-pkg.org/pkg/origami) @@ -32,11 +29,11 @@ Phillips](https://github.com/rachaelvphillips) ----- -## Description +## What’s `origami`? -`origami` is an R package that provides a general framework for the -application of cross-validation schemes to particular functions. By -allowing arbitrary lists of results, `origami` accommodates a range of +The `origami` R package provides a general framework for the application +of cross-validation schemes to particular functions. By allowing +arbitrary lists of results, `origami` accommodates a range of cross-validation applications. 
----- @@ -78,7 +75,7 @@ estimate of the mean: ``` r library(stringr) library(origami) -#> origami v1.0.3: Generalized Framework for Cross-Validation +#> origami v1.0.4: Generalized Framework for Cross-Validation set.seed(4795) data(mtcars) @@ -90,7 +87,6 @@ head(mtcars) #> Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 #> Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 #> Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 - # build a cv_fun that wraps around lm cv_lm <- function(fold, data, reg_form) { # get name and index of outcome variable from regression formula @@ -133,11 +129,8 @@ If you encounter any bugs or have any specific feature requests, please ## Contributions -It is our hope that `origami` will grow to be adopted as a backend for -most any procedure requiring cross-validation, including its integration -into larger machine learning frameworks. To that end, contributions are -very welcome, though we ask that interested contributors consult our -[contribution +Contributions are very welcome. Interested contributors should consult +our [contribution guidelines](https://github.com/tlverse/origami/blob/master/CONTRIBUTING.md) prior to submitting a pull request. @@ -166,7 +159,7 @@ After using the `origami` R package, please cite it: ## License -© 2017-2020 [Jeremy R. Coyle](https://github.com/jeremyrcoyle) +© 2017-2021 [Jeremy R. Coyle](https://github.com/jeremyrcoyle) The contents of this repository are distributed under the GPL-3 license. See file `LICENSE` for details. diff --git a/appveyor.yml b/appveyor.yml deleted file mode 100644 index 31ba01e..0000000 --- a/appveyor.yml +++ /dev/null @@ -1,69 +0,0 @@ -# DO NOT CHANGE the "init" and "install" sections below - -# Download script file from GitHub -init: - ps: | - $ErrorActionPreference = "Stop" - Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1" - Import-Module '..\appveyor-tool.ps1' -install: - ps: Bootstrap - -cache: - - C:\RLibrary -> appveyor.yml - -# Adapt as necessary starting from here -branches: - only: - - master - -environment: - global: - WARNINGS_ARE_ERRORS: 0 - R_ARCH: x64 - USE_RTOOLS: true - -build_script: - - travis-tool.sh install_deps - - travis-tool.sh install_github r-lib/covr - - travis-tool.sh install_github r-lib/sessioninfo - -test_script: - - travis-tool.sh run_tests - -on_failure: - - 7z a failure.zip *.Rcheck\* - - appveyor PushArtifact failure.zip - -on_success: - - Rscript -e "covr::codecov()" - -artifacts: - - path: '*.Rcheck\**\*.log' - name: Logs - - - path: '*.Rcheck\**\*.out' - name: Logs - - - path: '*.Rcheck\**\*.fail' - name: Logs - - - path: '*.Rcheck\**\*.Rout' - name: Logs - - - path: '\*_*.tar.gz' - name: Bits - - - path: '\*_*.zip' - name: Bits - -notifications: - - provider: Email - to: - - jeremyrcoyle@gmail.com - - nh@nimahejazi.org - subject: 'r-appveyor build {{status}}' - message: "https://ci.appveyor.com/project/tlverse/origami" - on_build_success: false - on_build_failure: true - on_build_status_changed: true diff --git a/cran-comments.md b/cran-comments.md index 323a2b0..fa9eacf 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,8 +1,7 @@ ## Test environments -* latest macOS (local), R 3.6.0 -* ubuntu 14.04 (on travis-ci), R 3.6.1 -* ubuntu 18.04 (local), R 3.6.2 -* windows (on appveyor-ci), R 3.6.1 +* ubuntu 20.04 (local + GitHub Actions), R 4.1.1 +* macOS 10.15 (local + GitHub Actions), R 4.1.1 +* windows 2019 (on GitHub Actions), R 4.1.1 ## R CMD check results There were no 
ERRORs, WARNINGs, or NOTEs diff --git a/docs/404.html b/docs/404.html index 340601d..ed5e6f3 100644 --- a/docs/404.html +++ b/docs/404.html @@ -81,7 +81,7 @@
@@ -143,7 +143,7 @@
vignettes/generalizedCV.Rmd
We’ll start by examining a fairly simple data set:
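A minimal sketch of the chunk that produces the output below, mirroring the `data(mtcars)` / `head(mtcars)` usage in the package README:

``` r
# load the mtcars data set and inspect the first few rows
data(mtcars)
head(mtcars)
```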
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
@@ -113,8 +114,9 @@
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
One might be interested in examining how the efficiency of a car, as measured by miles-per-gallon (mpg), is explained by various technical aspects of the car, with data across a variety of different models of cars. Linear regression is perhaps the simplest statistical procedure that could be used to make such deductions. Let’s try it out:
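A minimal sketch of the fit summarized below; the call `lm(formula = mpg ~ ., data = mtcars)` is taken from the printed output, while wrapping it in `summary()` is an assumption:

``` r
# regress miles-per-gallon on all other variables in mtcars
mod <- lm(mpg ~ ., data = mtcars)
summary(mod)
```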
##
## Call:
## lm(formula = mpg ~ ., data = mtcars)
@@ -143,45 +145,51 @@
## Multiple R-squared: 0.869, Adjusted R-squared: 0.8066
## F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07
We can assess how well the model fits the data by comparing the predictions of the linear model to the true outcomes observed in the data set. This is the well known (and standard) mean squared error. We can extract that from the lm
model object like so:
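The value reported next can be reproduced with a short sketch (assuming the error is summarized as the mean of the squared residuals, which matches the figure quoted below):

``` r
# refit on the full data set and take the mean of the squared residuals
mod <- lm(mpg ~ ., data = mtcars)
err <- mean(resid(mod)^2)
err
```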
The mean squared error is 4.6092009. There is an important problem that arises when we assess the model in this way – that is, we have trained our linear regression model on the full data set and assessed the error on the full data set, using up all of our data. We, of course, are generally not interested in how well the model explains variation in the observed data; rather, we are interested in how the explanation provided by the model generalizes to a target population from which the sample is presumably derived. Having used all of our available data, we cannot honestly evaluate how well the model fits (and thus explains) variation at the population level.
To resolve this issue, cross-validation allows for a particular procedure (e.g., linear regression) to be implemented over subsets of the data, evaluating how well the procedure fits on a testing (“validation”) set, thereby providing an honest evaluation of the error.
We can easily add cross-validation to our linear regression procedure using origami
. First, let us define a new function to perform linear regression on a specific partition of the data (called a “fold”):
cv_lm <- function(fold, data, reg_form) {
  # get name and index of outcome variable from regression formula
  out_var <- as.character(unlist(str_split(reg_form, " "))[1])
  out_var_ind <- as.numeric(which(colnames(data) == out_var))

  # split up data into training and validation sets
  train_data <- training(data)
  valid_data <- validation(data)

  # fit linear model on training set and predict on validation set
  mod <- lm(as.formula(reg_form), data = train_data)
  preds <- predict(mod, newdata = valid_data)

  # capture results to be returned as output
  out <- list(coef = data.frame(t(coef(mod))),
              SE = ((preds - valid_data[, out_var_ind])^2))
  return(out)
}
Our cv_lm
function is rather simple: we merely split the available data into training and validation sets, using the eponymous functions provided in origami
, fit the linear model on the training set, and evaluate the model on the testing set. This is a simple example of what origami
considers to be cv_fun
s – functions for using cross-validation to perform a particular routine over an input data set. Having defined such a function, we can simply generate a set of partitions using origami
’s make_folds
function, and apply our cv_lm
function over the resultant folds
object. Below, we replicate the resubstitution estimate of the error – we did this “by hand” above – using the functions make_folds
and cv_lm
.
library(origami)
## origami v1.0.5: Generalized Framework for Cross-Validation
library(stringr) # used in defining the cv_lm function above

# resubstitution estimate
resub <- make_folds(mtcars, fold_fun = folds_resubstitution)[[1]]
resub_results <- cv_lm(fold = resub, data = mtcars, reg_form = "mpg ~ .")
mean(resub_results$SE)
## [1] 4.609201
This (very nearly) matches the estimate of the error that we obtained above.
We can more honestly evaluate the error by V-fold cross-validation, which partitions the data into \(v\) subsets, fitting the model on \(v - 1\) of the subsets and evaluating on the subset that was held out for testing. This is repeated such that each subset is used for testing. We can easily apply our cv_lm
function using origami
’s cross_validate
(n.b., by default this performs 10-fold cross-validation):
# cross-validated estimate
folds <- make_folds(mtcars)
cvlm_results <- cross_validate(cv_fun = cv_lm, folds = folds, data = mtcars,
                               reg_form = "mpg ~ .")
mean(cvlm_results$SE)
## [1] 15.67379
Having performed 10-fold cross-validation, we quickly notice that our previous estimate of the model error (by resubstitution) was quite optimistic. The honest estimate of the error is several times larger.
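For reference, the number of folds is set by the fold-generating function; a sketch of requesting 5 folds instead of the default 10, assuming `folds_vfold`'s `V` argument is passed through `make_folds`:

``` r
# 5-fold instead of the default 10-fold cross-validation
folds_5 <- make_folds(mtcars, fold_fun = folds_vfold, V = 5)
cvlm_5 <- cross_validate(cv_fun = cv_lm, folds = folds_5, data = mtcars,
                         reg_form = "mpg ~ .")
mean(cvlm_5$SE)
```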
To examine origami
further, let us return to our example analysis using the mtcars
data set. Here, we will write a new cv_fun
type object. As an example, we will use L. Breiman’s randomForest
:
cv_rf <- function(fold, data, reg_form) {
  # get name and index of outcome variable from regression formula
  out_var <- as.character(unlist(str_split(reg_form, " "))[1])
  out_var_ind <- as.numeric(which(colnames(data) == out_var))

  # define training and validation sets based on input object of class "folds"
  train_data <- training(data)
  valid_data <- validation(data)

  # fit Random Forest regression on training set and predict on holdout set
  mod <- randomForest(formula = as.formula(reg_form), data = train_data)
  preds <- predict(mod, newdata = valid_data)

  # define output object to be returned as list (for flexibility)
  out <- list(coef = data.frame(mod$coefs),
              SE = ((preds - valid_data[, out_var_ind])^2))
  return(out)
}
Above, in writing our cv_rf
function to cross-validate randomForest
, we used our previous function cv_lm
as an example. For now, individual cv_fun
s must be written by hand; however, in future releases, a wrapper may be available to support auto-generating cv_fun
s to be used with origami
.
Below, we use cross_validate
to apply our new cv_rf
function over the folds
object generated by make_folds
.
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
folds <- make_folds(mtcars)
cvrf_results <- cross_validate(cv_fun = cv_rf, folds = folds, data = mtcars,
                               reg_form = "mpg ~ .")
mean(cvrf_results$SE)
## [1] 5.308908
Using 10-fold cross-validation (the default), we obtain an honest estimate of the prediction error of random forests. From this, we gather that the use of origami
’s cross_validate
procedure can be generalized to arbitrary estimation techniques, given the availability of an appropriate cv_fun
function.
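In general, a `cv_fun` only needs to follow the pattern above: split via `training()` and `validation()`, fit, predict, and return a named list. A schematic template (a sketch only; the model-fitting step is left as a placeholder):

``` r
# schematic cv_fun: one fold in, a named list of per-fold results out
cv_template <- function(fold, data, ...) {
  # training() and validation() use the current fold to index into the data
  train_data <- training(data)
  valid_data <- validation(data)

  # fit an estimator on train_data and evaluate it on valid_data here;
  # anything returned in this list is stacked across folds by cross_validate
  list(error = NA_real_)
}
```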
Cross-validation can also be used for forecast model selection in a time series setting. Here, the partitioning scheme mirrors the application of the forecasting model: we'll train the model on past observations (either all of those available or a recent subset), and then use it to forecast (predict) the next few observations. Consider the AirPassengers
dataset, a monthly time series of passenger air traffic in thousands of people.
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 1949 112 118 132 129 121 135 148 148 136 119 104 118
## 1950 115 126 141 135 125 149 170 170 158 133 114 140
@@ -264,44 +276,46 @@
## 1959 360 342 406 396 420 472 548 559 463 407 362 405
## 1960 417 391 419 461 472 535 622 606 508 461 390 432
Suppose we want to pick between two forecasting models, stl
and arima
(the details of these models are not important for this example). We can do that by evaluating their forecasting performance.
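Before writing the forecasting code, a small sketch of the partitioning scheme used below (assuming `folds_rolling_origin`'s default gap and batch settings): the first fold trains on the first 36 months and validates on the following 24, with later folds rolling the origin forward.

``` r
folds <- make_folds(AirPassengers, fold_fun = folds_rolling_origin,
                    first_window = 36, validation_size = 24)
# one fold per forecasting origin across the 144 monthly observations
length(folds)
```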
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
folds <- make_folds(AirPassengers, fold_fun = folds_rolling_origin,
                    first_window = 36, validation_size = 24)
fold <- folds[[1]]

# function to calculate cross-validated squared error
cv_forecasts <- function(fold, data) {
  train_data <- training(data)
  valid_data <- validation(data)
  valid_size <- length(valid_data)

  train_ts <- ts(log10(train_data), frequency = 12)

  # borrowed from AirPassengers help
  arima_fit <- arima(train_ts, c(0, 1, 1),
                     seasonal = list(order = c(0, 1, 1),
                                     period = 12))
  raw_arima_pred <- predict(arima_fit, n.ahead = valid_size)
  arima_pred <- 10^raw_arima_pred$pred
  arima_MSE <- mean((arima_pred - valid_data)^2)

  # stl model
  stl_fit <- stlm(train_ts, s.window = 12)
  raw_stl_pred <- forecast(stl_fit, h = valid_size)
  stl_pred <- 10^raw_stl_pred$mean
  stl_MSE <- mean((stl_pred - valid_data)^2)

  out <- list(mse = data.frame(fold = fold_index(),
                               arima = arima_MSE, stl = stl_MSE))
  return(out)
}

mses <- cross_validate(cv_fun = cv_forecasts, folds = folds,
                       data = AirPassengers)$mse
colMeans(mses[, c("arima", "stl")])
## arima stl
## 667.2477 925.7137
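Since the ARIMA specification attains the lower cross-validated MSE here, a natural follow-up is to refit it on the full series and forecast ahead; a sketch reusing the specification from `cv_forecasts` above:

``` r
# refit the selected ARIMA specification on the full (log10) series
full_ts <- ts(log10(AirPassengers), frequency = 12)
arima_full <- arima(full_ts, c(0, 1, 1),
                    seasonal = list(order = c(0, 1, 1), period = 12))
# forecast the next 12 months, back-transformed to the original scale
10^predict(arima_full, n.ahead = 12)$pred
```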
-## R version 3.6.3 (2020-02-29)
+## R version 4.1.1 (2021-08-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
-## Running under: Ubuntu 18.04.4 LTS
+## Running under: Ubuntu 20.04.3 LTS
##
## Matrix products: default
-## BLAS: /usr/lib/x86_64-linux-gnu/openblas/libblas.so.3
-## LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
+## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
+## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
@@ -329,26 +343,29 @@
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
-## [1] forecast_8.12 randomForest_4.6-14 stringr_1.4.0
-## [4] origami_1.0.4
+## [1] forecast_8.15 randomForest_4.6-14 stringr_1.4.0
+## [4] origami_1.0.5
##
## loaded via a namespace (and not attached):
-## [1] zoo_1.8-8 tidyselect_1.1.0 xfun_0.15 urca_1.3-0
-## [5] purrr_0.3.4 listenv_0.8.0 lattice_0.20-41 colorspace_1.4-1
-## [9] vctrs_0.3.1 generics_0.0.2 htmltools_0.5.0 yaml_2.2.1
-## [13] rlang_0.4.6 pkgdown_1.5.1 pillar_1.4.4 glue_1.4.1
-## [17] TTR_0.23-6 lifecycle_0.2.0 quantmod_0.4.17 timeDate_3043.102
-## [21] munsell_0.5.0 gtable_0.3.0 future_1.17.0 codetools_0.2-16
-## [25] memoise_1.1.0 evaluate_0.14 knitr_1.29 tseries_0.10-47
-## [29] lmtest_0.9-37 curl_4.3 parallel_3.6.3 xts_0.12-0
-## [33] Rcpp_1.0.5 scales_1.1.1 backports_1.1.8 desc_1.2.0
-## [37] abind_1.4-5 fs_1.4.2 fracdiff_1.5-1 ggplot2_3.3.2
-## [41] digest_0.6.25 stringi_1.4.6 dplyr_1.0.0 grid_3.6.3
-## [45] rprojroot_1.3-2 quadprog_1.5-8 tools_3.6.3 magrittr_1.5
-## [49] tibble_3.0.1 crayon_1.3.4 future.apply_1.6.0 pkgconfig_2.0.3
-## [53] MASS_7.3-51.6 ellipsis_0.3.1 data.table_1.12.8 assertthat_0.2.1
-## [57] rmarkdown_2.3 R6_2.4.1 globals_0.12.5 nlme_3.1-148
-## [61] nnet_7.3-14 compiler_3.6.3
+## [1] Rcpp_1.0.7 lattice_0.20-45 listenv_0.8.0 zoo_1.8-9
+## [5] assertthat_0.2.1 rprojroot_2.0.2 digest_0.6.28 lmtest_0.9-38
+## [9] utf8_1.2.2 parallelly_1.28.1 R6_2.5.1 evaluate_0.14
+## [13] ggplot2_3.3.5 pillar_1.6.2 rlang_0.4.11 curl_4.3.2
+## [17] data.table_1.14.0 TTR_0.24.2 fracdiff_1.5-1 jquerylib_0.1.4
+## [21] rmarkdown_2.11 pkgdown_1.6.1 textshaping_0.3.5 desc_1.3.0
+## [25] munsell_0.5.0 compiler_4.1.1 xfun_0.26 pkgconfig_2.0.3
+## [29] systemfonts_1.0.2 urca_1.3-0 globals_0.14.0 htmltools_0.5.2
+## [33] nnet_7.3-16 tidyselect_1.1.1 tibble_3.1.4 quadprog_1.5-8
+## [37] codetools_0.2-18 fansi_0.5.0 future_1.22.1 crayon_1.4.1
+## [41] dplyr_1.0.7 grid_4.1.1 nlme_3.1-153 jsonlite_1.7.2
+## [45] gtable_0.3.0 lifecycle_1.0.0 DBI_1.1.1 magrittr_2.0.1
+## [49] scales_1.1.1 quantmod_0.4.18 future.apply_1.8.1 stringi_1.7.4
+## [53] cachem_1.0.6 tseries_0.10-48 fs_1.5.0 timeDate_3043.102
+## [57] bslib_0.3.0 xts_0.12.1 ellipsis_0.3.2 ragg_1.1.3
+## [61] vctrs_0.3.8 generics_0.1.0 tools_4.1.1 glue_1.4.2
+## [65] purrr_0.3.4 abind_1.4-5 parallel_4.1.1 fastmap_1.1.0
+## [69] yaml_2.2.1 colorspace_2.0-2 memoise_2.0.0 knitr_1.34
+## [73] sass_0.4.0
@@ -87,19 +87,21 @@
Authors: Jeremy Coyle, Nima Hejazi, Ivana Malenica, and Rachael Phillips
-origami is an R package that provides a general framework for the application of cross-validation schemes to particular functions. By allowing arbitrary lists of results, origami accommodates a range of cross-validation applications.
+What's origami?
+The origami R package provides a general framework for the application of cross-validation schemes to particular functions. By allowing arbitrary lists of results, origami accommodates a range of cross-validation applications.
For standard use, we recommend installing the package from CRAN via
install.packages("origami")
You can install a stable release of origami
from GitHub via devtools
with:
devtools::install_github("tlverse/origami")
This minimal example shows how to use origami
to apply cross-validation to the computation of a simple descriptive statistic using a sample data set. In particular, we obtain a cross-validated estimate of the mean:
library(stringr)
library(origami)
#> origami v1.0.4: Generalized Framework for Cross-Validation

set.seed(4795)

data(mtcars)
head(mtcars)
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
#> Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
@@ -126,32 +129,31 @@
#> Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
#> Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
#> Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1

# build a cv_fun that wraps around lm
cv_lm <- function(fold, data, reg_form) {
  # get name and index of outcome variable from regression formula
  out_var <- as.character(unlist(str_split(reg_form, " "))[1])
  out_var_ind <- as.numeric(which(colnames(data) == out_var))

  # split up data into training and validation sets
  train_data <- training(data)
  valid_data <- validation(data)

  # fit linear model on training set and predict on validation set
  mod <- lm(as.formula(reg_form), data = train_data)
  preds <- predict(mod, newdata = valid_data)

  # capture results to be returned as output
  out <- list(coef = data.frame(t(coef(mod))),
              SE = ((preds - valid_data[, out_var_ind])^2))
  return(out)
}

folds <- make_folds(mtcars)
results <- cross_validate(cv_fun = cv_lm, folds = folds, data = mtcars,
                          reg_form = "mpg ~ .")
mean(results$SE)
#> [1] 15.18558
For details on how to write wrappers (cv_fun
s) for use with origami::cross_validate
, please consult the documentation and vignettes that accompany the package.
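For larger problems, the folds can also be processed in parallel through the `future` framework; a sketch assuming `cross_validate`'s documented `use_future` argument and a `multisession` plan:

``` r
library(future)
plan(multisession)

folds <- make_folds(mtcars)
results <- cross_validate(cv_fun = cv_lm, folds = folds, data = mtcars,
                          reg_form = "mpg ~ .", use_future = TRUE)
mean(results$SE)
```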
-It is our hope that origami will grow to be adopted as a backend for most any procedure requiring cross-validation, including its integration into larger machine learning frameworks. To that end, contributions are very welcome, though we ask that interested contributors consult our contribution guidelines prior to submitting a pull request.
+Contributions are very welcome. Interested contributors should consult our contribution guidelines prior to submitting a pull request.
-© 2017-2020 Jeremy R. Coyle
+© 2017-2021 Jeremy R. Coyle
The contents of this repository are distributed under the GPL-3 license. See file LICENSE
for details.
NEWS.md
[rendered changelog (docs/news/index.html), regenerated here to include the origami 1.0.5 entries already listed in NEWS.md above]
check_id_and_time(id, time)+
check_id_and_time(id, time)
A function that takes a 'fold' as it's first argument and
returns a list of results from that fold. NOTE: the use of an argument
named 'X' is specifically disallowed in any input function for compliance
-with the functions |
|
use_future | A |
---|---|