Skip to content

Commit

Permalink
Merge make dummies (#243)
Browse files Browse the repository at this point in the history
  • Loading branch information
aredelmeier authored Nov 20, 2020
1 parent f1a693e commit f7d742c
Show file tree
Hide file tree
Showing 10 changed files with 394 additions and 292 deletions.
3 changes: 2 additions & 1 deletion .lintr
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ linters: with_defaults(
line_length_linter = lintr::line_length_linter(120),
object_name_linter = NULL,
object_usage_linter = NULL,
seq_linter = NULL
seq_linter = NULL,
cyclocomp_linter = lintr::cyclocomp_linter()
)
exclusions: list(
"inst/scripts/compare_shap_python.R",
Expand Down
3 changes: 2 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ S3method(prepare_data,ctree)
S3method(prepare_data,empirical)
S3method(prepare_data,gaussian)
export(aicc_full_single_cpp)
export(apply_dummies)
export(correction_matrix_cpp)
export(create_ctree)
export(explain)
Expand Down Expand Up @@ -66,6 +65,8 @@ importFrom(graphics,hist)
importFrom(graphics,plot)
importFrom(graphics,rect)
importFrom(stats,as.formula)
importFrom(stats,model.frame)
importFrom(stats,model.matrix)
importFrom(stats,predict)
importFrom(utils,head)
importFrom(utils,tail)
Expand Down
224 changes: 149 additions & 75 deletions R/features.R
Original file line number Diff line number Diff line change
Expand Up @@ -174,14 +174,27 @@ helper_feature <- function(m, feature_sample) {

#' Initiate the making of dummy variables
#'
#' @param data data.table or data.frame. Includes all the features (both factors and possibly others).
#' @param traindata data.table or data.frame.
#'
#' @param testdata data.table or data.frame. New data that has the same
#' feature names, types, and levels as \code{traindata}.
#'
#' @return A list that contains the following entries:
#' \describe{
#' \item{obj}{List, Contains \describe{
#' \item{features}{Vector. Contains the names of all the features in \code{data}.}
#' \item{factor_features}{Vector. Contains the names of all the factors in \code{data}.}
#' \item{factor_list}{List. Contains each factor and its vector of levels.}
#' \item{contrasts_list}{List. Contains all the contrasts of the factors.}
#' }}
#' \item{train_dummies}{A data.frame containing all of the factors in \code{traindata} as
#' one-hot encoded variables.}
#' \item{test_dummies}{A data.frame containing all of the factors in \code{testdata} as
#' one-hot encoded variables.}
#' \item{traindata_new}{Original traindata with correct column ordering and factor levels. To be passed to
#' \code{shapr()}.}
#' \item{testdata_new}{Original testdata with correct column ordering and factor levels. To be passed to
#' \code{explain()}.}
#' }
#'
#' @export
Expand All @@ -191,117 +204,178 @@ helper_feature <- function(m, feature_sample) {
#' @examples
#'
#' data("Boston", package = "MASS")
#'
#' x_var <- c("lstat", "chas", "rad", "indus")
#' x_var <- c("lstat", "rm", "dis", "indus")
#' y_var <- "medv"
#' x_train <- as.data.frame(Boston[401:411, x_var])
#' y_train <- Boston[401:408, y_var]
#' x_test <- as.data.frame(Boston[1:4, x_var])
#'
#' # convert to factors
#' Boston$rad = as.factor(Boston$rad)
#' Boston$chas = as.factor(Boston$chas)
#' # convert to factors for illustational purpose
#' x_train$rm <- factor(round(x_train$rm))
#' x_test$rm <- factor(round(x_test$rm), levels = levels(x_train$rm))
#'
#' x_train <- Boston[-1:-6, x_var]
#' y_train <- Boston[-1:-6, y_var]
#' x_test <- Boston[1:6, x_var]
#' dummylist <- make_dummies(traindata = x_train, testdata = x_test)
#'
#' dummylist <- make_dummies(data = rbind(x_train, x_test))
#'
make_dummies <- function(data) {
make_dummies <- function(traindata, testdata) {

contrasts <- features <- factor_features <- NULL # due to NSE notes in R CMD check
if (is.null(colnames(data))) {
stop("data must have column names.")
contrasts <- features <- factor_features <- NULL # due to NSE notes in R CMD check
# <- model.frame <- model.matrix
if (is.null(colnames(traindata))) {
stop("traindata must have column names.")
}
if (is.null(colnames(testdata))) {
stop("testdata must have column names.")
}
data <- data.table::as.data.table(as.data.frame(data, stringsAsFactors = FALSE))

traindata <- data.table::as.data.table(traindata)
testdata <- data.table::as.data.table(testdata)

features <- colnames(data)
if (length(unique(features)) < length(features)) {
stop("Features must have unique names.")
if (length(colnames(traindata)) != length(colnames(testdata))) {
stop("traindata and testdata must have the same number of columns.")
}
p <- sapply(data[, features, with = FALSE], is.factor)
p_sum <- sum(p)

if (p_sum > 0) {
factor_features <- features[p]
factor_list <- lapply(data[, factor_features, with = FALSE], levels)
if (!all(sort(colnames(traindata)) == sort(colnames(testdata)))) {
stop("traindata and testdata must have the same column names.")
}

features <- colnames(traindata)
# feature names must be unique
if (any(duplicated(features))) {
stop("Both traindata and testdata must have unique column names.")
}

# Check if any features have empty names i.e ""
if (any(features == "")) {
stop("One or more features is missing a name.")
}

# In case the testing data has a different column order than the training data:
testdata <- testdata[, features, with = FALSE]

# Check if the features all have class "integer", "numeric" or "factor
if (!all(sapply(traindata, class) %in% c("integer", "numeric", "factor"))) {
stop("All traindata must have class integer, numeric or factor.")
}
if (!all(sapply(testdata, class) %in% c("integer", "numeric", "factor"))) {
stop("All testdata must have class integer, numeric or factor.")
}
# Check if traindata and testdata have features with the same class
if (!all(sapply(traindata, class) == sapply(testdata, class))) {
stop("All traindata and testdata must have the same classes.")
}

# Check that traindata and testdata have the same levels for the factor features
is_factor <- sapply(traindata, is.factor) # check which features are factors
nb_factor <- sum(is_factor)

list_levels_train <- lapply(traindata[, is_factor, with = FALSE], function(x) sort(levels(x)))
list_levels_test <- lapply(testdata[, is_factor, with = FALSE], function(x) sort(levels(x)))

if (!identical(list_levels_train, list_levels_test)) {
stop("Levels of categorical variables in traindata and testdata must be the same.")
}

# re-level traindata and testdata
for (i in names(list_levels_train)) {
traindata[[i]] <- factor(traindata[[i]], levels = list_levels_train[[i]])
}
for (i in names(list_levels_test)) {
testdata[[i]] <- factor(testdata[[i]], levels = list_levels_test[[i]])
}

if (nb_factor > 0) {
factor_features <- features[is_factor]
factor_list <- lapply(traindata[, factor_features, with = FALSE], levels)
} else {
factor_features <- NULL
factor_list <- NULL
}
contrasts_list <- lapply(data[, factor_features, with = FALSE], contrasts, contrasts = FALSE)

contrasts_list <- lapply(traindata[, factor_features, with = FALSE], contrasts, contrasts = FALSE)

obj <- list(features = features,
factor_features = factor_features,
factor_list = factor_list,
contrasts_list = contrasts_list,
class_vector = sapply(traindata, class))

# get train dummies
m <- model.frame(data = traindata,
xlev = obj$factor_list)
train_dummies <- model.matrix(object = ~. + 0,
data = m,
contrasts.arg = obj$contrasts_list)

# get test dummies
m <- model.frame(data = testdata,
xlev = obj$factor_list)
test_dummies <- model.matrix(object = ~. + 0,
data = m,
contrasts.arg = obj$contrasts_list)


r <- list(data = data,
features = features,
factor_features = factor_features,
factor_list = factor_list,
contrasts_list = contrasts_list)
return(r)
return(list(obj = obj, train_dummies = train_dummies, test_dummies = test_dummies, traindata_new = traindata,
testdata_new = testdata))

}

#' Make dummy variables
#' Make dummy variables - this is an internal function intended only to be used in
#' predict_model.xgb.Booster()
#'
#' @param obj List. Output of \code{make_dummies}.
#' @param obj List. Output of \code{make_dummies$obj}.
#'
#' @param newdata data.table or data.frame. New data (features) that has the same
#' features as the data used in \code{make_dummies}.
#' @param testdata data.table or data.frame. New data that has the same
#' feature names, types, and levels as \code{obj$data}.
#'
#' @return A data.frame containing all of the factors in \code{new_data} as
#' @return A data.frame containing all of the factors in \code{testdata} as
#' one-hot encoded variables.
#'
#' @export
#'
#' @author Annabelle Redelmeier
#'
#' @examples
#'
#' data("Boston", package = "MASS")
#'
#' x_var <- c("lstat", "chas", "rad", "indus")
#' y_var <- "medv"
#'
#' # convert to factors
#' Boston$rad = as.factor(Boston$rad)
#' Boston$chas = as.factor(Boston$chas)
#'
#' x_train <- Boston[-1:-6, x_var]
#' y_train <- Boston[-1:-6, y_var]
#' x_test <- Boston[1:6, x_var]
#'
#' dummylist <- make_dummies(data = rbind(x_train, x_test))
#'
#' x_train_dummies <- apply_dummies(obj = dummylist, newdata = x_train)
#' @keywords internal
#'
apply_dummies <- function(obj, newdata) {
apply_dummies <- function(obj, testdata) {

features <- NULL # due to NSE notes in R CMD check
if (is.null(colnames(testdata))) {
stop("testdata must have column names.")
}

features <- model.frame <- model.matrix <- NULL # due to NSE notes in R CMD check
if (is.null(newdata)) {
stop("newdata needs to be included.")
testdata <- data.table::as.data.table(testdata)
features <- obj$features

if (length(features) != length(colnames(testdata))) {
stop("testdata must have the same number of columns as traindata.")
}
if (is.null(colnames(newdata))) {
stop("newdata must have column names.")
if (!all(sort(features) == sort(colnames(testdata)))) {
stop("testdata must have the same column names as traindata.")
}
newdata <- data.table::as.data.table(as.data.frame(newdata, stringsAsFactors = FALSE))

# in case the testing data has a different column order or more columns than the training data:
testdata <- testdata[, features, with = FALSE]

# check all features are in newdata
if (!all(obj$features %in% names(newdata))) {
stop("Some features missing from newdata.")
if (!all(sapply(testdata, class) %in% c("integer", "numeric", "factor"))) {
stop("All testdata must have class integer, numeric or factor.")
}
if (!all(obj$class_vector == sapply(testdata, class))) {
stop("All traindata and testdata must have the same classes.")
}

# check that all features have the correct data type
for (i in obj$features) {
if (class(newdata[[i]]) != class(obj$data[[i]])) {
stop("All features must have the same type as original data.")
}
# Check that traindata and testdata have the same levels for the factor features
is_factor <- obj$factor_features
list_levels_train <- obj$factor_list
list_levels_test <- lapply(testdata[, is_factor, with = FALSE], function(x) sort(levels(x)))

if (!identical(list_levels_train, list_levels_test)) {
stop("Levels of categorical variables in traindata and testdata must be the same.")
}

features <- obj$features
newdata_sub <- newdata[, features, with = FALSE]
# re-level testdata
for (i in names(list_levels_test)) {
testdata[[i]] <- factor(testdata[[i]], levels = list_levels_test[[i]])
}

m <- model.frame(data = newdata_sub,
#na.action = na.pass,
m <- model.frame(data = testdata,
xlev = obj$factor_list)

x <- model.matrix(object = ~. + 0,
Expand Down
4 changes: 2 additions & 2 deletions R/models.R
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,8 @@ predict_model.xgb.Booster <- function(x, newdata) {
# Test model type
model_type <- model_type(x)

if (model_type %in% c("cat_regression", "cat_classification" ) ) {
newdata_dummy <- apply_dummies(obj = x$dummylist, newdata = newdata)
if (model_type %in% c("cat_regression", "cat_classification")) {
newdata_dummy <- apply_dummies(obj = x$dummylist, testdata = newdata)
predict(x, as.matrix(newdata_dummy))
} else {
predict(x, as.matrix(newdata))
Expand Down
4 changes: 4 additions & 0 deletions R/shapr-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
#'
#' @importFrom stats as.formula
#'
#' @importFrom stats model.matrix
#'
#' @importFrom stats model.frame
#'
#' @importFrom Rcpp sourceCpp
#'
#' @keywords internal
Expand Down
Loading

0 comments on commit f7d742c

Please sign in to comment.