From febca9e96849df49d1d68ea6bb0bfe533f69530f Mon Sep 17 00:00:00 2001
From: DavZim
Date: Mon, 23 Oct 2023 15:42:54 +0200
Subject: [PATCH] add first draft of describe (#5)

---
 NAMESPACE                      |   1 +
 R/check_data.R                 |   8 +-
 R/describe.R                   | 276 +++++++++++++++++++++++++++++++++
 README.Rmd                     |  16 +-
 README.md                      |  95 +++++++++---
 man/describe.Rd                |  29 ++++
 tests/testthat/test-describe.R | 133 ++++++++++++++++
 7 files changed, 526 insertions(+), 32 deletions(-)
 create mode 100644 R/describe.R
 create mode 100644 man/describe.Rd
 create mode 100644 tests/testthat/test-describe.R

diff --git a/NAMESPACE b/NAMESPACE
index 5580940..bb437b8 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -3,6 +3,7 @@
 S3method(print,rule)
 S3method(print,ruleset)
 export(check_data)
+export(describe)
 export(detect_backend)
 export(filter_fails)
 export(plot_res)
diff --git a/R/check_data.R b/R/check_data.R
index 89dd000..f1b3559 100644
--- a/R/check_data.R
+++ b/R/check_data.R
@@ -103,15 +103,15 @@ detect_backend <- function(x) {
 
   } else if ("tbl_sql" %in% cc) {
 
-    if (!has_pkg("DBI"))
-      stop("The DBI package needs to be installed in order to test a tbl_sql.")
+    if (!has_pkg("DBI") || !has_pkg("dbplyr"))
+      stop("The DBI and dbplyr packages need to be installed in order to test a tbl_sql.")
 
     backend <- "collectibles"
 
   } else if ("ArrowObject" %in% cc) {
 
-    if (!has_pkg("arrow"))
-      stop("The arrow package needs to be installed in order to test an ArrowObject.")
+    if (!has_pkg("arrow") || !has_pkg("dbplyr"))
+      stop("The arrow and dbplyr packages need to be installed in order to test an ArrowObject.")
 
     backend <- "collectibles"
 
diff --git a/R/describe.R b/R/describe.R
new file mode 100644
index 0000000..44ff9e9
--- /dev/null
+++ b/R/describe.R
@@ -0,0 +1,276 @@
+#' Describes a dataset
+#'
+#' Note that the current version is in beta at best; describing the R-native
+#' formats (data.frame, dplyr/tibble, or data.table) is a lot faster than
+#' describing arrow or SQL-based datasets.
+#'
+#' @param x a dataset, either a [`data.frame`], [`dplyr::tibble`], [`data.table::data.table`],
+#' [`arrow::arrow_table`], [`arrow::open_dataset`], or [`dplyr::tbl`] (SQL connection)
+#'
+#' @return a `data.frame`, `dplyr::tibble`, or `data.table::data.table` containing
+#' a summary of the given dataset
+#' @export
+#'
+#' @seealso Similar to [skimr::skim()](https://cran.r-project.org/web/packages/skimr/vignettes/skimr.html),
+#' [summarytools::dfSummary()](https://cran.r-project.org/web/packages/summarytools/vignettes/introduction.html#data-frame-summaries-dfsummary),
+#' and [gtExtras::gt_plt_summary()](https://jthomasmock.github.io/gtExtras/reference/gt_plt_summary.html)
+#'
+#' @examples
+#' describe(mtcars)
+describe <- function(x) {
+
+  backend <- detect_backend(x)
+
+  # make sure the input dataset has the right class for the chosen backend
+  if (class(x)[[1]] == "data.frame") {
+    if (backend == "data.table") {
+      x <- data.table::as.data.table(x)
+    } else if (backend == "dplyr") {
+      x <- dplyr::as_tibble(x)
+    }
+  }
+
+  if (backend == "base-r") {
+    describe_base_r(x)
+  } else if (backend == "dplyr") {
+    describe_dplyr(x)
+  } else if (backend == "data.table") {
+    describe_data.table(x)
+  } else if (backend == "collectibles") {
+    if ("tbl_sql" %in% class(x)) {
+      describe_sql(x)
+    } else if ("ArrowObject" %in% class(x)) {
+      describe_arrow(x)
+    }
+  } else {
+    stop(sprintf("Could not detect backend to describe %s", paste(class(x), collapse = ", ")))
+  }
+}
+
+
+# internal helper to decide which columns get the numeric summary (min/max etc.)
+is_numeric <- function(v) {
+  any(class(v) %in% c("integer", "numeric", "POSIXt"))
+}
+
+# x <- mtcars
+describe_base_r <- function(x, max_n = 3) {
+  ll <- lapply(
+    seq_len(ncol(x)),
+    function(i) {
+      v <- x[[i]]
+      type <- class(v)[[1]]
+      is_num <- is_numeric(v)
+
+      # count the occurrences of each unique value (NA included) ...
+      uv <- unique(v)
+      tab <- tabulate(match(v, uv))
+      # ... and get the indices of the (up to) max_n highest counts
+      od <- order(tab, decreasing = TRUE)[seq_len(min(max_n, length(tab)))]
+
+      # for non-numeric columns, summary statistics are taken over string lengths
+      nz <- if (!is_num) nchar(as.character(v))
+
+      data.frame(
+        var = names(x)[[i]],
+        type = type,
+        n = length(v),
+        n_distinct = length(uv),
+        n_na = sum(is.na(v)),
+        most_frequent = paste(sprintf("%s (%s)", uv[od], tab[od]),
+                              collapse = ", "),
+
+        min = as.numeric(min(if (is_num) v else nz, na.rm = TRUE)),
+        mean = as.numeric(mean(if (is_num) v else nz, na.rm = TRUE)),
+        median = as.numeric(median(if (is_num) v else nz, na.rm = TRUE)),
+        max = as.numeric(max(if (is_num) v else nz, na.rm = TRUE)),
+        sd = as.numeric(sd(if (is_num) v else nz, na.rm = TRUE))
+      )
+    }
+  )
+
+  do.call(rbind, ll)
+}
+
+# x <- mtcars |> tibble::as_tibble()
+describe_dplyr <- function(x, max_n = 3) {
+  ll <- lapply(
+    names(x),
+    function(v) {
+      mc <- x |>
+        dplyr::count(.data[[v]]) |>
+        dplyr::slice_max(n, n = max_n, with_ties = FALSE)
+
+      type <- class(mc[[1]])[[1]]
+      is_num <- is_numeric(mc[[1]])
+      mf <- paste(sprintf("%s (%s)", mc[[1]], mc[[2]]), collapse = ", ")
+
+      # for non-numeric columns, summary statistics are taken over string lengths
+      nz <- if (!is_num) nchar(as.character(x[[v]]))
+      x |>
+        dplyr::summarise(
+          var = v,
+          type = type,
+          n = dplyr::n(),
+          n_distinct = dplyr::n_distinct(.data[[v]]),
+          n_na = sum(is.na(.data[[v]])),
+          most_frequent = mf,
+          min = as.numeric(min(if (is_num) .data[[v]] else nz, na.rm = TRUE)),
+          mean = as.numeric(mean(if (is_num) .data[[v]] else nz, na.rm = TRUE)),
+          median = as.numeric(median(if (is_num) .data[[v]] else nz, na.rm = TRUE)),
+          max = as.numeric(max(if (is_num) .data[[v]] else nz, na.rm = TRUE)),
+          sd = as.numeric(sd(if (is_num) .data[[v]] else nz, na.rm = TRUE))
+        )
+    }
+  )
+
+  dplyr::bind_rows(ll)
+}
+
+# x <- mtcars |> data.table::as.data.table()
+describe_data.table <- function(x, max_n = 3) {
+  ll <- lapply(
+    names(x),
+    function(v) {
+      # head() guards against columns with fewer than max_n distinct values
+      # (indexing with seq(max_n) would produce NA rows there)
+      mc <- utils::head(x[, .(n = .N), by = v][order(n, decreasing = TRUE)], max_n)
+
+      type <- class(mc[[1]])[[1]]
+      is_num <- is_numeric(mc[[1]])
+      mf <- paste(sprintf("%s (%s)", mc[[1]], mc[[2]]), collapse = ", ")
+
+      # for non-numeric columns, summary statistics are taken over string lengths
+      nz <- if (!is_num) nchar(as.character(x[[v]]))
+
+      x[, .(
+        var = v,
+        type = type,
+        n = .N,
+        n_distinct = data.table::uniqueN(get(v)),
+        n_na = sum(is.na(get(v))),
+        most_frequent = mf,
+
+        min = as.numeric(min(if (is_num) get(v) else nz, na.rm = TRUE)),
+        mean = as.numeric(mean(if (is_num) get(v) else nz, na.rm = TRUE)),
+        median = as.numeric(median(if (is_num) get(v) else nz, na.rm = TRUE)),
+        max = as.numeric(max(if (is_num) get(v) else nz, na.rm = TRUE)),
+        sd = as.numeric(sd(if (is_num) get(v) else nz, na.rm = TRUE))
+      )]
+    }
+  )
+
+  data.table::rbindlist(ll)
+}
+
+
+# RSQLite, duckdb, etc.
+describe_sql <- function(x, max_n = 3) {
+  ll <- lapply(names(x), function(v) {
+    mc <- x |>
+      dplyr::count(.data[[v]]) |>
+      dplyr::slice_max(n, n = max_n, with_ties = FALSE) |>
+      dplyr::collect()
+
+    type <- class(mc[[1]])[[1]]
+    is_num <- is_numeric(mc[[1]])
+    mf <- paste(sprintf("%s (%s)", mc[[1]], mc[[2]]), collapse = ", ")
+    nn <- x |>
+      dplyr::distinct(.data[[v]]) |>
+      dplyr::summarise(n = dplyr::n()) |>
+      dplyr::collect()
+    nna <- x |> dplyr::filter(is.na(.data[[v]])) |> dplyr::collect() |> nrow()
+
+    r <- dplyr::tibble(
+      var = v,
+      type = type,
+      n_distinct = nn[[1]],
+      n_na = nna,
+      most_frequent = mf
+    )
+
+    xx <- x |>
+      dplyr::select(dplyr::all_of(v)) |>
+      dplyr::rename(x = dplyr::all_of(v))
+    # for non-numeric columns, summary statistics are taken over string lengths
+    if (!is_num) xx <- xx |> dplyr::mutate(x = nchar(as.character(x)))
+
+    # not every backend can translate all summary functions (e.g., median or
+    # sd); fall back to NA in that case
+    rr <- try(
+      xx |>
+        dplyr::summarise(
+          min = min(x, na.rm = TRUE),
+          mean = mean(x, na.rm = TRUE),
+          median = median(x, na.rm = TRUE),
+          max = max(x, na.rm = TRUE),
+          sd = sd(x, na.rm = TRUE)
+        ) |>
+        dplyr::collect(),
+      silent = TRUE
+    )
+    if (inherits(rr, "try-error")) {
+      rr <- dplyr::tibble(
+        min = NA_real_, mean = NA_real_, median = NA_real_, max = NA_real_,
+        sd = NA_real_
+      )
+    }
+
+    dplyr::bind_cols(r, rr)
+  })
+
+  dplyr::bind_rows(ll)
+}
+
+# arrow::write_parquet(nycflights13::flights, "flights.parquet")
+# x <- arrow::open_dataset("flights.parquet")
+describe_arrow <- function(x, max_n = 3) {
+  ll <- lapply(names(x), function(v) {
+    mc <- x |>
+      dplyr::count(.data[[v]]) |>
+      dplyr::slice_max(n, n = max_n, with_ties = FALSE) |>
+      dplyr::collect()
+
+    type <- class(mc[[1]])[[1]]
+    is_num <- is_numeric(mc[[1]])
+    mf <- paste(sprintf("%s (%s)", mc[[1]], mc[[2]]), collapse = ", ")
+    nn <- x |>
+      dplyr::distinct(.data[[v]]) |>
+      dplyr::summarise(n = dplyr::n()) |>
+      dplyr::collect()
+    nna <- x |> dplyr::filter(is.na(.data[[v]])) |> dplyr::collect() |> nrow()
+
+    r <- dplyr::tibble(
+      var = v,
+      type = type,
+      n_distinct = nn[[1]],
+      n_na = nna,
+      most_frequent = mf
+    )
+
+    if (is_num) {
+      xx <- x |> dplyr::transmute(x = .data[[v]])
+    } else {
+      # for non-numeric columns, summary statistics are taken over string lengths
+      xx <- x |> dplyr::transmute(x = nchar(as.character(.data[[v]])))
+    }
+
+    # not all summary functions are available in every arrow version; fall
+    # back to NA in that case
+    suppressWarnings({
+      rr <- try(
+        xx |>
+          dplyr::summarise(
+            min = min(x, na.rm = TRUE),
+            mean = mean(x, na.rm = TRUE),
+            median = median(x, na.rm = TRUE),
+            max = max(x, na.rm = TRUE),
+            sd = sd(x, na.rm = TRUE)
+          ) |>
+          dplyr::collect(),
+        silent = TRUE)
+    })
+    if (inherits(rr, "try-error")) {
+      rr <- dplyr::tibble(
+        min = NA_real_, mean = NA_real_, median = NA_real_, max = NA_real_,
+        sd = NA_real_
+      )
+    }
+
+    dplyr::bind_cols(r, rr)
+  })
+
+  dplyr::bind_rows(ll)
+}
diff --git a/README.Rmd b/README.Rmd
index 469b53f..5e8e4f8 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -32,9 +32,10 @@ That way, you can concentrate on writing the rules and making sure that your dat
 The package is lightweight as all the heavy dependencies are Suggests-only, that means if you want to use `data.table` for the task, you don't need to install the other packages (`arrow`, `DBI`, etc) unless you explicitly tell R to install all suggested packages as well when installing the package.
 
-The backend for your analysis is automatically chosen based on the type of input dataset as well as the available packages.
+The backend for your analysis is automatically chosen based on the type of input dataset as well as the available packages (see also `?detect_backend`).
 By using the underlying technologies and handing over all evaluation of code to the backend, this package can deal with all sizes of data the backends can deal with.
 
+The package also has a helper function to describe a dataset, see `?describe`.
 
 ## Installation
@@ -63,6 +64,9 @@ At the moment rules work in a window/vectorized approach only, that means that a
 ```{r example, message=FALSE}
 library(dataverifyr)
 
+# create a dataset
+data <- mtcars
+
 # define a rule set within our R code; alternatively in a yaml file
 rules <- ruleset(
   rule(mpg > 10 & mpg < 30), # mpg goes up to 34
@@ -73,8 +77,11 @@
 # print the rules
 rules
 
+# describe the dataset
+describe(data)
+
 # check if the data matches our rules
-res <- check_data(mtcars, rules)
+res <- check_data(data, rules)
 res
 ```
@@ -206,9 +213,8 @@ if (!file.exists(file)) download.file(url, file, method = "curl")
 file.size(file) / 1e6 # in MB
 
-# quick check of the filesize
-d <- read_parquet(file)
-dim(d)
-names(d)
+# quick look at the dataset
+d <- open_dataset(file)
+describe(d)
 
 # write the dataset to disk
 write_dataset(d, "nyc-taxi-data")
diff --git a/README.md b/README.md
index f84e3c4..daf3cd9 100644
--- a/README.md
+++ b/README.md
@@ -30,10 +30,13 @@ unless you explicitly tell R to install all suggested packages as well when
 installing the package.
 
 The backend for your analysis is automatically chosen based on the type
-of input dataset as well as the available packages. By using the
-underlying technologies and handing over all evaluation of code to the
-backend, this package can deal with all sizes of data the backends can
-deal with.
+of input dataset as well as the available packages (see also
+`?detect_backend`). By using the underlying technologies and
+handing over all evaluation of code to the backend, this package can
+deal with all sizes of data the backends can deal with.
+
+The package also has a helper function to describe a dataset, see
+`?describe`.
 
 ## Installation
@@ -65,6 +68,9 @@ a rule like this will work `mpg > 10 * wt`, whereas a rule like this
 ``` r
 library(dataverifyr)
 
+# create a dataset
+data <- mtcars
+
 # define a rule set within our R code; alternatively in a yaml file
 rules <- ruleset(
   rule(mpg > 10 & mpg < 30), # mpg goes up to 34
@@ -79,13 +85,40 @@ rules
 #> [2] 'Rule for: cyl' matching `cyl %in% c(4, 8)` (allow_na: FALSE)
 #> [3] 'Rule for: vs' matching `vs %in% c(0, 1)` (allow_na: TRUE)
 
+# describe the dataset
+describe(data)
+#>      var    type  n n_distinct n_na                  most_frequent    min        mean  median
+#>  1:  mpg numeric 32         25    0      21 (2), 22.8 (2), 21.4 (2) 10.400  20.090625  19.200
+#>  2:  cyl numeric 32          3    0           8 (14), 4 (11), 6 (7)  4.000   6.187500   6.000
+#>  3: disp numeric 32         27    0     275.8 (3), 160 (2), 360 (2) 71.100 230.721875 196.300
+#>  4:   hp numeric 32         22    0       110 (3), 175 (3), 180 (3) 52.000 146.687500 123.000
+#>  5: drat numeric 32         22    0     3.92 (3), 3.07 (3), 3.9 (2)  2.760   3.596563   3.695
+#>  6:   wt numeric 32         29    0    3.44 (3), 3.57 (2), 2.62 (1)  1.513   3.217250   3.325
+#>  7: qsec numeric 32         30    0  17.02 (2), 18.9 (2), 16.46 (1) 14.500  17.848750  17.710
+#>  8:   vs numeric 32          2    0                  0 (18), 1 (14)  0.000   0.437500   0.000
+#>  9:   am numeric 32          2    0                  0 (19), 1 (13)  0.000   0.406250   0.000
+#> 10: gear numeric 32          3    0           3 (15), 4 (12), 5 (5)  3.000   3.687500   4.000
+#> 11: carb numeric 32          6    0           4 (10), 2 (10), 1 (7)  1.000   2.812500   2.000
+#>         max          sd
+#>  1:  33.900   6.0269481
+#>  2:   8.000   1.7859216
+#>  3: 472.000 123.9386938
+#>  4: 335.000  68.5628685
+#>  5:   4.930   0.5346787
+#>  6:   5.424   0.9784574
+#>  7:  22.900   1.7869432
+#>  8:   1.000   0.5040161
+#>  9:   1.000   0.4989909
+#> 10:   5.000   0.7378041
+#> 11:   8.000   1.6152000
+
 # check if the data matches our rules
-res <- check_data(mtcars, rules)
+res <- check_data(data, rules)
 res
 #>             name                expr allow_na negate tests pass fail warn error              time
-#> 1: Rule for: mpg mpg > 10 & mpg < 30    FALSE  FALSE    32   28    4            0.0055301189 secs
-#> 2: Rule for: cyl    cyl %in% c(4, 8)    FALSE  FALSE    32   25    7            0.0034749508 secs
-#> 3:  Rule for: vs     vs %in% c(0, 1)     TRUE  FALSE    32   32    0            0.0004439354 secs
+#> 1: Rule for: mpg mpg > 10 & mpg < 30    FALSE  FALSE    32   28    4            0.0010831356 secs
+#> 2: Rule for: cyl    cyl %in% c(4, 8)    FALSE  FALSE    32   25    7            0.0033519268 secs
+#> 3:  Rule for: vs     vs %in% c(0, 1)     TRUE  FALSE    32   32    0            0.0005369186 secs
 ```
 
 As we can see, our dataset `mtcars` does not conform to all of our
@@ -482,15 +515,31 @@ file.size(file) / 1e6 # in MB
 #> [1] 123.6685
 
-# quick check of the filesize
-d <- read_parquet(file)
-dim(d)
-#> [1] 8760687      19
-names(d)
-#>  [1] "VendorID"              "tpep_pickup_datetime"  "tpep_dropoff_datetime" "passenger_count"
-#>  [5] "trip_distance"         "RatecodeID"            "store_and_fwd_flag"    "PULocationID"
-#>  [9] "DOLocationID"          "payment_type"          "fare_amount"           "extra"
-#> [13] "mta_tax"               "tip_amount"            "tolls_amount"          "improvement_surcharge"
-#> [17] "total_amount"          "congestion_surcharge"  "airport_fee"
+# quick look at the dataset
+d <- open_dataset(file)
+describe(d)
+#> # A tibble: 19 × 10
+#>    var                  type  n_dis…¹    n_na most_…²     min      mean  median      max       sd
+#>    <chr>                <chr>   <int>   <int> <chr>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
+#>  1 VendorID             inte…       2       0 2 (491…    1      1.5610   2           2    0.49508
+#>  2 tpep_pickup_datetime POSI… 2311532       0 2018-0…   NA     NA       NA          NA   NA
+#>  3 tpep_dropoff_dateti… POSI… 2315089       0 2018-0…   NA     NA       NA          NA   NA
+#>  4 passenger_count      inte…      10       0 1 (624…    0      1.6068   1           9    1.2330
+#>  5 trip_distance        nume…    4397       0 0.8 (2…    0      2.8040   1.5503 189484.   3.2532
+#>  6 RatecodeID           inte…       7       0 1 (853…    1      1.0395   1          99   0.31301
+#>  7 store_and_fwd_flag   char…       2       0 N (872…    1      1        1           1   0
+#>  8 PULocationID         inte…     259       0 237 (3…    1    164.46   161.86      265   66.520
+#>  9 DOLocationID         inte…     261       0 236 (3…    1    162.73   162.10      265   75.411
+#> 10 payment_type         inte…       4       0 1 (610…    1      1.3106   1           4   0.45814
+#> 11 fare_amount          nume…    1714       0 6 (473… -450     12.244    8.8357   8016   9.9255
+#> 12 extra                nume…      42       0 0 (474…  -44.69   0.32469  0          60   0.068786
+#> 13 mta_tax              nume…      15       0 0.5 (8…   -0.5    0.49751  0.5        45.49 0.043389
+#> 14 tip_amount           nume…    3397       0 0 (289…  -88.8    1.8188   1.3968    441.71 2.2823
+#> 15 tolls_amount         nume…     967       0 0 (833…  -15      0.30262  0         950.7  1.1501
+#> 16 improvement_surchar… nume…       4       0 0.3 (8…   -0.3    0.29963  0.3         1    0.018442
+#> 17 total_amount         nume…   11514       0 7.3 (2… -450.3   15.491   11.321     8016.8 11.984
+#> 18 congestion_surcharge nume…       2 8760675 NA (87…    2.5    2.5      2.5         2.5  NA
+#> 19 airport_fee          nume…       2 8760675 NA (87…    0      0        0           0    NA
+#> # … with abbreviated variable names ¹​n_distinct, ²​most_frequent
 
 # write the dataset to disk
 write_dataset(d, "nyc-taxi-data")
@@ -551,9 +600,9 @@ res
 #> # A tibble: 3 × 10
 #>   name                      expr              allow…¹ negate   tests    pass  fail warn  error time 
 #>   <chr>                     <chr>             <lgl>   <lgl>    <int>   <int> <int> <chr> <chr> <drtn>
-#> 1 Rule for: passenger_count passenger_count … FALSE   FALSE  8760687 8760687     0 ""    ""    0.56…
-#> 2 Rule for: trip_distance   trip_distance >=… FALSE   FALSE  8760687 8760686     1 ""    ""    0.43…
-#> 3 Rule for: payment_type    payment_type %in… FALSE   FALSE  8760687 8760687     0 ""    ""    0.42…
+#> 1 Rule for: passenger_count passenger_count … FALSE   FALSE  8760687 8760687     0 ""    ""    0.42…
+#> 2 Rule for: trip_distance   trip_distance >=… FALSE   FALSE  8760687 8760686     1 ""    ""    0.51…
+#> 3 Rule for: payment_type    payment_type %in… FALSE   FALSE  8760687 8760687     0 ""    ""    0.45…
 #> # … with abbreviated variable name ¹​allow_na
 
 plot_res(res)
@@ -611,9 +660,9 @@ res
 #> # A tibble: 3 × 10
 #>   name          expr                allow_na negate tests  pass  fail warn  error time          
 #>   <chr>         <chr>               <lgl>    <lgl>  <int> <int> <int> <chr> <chr> <drtn>        
-#> 1 Rule for: mpg mpg > 10 & mpg < 30 FALSE    FALSE     32    28     4 ""    ""    1.232728 secs 
-#> 2 Rule for: cyl cyl %in% c(4, 8)    FALSE    FALSE     32    25     7 ""    ""    0.2015200 secs
-#> 3 Rule for: vs  vs %in% c(0, 1)     TRUE     FALSE     32    32     0 ""    ""    0.1898661 secs
+#> 1 Rule for: mpg mpg > 10 & mpg < 30 FALSE    FALSE     32    28     4 ""    ""    4.4900761 secs
+#> 2 Rule for: cyl cyl %in% c(4, 8)    FALSE    FALSE     32    25     7 ""    ""    0.1926301 secs
+#> 3 Rule for: vs  vs %in% c(0, 1)     TRUE     FALSE     32    32     0 ""    ""    0.2003391 secs
 
 filter_fails(res, tbl, per_rule = TRUE)
 #> $`mpg > 10 & mpg < 30`
diff --git a/man/describe.Rd b/man/describe.Rd
new file mode 100644
index 0000000..68ba49f
--- /dev/null
+++ b/man/describe.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/describe.R
+\name{describe}
+\alias{describe}
+\title{Describes a dataset}
+\usage{
+describe(x)
+}
+\arguments{
+\item{x}{a dataset, either a \code{\link{data.frame}}, \code{\link[dplyr:reexports]{dplyr::tibble}}, \code{\link[data.table:data.table]{data.table::data.table}},
+\code{\link[arrow:Table]{arrow::arrow_table}}, \code{\link[arrow:open_dataset]{arrow::open_dataset}}, or \code{\link[dplyr:tbl]{dplyr::tbl}} (SQL connection)}
+}
+\value{
+a \code{data.frame}, \code{dplyr::tibble}, or \code{data.table::data.table} containing
+a summary of the given dataset
+}
+\description{
+Note that the current version is in beta at best; describing the R-native
+formats (data.frame, dplyr/tibble, or data.table) is a lot faster than
+describing arrow or SQL-based datasets.
+}
+\examples{
+describe(mtcars)
+}
+\seealso{
+Similar to \href{https://cran.r-project.org/web/packages/skimr/vignettes/skimr.html}{skimr::skim()},
+\href{https://cran.r-project.org/web/packages/summarytools/vignettes/introduction.html#data-frame-summaries-dfsummary}{summarytools::dfSummary()},
+and \href{https://jthomasmock.github.io/gtExtras/reference/gt_plt_summary.html}{gtExtras::gt_plt_summary()}
+}
diff --git a/tests/testthat/test-describe.R b/tests/testthat/test-describe.R
new file mode 100644
index 0000000..fb6f382
--- /dev/null
+++ b/tests/testthat/test-describe.R
@@ -0,0 +1,133 @@
+
+# create a sample dataset to describe
+set.seed(41)
+data <- data.frame(
+  # numeric
+  a = as.numeric(seq(20)),
+  b = rnorm(20),
+  # integer
+  c = as.integer(seq(20)),
+  d = sample.int(20, 10, replace = TRUE),
+  # character
+  e = sample(letters, 20, replace = TRUE),
+  f = sample(c(letters, paste0(letters, letters), paste0(letters, letters, letters)),
+             20, replace = TRUE),
+  # factor
+  g = factor(sample(letters, 20, replace = TRUE)),
+  # posixct
+  h = as.POSIXct(sample.int(365 * 60 * 60 * 24, 10, replace = TRUE), origin = "2020-01-01")
+)
+# sprinkle in some NAs
+for (n in names(data))
+  data[sample.int(nrow(data), 3), n] <- NA
+
+# the expected values
+exp <- data.frame(
+  var = c("a", "b", "c", "d", "e", "f", "g", "h"),
+  type = c("numeric", "numeric", "integer", "integer", "character",
+           "character", "factor", "POSIXct"),
+  n = c(20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L),
+  n_distinct = c(18L, 18L, 18L, 8L, 14L, 15L, 13L, 11L),
+  n_na = c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L),
+  most_frequent = c("NA (3), 1 (1), 2 (1)",
+                    "NA (3), -0.794368337075183 (1), 0.197257539137862 (1)",
+                    "NA (3), 1 (1), 2 (1)", "16 (4), NA (3), 13 (3)",
+                    "u (3), NA (3), r (2)", "NA (3), ggg (3), x (2)",
+                    "g (3), NA (3), b (2)",
+                    "NA (3), 2020-11-16 01:27:37 (2), 2020-08-10 23:06:51 (2)"),
+  min = c(1, -1.57960700723589, 1, 7, 1, 1, 1, 1578213277),
+  mean = c(9.82352941176471, 0.395917134682771, 9.70588235294118,
+           13.4705882352941, 1, 2.11764705882353, 1, 1597372702.64706),
+  median = c(10, 0.493667470676325, 10, 13, 1, 2, 1, 1601657240),
+  max = c(20, 2.27440247098253, 20, 18, 1, 3, 1, 1606208141),
+  sd = c(5.92911559717854, 1.08100890056568, 5.7961701351232,
+         3.6420743927538, 0, 0.857492925712544, 0, 9824874.35812236)
+)
+
+test_that("describe data.frame", {
+  local_mocked_bindings(has_pkg = function(p) p %in% pkgs, .package = "dataverifyr")
+  pkgs <- NULL
+  expect_equal(detect_backend(data), "base-r")
+  d <- describe(data)
+  expect_equal(class(d), "data.frame")
+  expect_equal(d, exp)
+
+  skip_if_not(requireNamespace("dplyr", quietly = TRUE) &&
+                requireNamespace("data.table", quietly = TRUE),
+              "dplyr and data.table must be installed to run these tests")
+
+  # use data.table
+  pkgs <- "data.table"
+  expect_equal(detect_backend(data), "data.table")
+  d <- describe(data)
+  expect_equal(class(d), c("data.table", "data.frame"))
+  expect_equal(as.data.frame(d), exp)
+
+  # use dplyr
+  pkgs <- "dplyr"
+  expect_equal(detect_backend(data), "dplyr")
+  d <- describe(data)
+  expect_equal(class(d), c("tbl_df", "tbl", "data.frame"))
+  expect_equal(as.data.frame(d), exp)
+  # known issue: this can error as the dplyr backend sorts ties in
+  # most_frequent slightly differently
+})
+
+test_that("sqlite", {
+  skip_if_not(requireNamespace("DBI", quietly = TRUE) &&
+                requireNamespace("dplyr", quietly = TRUE) &&
+                requireNamespace("dbplyr", quietly = TRUE) &&
+                requireNamespace("RSQLite", quietly = TRUE),
+              "DBI, dplyr, dbplyr, and RSQLite must be installed to test the functionality")
+
+  con <- DBI::dbConnect(RSQLite::SQLite())
+  DBI::dbWriteTable(con, "data", data)
+  x <- dplyr::tbl(con, "data")
+  expect_equal(detect_backend(x), "collectibles")
+
+  d <- describe(x)
+  # known issue: this takes way too long at the moment
+
+  expect_equal(class(d), c("tbl_df", "tbl", "data.frame"))
+  expect_equal(as.data.frame(d), exp)
+  # known issue: different order and missing last row due to POSIXct
+
+  DBI::dbDisconnect(con)
+})
+
+test_that("duckdb", {
+  skip_if_not(requireNamespace("DBI", quietly = TRUE) &&
+                requireNamespace("dplyr", quietly = TRUE) &&
+                requireNamespace("dbplyr", quietly = TRUE) &&
+                requireNamespace("duckdb", quietly = TRUE),
+              "DBI, dplyr, dbplyr, and duckdb must be installed to test the functionality")
+
+  con <- DBI::dbConnect(duckdb::duckdb())
+  DBI::dbWriteTable(con, "data", data)
+  x <- dplyr::tbl(con, "data")
+  expect_equal(detect_backend(x), "collectibles")
+
+  d <- describe(x)
+  # known issue: this takes way too long at the moment
+
+  expect_equal(class(d), c("tbl_df", "tbl", "data.frame"))
+  expect_equal(as.data.frame(d), exp)
+  # known issue: different order and missing last row due to POSIXct
+
+  DBI::dbDisconnect(con, shutdown = TRUE)
+})
+
+test_that("arrow", {
+  skip_if_not(requireNamespace("dbplyr", quietly = TRUE) &&
+                requireNamespace("arrow", quietly = TRUE),
+              "dbplyr and arrow must be installed to test the functionality")
+
+  tmp <- tempfile()
+  arrow::write_parquet(data, tmp)
+  x <- arrow::open_dataset(tmp)
+  expect_equal(detect_backend(x), "collectibles")
+
+  d <- describe(x)
+  # known issue: this takes way too long at the moment
+
+  expect_equal(class(d), c("tbl_df", "tbl", "data.frame"))
+  expect_equal(as.data.frame(d), exp)
+  # known issue: different order and missing last row due to POSIXct
+})
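
For reviewers who want to exercise the new `collectibles` backend end-to-end, here is a minimal sketch mirroring the setup in `tests/testthat/test-describe.R` (it assumes the DBI, duckdb, dplyr, and dbplyr packages are installed; the table name `mtcars` is arbitrary):

``` r
library(dataverifyr)

# spin up an in-memory DuckDB database and copy a toy dataset into it
con <- DBI::dbConnect(duckdb::duckdb())
DBI::dbWriteTable(con, "mtcars", mtcars)

# a dplyr::tbl() on a DBI connection is detected as the "collectibles"
# backend, so all summary queries are executed by the database itself
x <- dplyr::tbl(con, "mtcars")
detect_backend(x)

# one summary row per column, collected back into a tibble
describe(x)

DBI::dbDisconnect(con, shutdown = TRUE)
```

The same pattern applies to RSQLite connections and to parquet files opened with `arrow::open_dataset()`.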