From febca9e96849df49d1d68ea6bb0bfe533f69530f Mon Sep 17 00:00:00 2001
From: DavZim
Date: Mon, 23 Oct 2023 15:42:54 +0200
Subject: [PATCH] add first draft of describe (#5)

---
 NAMESPACE                      |   1 +
 R/check_data.R                 |   8 +-
 R/describe.R                   | 276 +++++++++++++++++++++++++++++++++
 README.Rmd                     |  16 +-
 README.md                      |  95 +++++++++---
 man/describe.Rd                |  29 ++++
 tests/testthat/test-describe.R | 133 ++++++++++++++++
 7 files changed, 526 insertions(+), 32 deletions(-)
 create mode 100644 R/describe.R
 create mode 100644 man/describe.Rd
 create mode 100644 tests/testthat/test-describe.R

diff --git a/NAMESPACE b/NAMESPACE
index 5580940..bb437b8 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -3,6 +3,7 @@
 S3method(print,rule)
 S3method(print,ruleset)
 export(check_data)
+export(describe)
 export(detect_backend)
 export(filter_fails)
 export(plot_res)
diff --git a/R/check_data.R b/R/check_data.R
index 89dd000..f1b3559 100644
--- a/R/check_data.R
+++ b/R/check_data.R
@@ -103,15 +103,15 @@ detect_backend <- function(x) {
 
   } else if ("tbl_sql" %in% cc) {
 
-    if (!has_pkg("DBI"))
-      stop("The DBI package needs to be installed in order to test a tbl_sql.")
+    if (!has_pkg("DBI") || !has_pkg("dbplyr"))
+      stop("The DBI and dbplyr packages need to be installed in order to test a tbl_sql.")
 
     backend <- "collectibles"
 
   } else if ("ArrowObject" %in% cc) {
 
-    if (!has_pkg("arrow"))
-      stop("The arrow package needs to be installed in order to test an ArrowObject.")
+    if (!has_pkg("arrow") || !has_pkg("dbplyr"))
+      stop("The arrow and dbplyr packages need to be installed in order to test an ArrowObject.")
 
     backend <- "collectibles"
 
diff --git a/R/describe.R b/R/describe.R
new file mode 100644
index 0000000..44ff9e9
--- /dev/null
+++ b/R/describe.R
@@ -0,0 +1,276 @@
+#' Describes a dataset
+#'
+#' Note that the current version is in beta at best; describing the R-native
+#' formats (data.frame, dplyr/tibble, or data.table) is a lot faster than
+#' describing arrow or SQL-based datasets.
+#'
+#' @param x a dataset, either a [`data.frame`], [`dplyr::tibble`], [`data.table::data.table`],
+#' [`arrow::arrow_table`], [`arrow::open_dataset`], or [`dplyr::tbl`] (SQL connection)
+#'
+#' @return a `data.frame`, `dplyr::tibble`, or `data.table::data.table` containing
+#' a summary of the given dataset
+#' @export
+#'
+#' @seealso Similar to [skimr::skim()](https://cran.r-project.org/web/packages/skimr/vignettes/skimr.html),
+#' [summarytools::dfSummary()](https://cran.r-project.org/web/packages/summarytools/vignettes/introduction.html#data-frame-summaries-dfsummary),
+#' and [gtExtras::gt_plt_summary()](https://jthomasmock.github.io/gtExtras/reference/gt_plt_summary.html)
+#'
+#' @examples
+#' describe(mtcars)
+describe <- function(x) {
+
+  backend <- detect_backend(x)
+
+  # make sure the input dataset has the right class for the chosen backend
+  if (class(x)[[1]] == "data.frame") {
+    if (backend == "data.table") {
+      x <- data.table::as.data.table(x)
+    } else if (backend == "dplyr") {
+      x <- dplyr::as_tibble(x)
+    }
+  }
+
+  if (backend == "base-r") {
+    describe_base_r(x)
+  } else if (backend == "dplyr") {
+    describe_dplyr(x)
+  } else if (backend == "data.table") {
+    describe_data.table(x)
+  } else if (backend == "collectibles") {
+    if ("tbl_sql" %in% class(x)) {
+      describe_sql(x)
+    } else if ("ArrowObject" %in% class(x)) {
+      describe_arrow(x)
+    }
+  } else {
+    stop(sprintf("Could not detect backend to describe %s", paste(class(x), collapse = ", ")))
+  }
+}
+
+
+# internal helper to decide which columns get the numeric summary (min/max etc.)
+is_numeric <- function(v) {
+  any(class(v) %in% c("integer", "numeric", "POSIXt"))
+}
+
+# x <- mtcars
+describe_base_r <- function(x, max_n = 3) {
+  ll <- lapply(
+    seq_len(ncol(x)),
+    function(i) {
+      v <- x[[i]]
+      type <- class(v)[[1]]
+      is_num <- is_numeric(v)
+
+      # count the occurrences of each unique value (NA included) ...
+      uv <- unique(v)
+      tab <- tabulate(match(v, uv))
+      # ... and get the indices of the (up to) max_n highest counts
+      od <- order(tab, decreasing = TRUE)[seq_len(min(max_n, length(tab)))]
+
+      # for non-numeric columns, summary statistics are taken over string lengths
+      nz <- if (!is_num) nchar(as.character(v))
+
+      data.frame(
+        var = names(x)[[i]],
+        type = type,
+        n = length(v),
+        n_distinct = length(uv),
+        n_na = sum(is.na(v)),
+        most_frequent = paste(sprintf("%s (%s)", uv[od], tab[od]),
+                              collapse = ", "),
+
+        min = as.numeric(min(if (is_num) v else nz, na.rm = TRUE)),
+        mean = as.numeric(mean(if (is_num) v else nz, na.rm = TRUE)),
+        median = as.numeric(median(if (is_num) v else nz, na.rm = TRUE)),
+        max = as.numeric(max(if (is_num) v else nz, na.rm = TRUE)),
+        sd = as.numeric(sd(if (is_num) v else nz, na.rm = TRUE))
+      )
+    }
+  )
+
+  do.call(rbind, ll)
+}
+
+# x <- mtcars |> tibble::as_tibble()
+describe_dplyr <- function(x, max_n = 3) {
+  ll <- lapply(
+    names(x),
+    function(v) {
+      mc <- x |>
+        dplyr::count(.data[[v]]) |>
+        dplyr::slice_max(n, n = max_n, with_ties = FALSE)
+
+      type <- class(mc[[1]])[[1]]
+      is_num <- is_numeric(mc[[1]])
+      mf <- paste(sprintf("%s (%s)", mc[[1]], mc[[2]]), collapse = ", ")
+
+      # for non-numeric columns, summary statistics are taken over string lengths
+      nz <- if (!is_num) nchar(as.character(x[[v]]))
+      x |>
+        dplyr::summarise(
+          var = v,
+          type = type,
+          n = dplyr::n(),
+          n_distinct = dplyr::n_distinct(.data[[v]]),
+          n_na = sum(is.na(.data[[v]])),
+          most_frequent = mf,
+          min = as.numeric(min(if (is_num) .data[[v]] else nz, na.rm = TRUE)),
+          mean = as.numeric(mean(if (is_num) .data[[v]] else nz, na.rm = TRUE)),
+          median = as.numeric(median(if (is_num) .data[[v]] else nz, na.rm = TRUE)),
+          max = as.numeric(max(if (is_num) .data[[v]] else nz, na.rm = TRUE)),
+          sd = as.numeric(sd(if (is_num) .data[[v]] else nz, na.rm = TRUE))
+        )
+    }
+  )
+
+  dplyr::bind_rows(ll)
+}
+
+# x <- mtcars |> data.table::as.data.table()
+describe_data.table <- function(x, max_n = 3) {
+  ll <- lapply(
+    names(x),
+    function(v) {
+      # head() guards against columns with fewer than max_n distinct values
+      # (indexing with seq(max_n) would produce NA rows there)
+      mc <- utils::head(x[, .(n = .N), by = v][order(n, decreasing = TRUE)], max_n)
+
+      type <- class(mc[[1]])[[1]]
+      is_num <- is_numeric(mc[[1]])
+      mf <- paste(sprintf("%s (%s)", mc[[1]], mc[[2]]), collapse = ", ")
+
+      # for non-numeric columns, summary statistics are taken over string lengths
+      nz <- if (!is_num) nchar(as.character(x[[v]]))
+
+      x[, .(
+        var = v,
+        type = type,
+        n = .N,
+        n_distinct = data.table::uniqueN(get(v)),
+        n_na = sum(is.na(get(v))),
+        most_frequent = mf,
+
+        min = as.numeric(min(if (is_num) get(v) else nz, na.rm = TRUE)),
+        mean = as.numeric(mean(if (is_num) get(v) else nz, na.rm = TRUE)),
+        median = as.numeric(median(if (is_num) get(v) else nz, na.rm = TRUE)),
+        max = as.numeric(max(if (is_num) get(v) else nz, na.rm = TRUE)),
+        sd = as.numeric(sd(if (is_num) get(v) else nz, na.rm = TRUE))
+      )]
+    }
+  )
+
+  data.table::rbindlist(ll)
+}
+
+
+# RSQLite, duckdb, etc.
+describe_sql <- function(x, max_n = 3) {
+  ll <- lapply(names(x), function(v) {
+    mc <- x |>
+      dplyr::count(.data[[v]]) |>
+      dplyr::slice_max(n, n = max_n, with_ties = FALSE) |>
+      dplyr::collect()
+
+    type <- class(mc[[1]])[[1]]
+    is_num <- is_numeric(mc[[1]])
+    mf <- paste(sprintf("%s (%s)", mc[[1]], mc[[2]]), collapse = ", ")
+    nn <- x |>
+      dplyr::distinct(.data[[v]]) |>
+      dplyr::summarise(n = dplyr::n()) |>
+      dplyr::collect()
+    nna <- x |> dplyr::filter(is.na(.data[[v]])) |> dplyr::collect() |> nrow()
+
+    r <- dplyr::tibble(
+      var = v,
+      type = type,
+      n_distinct = nn[[1]],
+      n_na = nna,
+      most_frequent = mf
+    )
+
+    xx <- x |>
+      dplyr::select(dplyr::all_of(v)) |>
+      dplyr::rename(x = dplyr::all_of(v))
+    # for non-numeric columns, summary statistics are taken over string lengths
+    if (!is_num) xx <- xx |> dplyr::mutate(x = nchar(as.character(x)))
+
+    # not every backend can translate all summary functions (e.g., median or
+    # sd); fall back to NA in that case
+    rr <- try(
+      xx |>
+        dplyr::summarise(
+          min = min(x, na.rm = TRUE),
+          mean = mean(x, na.rm = TRUE),
+          median = median(x, na.rm = TRUE),
+          max = max(x, na.rm = TRUE),
+          sd = sd(x, na.rm = TRUE)
+        ) |>
+        dplyr::collect(),
+      silent = TRUE
+    )
+    if (inherits(rr, "try-error")) {
+      rr <- dplyr::tibble(
+        min = NA_real_, mean = NA_real_, median = NA_real_, max = NA_real_,
+        sd = NA_real_
+      )
+    }
+
+    dplyr::bind_cols(r, rr)
+  })
+
+  dplyr::bind_rows(ll)
+}
+
+# arrow::write_parquet(nycflights13::flights, "flights.parquet")
+# x <- arrow::open_dataset("flights.parquet")
+describe_arrow <- function(x, max_n = 3) {
+  ll <- lapply(names(x), function(v) {
+    mc <- x |>
+      dplyr::count(.data[[v]]) |>
+      dplyr::slice_max(n, n = max_n, with_ties = FALSE) |>
+      dplyr::collect()
+
+    type <- class(mc[[1]])[[1]]
+    is_num <- is_numeric(mc[[1]])
+    mf <- paste(sprintf("%s (%s)", mc[[1]], mc[[2]]), collapse = ", ")
+    nn <- x |>
+      dplyr::distinct(.data[[v]]) |>
+      dplyr::summarise(n = dplyr::n()) |>
+      dplyr::collect()
+    nna <- x |> dplyr::filter(is.na(.data[[v]])) |> dplyr::collect() |> nrow()
+
+    r <- dplyr::tibble(
+      var = v,
+      type = type,
+      n_distinct = nn[[1]],
+      n_na = nna,
+      most_frequent = mf
+    )
+
+    if (is_num) {
+      xx <- x |> dplyr::transmute(x = .data[[v]])
+    } else {
+      # for non-numeric columns, summary statistics are taken over string lengths
+      xx <- x |> dplyr::transmute(x = nchar(as.character(.data[[v]])))
+    }
+
+    # not all summary functions are available in every arrow version; fall
+    # back to NA in that case
+    suppressWarnings({
+      rr <- try(
+        xx |>
+          dplyr::summarise(
+            min = min(x, na.rm = TRUE),
+            mean = mean(x, na.rm = TRUE),
+            median = median(x, na.rm = TRUE),
+            max = max(x, na.rm = TRUE),
+            sd = sd(x, na.rm = TRUE)
+          ) |>
+          dplyr::collect(),
+        silent = TRUE)
+    })
+    if (inherits(rr, "try-error")) {
+      rr <- dplyr::tibble(
+        min = NA_real_, mean = NA_real_, median = NA_real_, max = NA_real_,
+        sd = NA_real_
+      )
+    }
+
+    dplyr::bind_cols(r, rr)
+  })
+
+  dplyr::bind_rows(ll)
+}
diff --git a/README.Rmd b/README.Rmd
index 469b53f..5e8e4f8 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -32,9 +32,10 @@ That way, you can concentrate on writing the rules and making sure that your dat
 The package is lightweight as all the heavy dependencies are Suggests-only, that means if you want to use `data.table` for the task, you don't need to install the other packages (`arrow`, `DBI`, etc) unless you explicitly tell R to install all suggested packages as well when installing the package.
 
-The backend for your analysis is automatically chosen based on the type of input dataset as well as the available packages.
+The backend for your analysis is automatically chosen based on the type of input dataset as well as the available packages (see also `?detect_backend`).
 By using the underlying technologies and handing over all evaluation of code to the backend, this package can deal with all sizes of data the backends can deal with.
 
+The package also has a helper function to describe a dataset, see `?describe`.
 
 ## Installation
@@ -63,6 +64,9 @@ At the moment rules work in a window/vectorized approach only, that means that a
 ```{r example, message=FALSE}
 library(dataverifyr)
 
+# create a dataset
+data <- mtcars
+
 # define a rule set within our R code; alternatively in a yaml file
 rules <- ruleset(
   rule(mpg > 10 & mpg < 30), # mpg goes up to 34
@@ -73,8 +77,11 @@
 # print the rules
 rules
 
+# describe the dataset
+describe(data)
+
 # check if the data matches our rules
-res <- check_data(mtcars, rules)
+res <- check_data(data, rules)
 res
 ```
@@ -206,9 +213,8 @@ if (!file.exists(file)) download.file(url, file, method = "curl")
 file.size(file) / 1e6 # in MB
 
-# quick check of the filesize
-d <- read_parquet(file)
-dim(d)
-names(d)
+# quick look at the dataset
+d <- open_dataset(file)
+describe(d)
 
 # write the dataset to disk
 write_dataset(d, "nyc-taxi-data")
diff --git a/README.md b/README.md
index f84e3c4..daf3cd9 100644
--- a/README.md
+++ b/README.md
@@ -30,10 +30,13 @@ unless you explicitly tell R to install all suggested packages as well when
 installing the package.
 
 The backend for your analysis is automatically chosen based on the type
-of input dataset as well as the available packages. By using the
-underlying technologies and handing over all evaluation of code to the
-backend, this package can deal with all sizes of data the backends can
-deal with.
+of input dataset as well as the available packages (see also
+`?detect_backend`). By using the underlying technologies and
+handing over all evaluation of code to the backend, this package can
+deal with all sizes of data the backends can deal with.
+
+The package also has a helper function to describe a dataset, see
+`?describe`.
 
 ## Installation
@@ -65,6 +68,9 @@ a rule like this will work `mpg > 10 * wt`, whereas a rule like this
 ``` r
 library(dataverifyr)
 
+# create a dataset
+data <- mtcars
+
 # define a rule set within our R code; alternatively in a yaml file
 rules <- ruleset(
   rule(mpg > 10 & mpg < 30), # mpg goes up to 34
@@ -79,13 +85,40 @@ rules
 #> [2] 'Rule for: cyl' matching `cyl %in% c(4, 8)` (allow_na: FALSE)
 #> [3] 'Rule for: vs' matching `vs %in% c(0, 1)` (allow_na: TRUE)
 
+# describe the dataset
+describe(data)
+#>      var    type  n n_distinct n_na                  most_frequent    min        mean  median
+#>  1:  mpg numeric 32         25    0      21 (2), 22.8 (2), 21.4 (2) 10.400  20.090625  19.200
+#>  2:  cyl numeric 32          3    0           8 (14), 4 (11), 6 (7)  4.000   6.187500   6.000
+#>  3: disp numeric 32         27    0     275.8 (3), 160 (2), 360 (2) 71.100 230.721875 196.300
+#>  4:   hp numeric 32         22    0       110 (3), 175 (3), 180 (3) 52.000 146.687500 123.000
+#>  5: drat numeric 32         22    0     3.92 (3), 3.07 (3), 3.9 (2)  2.760   3.596563   3.695
+#>  6:   wt numeric 32         29    0    3.44 (3), 3.57 (2), 2.62 (1)  1.513   3.217250   3.325
+#>  7: qsec numeric 32         30    0  17.02 (2), 18.9 (2), 16.46 (1) 14.500  17.848750  17.710
+#>  8:   vs numeric 32          2    0                  0 (18), 1 (14)  0.000   0.437500   0.000
+#>  9:   am numeric 32          2    0                  0 (19), 1 (13)  0.000   0.406250   0.000
+#> 10: gear numeric 32          3    0           3 (15), 4 (12), 5 (5)  3.000   3.687500   4.000
+#> 11: carb numeric 32          6    0           4 (10), 2 (10), 1 (7)  1.000   2.812500   2.000
+#>         max          sd
+#>  1:  33.900   6.0269481
+#>  2:   8.000   1.7859216
+#>  3: 472.000 123.9386938
+#>  4: 335.000  68.5628685
+#>  5:   4.930   0.5346787
+#>  6:   5.424   0.9784574
+#>  7:  22.900   1.7869432
+#>  8:   1.000   0.5040161
+#>  9:   1.000   0.4989909
+#> 10:   5.000   0.7378041
+#> 11:   8.000   1.6152000
+
 # check if the data matches our rules
-res <- check_data(mtcars, rules)
+res <- check_data(data, rules)
 res
 #>             name                expr allow_na negate tests pass fail warn error              time
-#> 1: Rule for: mpg mpg > 10 & mpg < 30    FALSE  FALSE    32   28    4            0.0055301189 secs
-#> 2: Rule for: cyl    cyl %in% c(4, 8)    FALSE  FALSE    32   25    7            0.0034749508 secs
-#> 3:  Rule for: vs     vs %in% c(0, 1)     TRUE  FALSE    32   32    0            0.0004439354 secs
+#> 1: Rule for: mpg mpg > 10 & mpg < 30    FALSE  FALSE    32   28    4            0.0010831356 secs
+#> 2: Rule for: cyl    cyl %in% c(4, 8)    FALSE  FALSE    32   25    7            0.0033519268 secs
+#> 3:  Rule for: vs     vs %in% c(0, 1)     TRUE  FALSE    32   32    0            0.0005369186 secs
 ```
 
 As we can see, our dataset `mtcars` does not conform to all of our
@@ -482,15 +515,31 @@ file.size(file) / 1e6 # in MB
 #> [1] 123.6685
 
-# quick check of the filesize
-d <- read_parquet(file)
-dim(d)
-#> [1] 8760687      19
-names(d)
-#>  [1] "VendorID"              "tpep_pickup_datetime"  "tpep_dropoff_datetime" "passenger_count"
-#>  [5] "trip_distance"         "RatecodeID"            "store_and_fwd_flag"    "PULocationID"
-#>  [9] "DOLocationID"          "payment_type"          "fare_amount"           "extra"
-#> [13] "mta_tax"               "tip_amount"            "tolls_amount"          "improvement_surcharge"
-#> [17] "total_amount"          "congestion_surcharge"  "airport_fee"
+# quick look at the dataset
+d <- open_dataset(file)
+describe(d)
+#> # A tibble: 19 × 10
+#>    var                  type  n_dis…¹    n_na most_…²     min      mean  median      max       sd
+#>    <chr>                <chr>   <int>   <int> <chr>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
+#>  1 VendorID             inte…       2       0 2 (491…    1      1.5610   2           2    0.49508
+#>  2 tpep_pickup_datetime POSI… 2311532       0 2018-0…   NA     NA       NA          NA   NA
+#>  3 tpep_dropoff_dateti… POSI… 2315089       0 2018-0…   NA     NA       NA          NA   NA
+#>  4 passenger_count      inte…      10       0 1 (624…    0      1.6068   1           9    1.2330
+#>  5 trip_distance        nume…    4397       0 0.8 (2…    0      2.8040   1.5503 189484.   3.2532
+#>  6 RatecodeID           inte…       7       0 1 (853…    1      1.0395   1          99   0.31301
+#>  7 store_and_fwd_flag   char…       2       0 N (872…    1      1        1           1   0
+#>  8 PULocationID         inte…     259       0 237 (3…    1    164.46   161.86      265   66.520
+#>  9 DOLocationID         inte…     261       0 236 (3…    1    162.73   162.10      265   75.411
+#> 10 payment_type         inte…       4       0 1 (610…    1      1.3106   1           4   0.45814
+#> 11 fare_amount          nume…    1714       0 6 (473… -450     12.244    8.8357   8016   9.9255
+#> 12 extra                nume…      42       0 0 (474…  -44.69   0.32469  0          60   0.068786
+#> 13 mta_tax              nume…      15       0 0.5 (8…   -0.5    0.49751  0.5        45.49 0.043389
+#> 14 tip_amount           nume…    3397       0 0 (289…  -88.8    1.8188   1.3968    441.71 2.2823
+#> 15 tolls_amount         nume…     967       0 0 (833…  -15      0.30262  0         950.7  1.1501
+#> 16 improvement_surchar… nume…       4       0 0.3 (8…   -0.3    0.29963  0.3         1    0.018442
+#> 17 total_amount         nume…   11514       0 7.3 (2… -450.3   15.491   11.321     8016.8 11.984
+#> 18 congestion_surcharge nume…       2 8760675 NA (87…    2.5    2.5      2.5         2.5  NA
+#> 19 airport_fee          nume…       2 8760675 NA (87…    0      0        0           0    NA
+#> # … with abbreviated variable names ¹​n_distinct, ²​most_frequent
 
 # write the dataset to disk
 write_dataset(d, "nyc-taxi-data")
@@ -551,9 +600,9 @@ res
 #> # A tibble: 3 × 10
 #>   name                      expr              allow…¹ negate   tests    pass  fail warn  error time 
 #>   <chr>                     <chr>             <lgl>   <lgl>    <int>   <int> <int> <chr> <chr> <drtn>
-#> 1 Rule for: passenger_count passenger_count … FALSE   FALSE  8760687 8760687     0 ""    ""    0.56…
-#> 2 Rule for: trip_distance   trip_distance >=… FALSE   FALSE  8760687 8760686     1 ""    ""    0.43…
-#> 3 Rule for: payment_type    payment_type %in… FALSE   FALSE  8760687 8760687     0 ""    ""    0.42…
+#> 1 Rule for: passenger_count passenger_count … FALSE   FALSE  8760687 8760687     0 ""    ""    0.42…
+#> 2 Rule for: trip_distance   trip_distance >=… FALSE   FALSE  8760687 8760686     1 ""    ""    0.51…
+#> 3 Rule for: payment_type    payment_type %in… FALSE   FALSE  8760687 8760687     0 ""    ""    0.45…
 #> # … with abbreviated variable name ¹​allow_na
 
 plot_res(res)
@@ -611,9 +660,9 @@ res
 #> # A tibble: 3 × 10
 #>   name          expr                allow_na negate tests  pass  fail warn  error time          
 #>   <chr>         <chr>               <lgl>    <lgl>  <int> <int> <int> <chr> <chr> <drtn>        
-#> 1 Rule for: mpg mpg > 10 & mpg < 30 FALSE    FALSE     32    28     4 ""    ""    1.232728 secs 
-#> 2 Rule for: cyl cyl %in% c(4, 8)    FALSE    FALSE     32    25     7 ""    ""    0.2015200 secs
-#> 3 Rule for: vs  vs %in% c(0, 1)     TRUE     FALSE     32    32     0 ""    ""    0.1898661 secs
+#> 1 Rule for: mpg mpg > 10 & mpg < 30 FALSE    FALSE     32    28     4 ""    ""    4.4900761 secs
+#> 2 Rule for: cyl cyl %in% c(4, 8)    FALSE    FALSE     32    25     7 ""    ""    0.1926301 secs
+#> 3 Rule for: vs  vs %in% c(0, 1)     TRUE     FALSE     32    32     0 ""    ""    0.2003391 secs
 
 filter_fails(res, tbl, per_rule = TRUE)
 #> $`mpg > 10 & mpg < 30`
diff --git a/man/describe.Rd b/man/describe.Rd
new file mode 100644
index 0000000..68ba49f
--- /dev/null
+++ b/man/describe.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/describe.R
+\name{describe}
+\alias{describe}
+\title{Describes a dataset}
+\usage{
+describe(x)
+}
+\arguments{
+\item{x}{a dataset, either a \code{\link{data.frame}}, \code{\link[dplyr:reexports]{dplyr::tibble}}, \code{\link[data.table:data.table]{data.table::data.table}},
+\code{\link[arrow:Table]{arrow::arrow_table}}, \code{\link[arrow:open_dataset]{arrow::open_dataset}}, or \code{\link[dplyr:tbl]{dplyr::tbl}} (SQL connection)}
+}
+\value{
+a \code{data.frame}, \code{dplyr::tibble}, or \code{data.table::data.table} containing
+a summary of the given dataset
+}
+\description{
+Note that the current version is in beta at best; describing the R-native
+formats (data.frame, dplyr/tibble, or data.table) is a lot faster than
+describing arrow or SQL-based datasets.
+}
+\examples{
+describe(mtcars)
+}
+\seealso{
+Similar to \href{https://cran.r-project.org/web/packages/skimr/vignettes/skimr.html}{skimr::skim()},
+\href{https://cran.r-project.org/web/packages/summarytools/vignettes/introduction.html#data-frame-summaries-dfsummary}{summarytools::dfSummary()},
+and \href{https://jthomasmock.github.io/gtExtras/reference/gt_plt_summary.html}{gtExtras::gt_plt_summary()}
+}
diff --git a/tests/testthat/test-describe.R b/tests/testthat/test-describe.R
new file mode 100644
index 0000000..fb6f382
--- /dev/null
+++ b/tests/testthat/test-describe.R
@@ -0,0 +1,133 @@
+
+# create a sample dataset to describe
+set.seed(41)
+data <- data.frame(
+  # numeric
+  a = as.numeric(seq(20)),
+  b = rnorm(20),
+  # integer
+  c = as.integer(seq(20)),
+  d = sample.int(20, 10, replace = TRUE),
+  # character
+  e = sample(letters, 20, replace = TRUE),
+  f = sample(c(letters, paste0(letters, letters), paste0(letters, letters, letters)),
+             20, replace = TRUE),
+  # factor
+  g = factor(sample(letters, 20, replace = TRUE)),
+  # posixct
+  h = as.POSIXct(sample.int(365 * 60 * 60 * 24, 10, replace = TRUE), origin = "2020-01-01")
+)
+# sprinkle in some NAs
+for (n in names(data))
+  data[sample.int(nrow(data), 3), n] <- NA
+
+# the expected values
+exp <- data.frame(
+  var = c("a", "b", "c", "d", "e", "f", "g", "h"),
+  type = c("numeric", "numeric", "integer", "integer", "character",
+           "character", "factor", "POSIXct"),
+  n = c(20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L),
+  n_distinct = c(18L, 18L, 18L, 8L, 14L, 15L, 13L, 11L),
+  n_na = c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L),
+  most_frequent = c("NA (3), 1 (1), 2 (1)",
+                    "NA (3), -0.794368337075183 (1), 0.197257539137862 (1)",
+                    "NA (3), 1 (1), 2 (1)", "16 (4), NA (3), 13 (3)",
+                    "u (3), NA (3), r (2)", "NA (3), ggg (3), x (2)",
+                    "g (3), NA (3), b (2)",
+                    "NA (3), 2020-11-16 01:27:37 (2), 2020-08-10 23:06:51 (2)"),
+  min = c(1, -1.57960700723589, 1, 7, 1, 1, 1, 1578213277),
+  mean = c(9.82352941176471, 0.395917134682771, 9.70588235294118,
+           13.4705882352941, 1, 2.11764705882353, 1, 1597372702.64706),
+  median = c(10, 0.493667470676325, 10, 13, 1, 2, 1, 1601657240),
+  max = c(20, 2.27440247098253, 20, 18, 1, 3, 1, 1606208141),
+  sd = c(5.92911559717854, 1.08100890056568, 5.7961701351232,
+         3.6420743927538, 0, 0.857492925712544, 0, 9824874.35812236)
+)
+
+test_that("describe data.frame", {
+  local_mocked_bindings(has_pkg = function(p) p %in% pkgs, .package = "dataverifyr")
+  pkgs <- NULL
+  expect_equal(detect_backend(data), "base-r")
+  d <- describe(data)
+  expect_equal(class(d), "data.frame")
+  expect_equal(d, exp)
+
+  skip_if_not(requireNamespace("dplyr", quietly = TRUE) &&
+                requireNamespace("data.table", quietly = TRUE),
+              "dplyr and data.table must be installed to run these tests")
+
+  # use data.table
+  pkgs <- "data.table"
+  expect_equal(detect_backend(data), "data.table")
+  d <- describe(data)
+  expect_equal(class(d), c("data.table", "data.frame"))
+  expect_equal(as.data.frame(d), exp)
+
+  # use dplyr
+  pkgs <- "dplyr"
+  expect_equal(detect_backend(data), "dplyr")
+  d <- describe(data)
+  expect_equal(class(d), c("tbl_df", "tbl", "data.frame"))
+  expect_equal(as.data.frame(d), exp)
+  # known issue: this can error as the dplyr backend sorts ties in
+  # most_frequent slightly differently
+})
+
+test_that("sqlite", {
+  skip_if_not(requireNamespace("DBI", quietly = TRUE) &&
+                requireNamespace("dplyr", quietly = TRUE) &&
+                requireNamespace("dbplyr", quietly = TRUE) &&
+                requireNamespace("RSQLite", quietly = TRUE),
+              "DBI, dplyr, dbplyr, and RSQLite must be installed to test the functionality")
+
+  con <- DBI::dbConnect(RSQLite::SQLite())
+  DBI::dbWriteTable(con, "data", data)
+  x <- dplyr::tbl(con, "data")
+  expect_equal(detect_backend(x), "collectibles")
+
+  d <- describe(x)
+  # known issue: this takes way too long at the moment
+
+  expect_equal(class(d), c("tbl_df", "tbl", "data.frame"))
+  expect_equal(as.data.frame(d), exp)
+  # known issue: different order and missing last row due to POSIXct
+
+  DBI::dbDisconnect(con)
+})
+
+test_that("duckdb", {
+  skip_if_not(requireNamespace("DBI", quietly = TRUE) &&
+                requireNamespace("dplyr", quietly = TRUE) &&
+                requireNamespace("dbplyr", quietly = TRUE) &&
+                requireNamespace("duckdb", quietly = TRUE),
+              "DBI, dplyr, dbplyr, and duckdb must be installed to test the functionality")
+
+  con <- DBI::dbConnect(duckdb::duckdb())
+  DBI::dbWriteTable(con, "data", data)
+  x <- dplyr::tbl(con, "data")
+  expect_equal(detect_backend(x), "collectibles")
+
+  d <- describe(x)
+  # known issue: this takes way too long at the moment
+
+  expect_equal(class(d), c("tbl_df", "tbl", "data.frame"))
+  expect_equal(as.data.frame(d), exp)
+  # known issue: different order and missing last row due to POSIXct
+
+  DBI::dbDisconnect(con, shutdown = TRUE)
+})
+
+test_that("arrow", {
+  skip_if_not(requireNamespace("dbplyr", quietly = TRUE) &&
+                requireNamespace("arrow", quietly = TRUE),
+              "dbplyr and arrow must be installed to test the functionality")
+
+  tmp <- tempfile()
+  arrow::write_parquet(data, tmp)
+  x <- arrow::open_dataset(tmp)
+  expect_equal(detect_backend(x), "collectibles")
+
+  d <- describe(x)
+  # known issue: this takes way too long at the moment
+
+  expect_equal(class(d), c("tbl_df", "tbl", "data.frame"))
+  expect_equal(as.data.frame(d), exp)
+  # known issue: different order and missing last row due to POSIXct
+})
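
For reviewers who want to exercise the new `collectibles` backend end-to-end, here is a minimal sketch mirroring the setup in `tests/testthat/test-describe.R` (it assumes the DBI, duckdb, dplyr, and dbplyr packages are installed; the table name `mtcars` is arbitrary):

``` r
library(dataverifyr)

# spin up an in-memory DuckDB database and copy a toy dataset into it
con <- DBI::dbConnect(duckdb::duckdb())
DBI::dbWriteTable(con, "mtcars", mtcars)

# a dplyr::tbl() on a DBI connection is detected as the "collectibles"
# backend, so all summary queries are executed by the database itself
x <- dplyr::tbl(con, "mtcars")
detect_backend(x)

# one summary row per column, collected back into a tibble
describe(x)

DBI::dbDisconnect(con, shutdown = TRUE)
```

The same pattern applies to RSQLite connections and to parquet files opened with `arrow::open_dataset()`.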