From 601c5f344a5684ac3e483e339b7666662e42b13a Mon Sep 17 00:00:00 2001 From: "Matthijs S. Berends" Date: Mon, 1 Jun 2020 15:43:34 +0200 Subject: [PATCH] v1.5.0 --- DESCRIPTION | 4 +- NEWS.md | 3 +- R/na_replace.R | 2 +- README.md | 43 ++++++++++++++++++++ docs/404.html | 4 +- docs/authors.html | 4 +- docs/index.html | 61 +++++++++++++++++++++++----- docs/news/index.html | 56 ++++++++++++------------- docs/pkgdown.yml | 6 +-- docs/reference/clean.html | 4 +- docs/reference/currency.html | 4 +- docs/reference/format_datetime.html | 10 ++--- docs/reference/format_names.html | 28 ++++++------- docs/reference/freq.html | 6 +-- docs/reference/index.html | 4 +- docs/reference/na_replace.html | 8 ++-- docs/reference/percentage.html | 4 +- docs/reference/rdate.html | 4 +- docs/reference/regex_true_false.html | 4 +- docs/reference/unclean.html | 4 +- 20 files changed, 174 insertions(+), 89 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9946a4d..1408419 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: cleaner Title: Fast and Easy Data Cleaning -Version: 1.4.0.9000 -Date: 2020-05-28 +Version: 1.5.0 +Date: 2020-06-01 Authors@R: person( given = c("Matthijs", "S."), diff --git a/NEWS.md b/NEWS.md index 19c7bd6..efb24ce 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# cleaner 1.4.0.9000 +# cleaner 1.5.0 * New function `format_names()` to quickly and easily change names of `data.frame` columns, `list`s or `character` vectors. ```r @@ -26,6 +26,7 @@ starwars %>% na_replace() # replace NAs in all columns ("" for hair_color and 0 for birth_year) ``` +* Support for the upcoming R 4.1.0 # cleaner 1.4.0 diff --git a/R/na_replace.R b/R/na_replace.R index 3c13dcf..746f519 100644 --- a/R/na_replace.R +++ b/R/na_replace.R @@ -138,7 +138,7 @@ na_replace.data.frame <- function(x, ..., replacement = NULL) { if (vctr_colname %in% colnames(attrbt$groups)) { attrbt$groups[which(is.na(attrbt$groups[, vctr_colname, drop = TRUE])), vctr_colname] <- replace_val # groups are always ordered on alphabet, so order it again with the new replacement value - attrbt$groups <- attrbt$groups[order(attrbt$groups[, vctr_colname]),] + attrbt$groups <- attrbt$groups[order(attrbt$groups[, vctr_colname]), ] } } } diff --git a/README.md b/README.md index 45d9d93..ff30e55 100644 --- a/README.md +++ b/README.md @@ -228,6 +228,49 @@ Use `clean()` to clean data. It guesses what kind of data class would best fit y #> 2 31.40 ``` +#### Other cleaning + +* Use `format_names()` to quickly and easily change names of `data.frame` columns, `list`s or `character` vectors. + ```r + format_names(df, snake_case = TRUE) + format_names(df, c(old.name = "new_name", value = "measurement")) + + library(dplyr) + starwars %>% + format_names(camelCase = TRUE) %>% # changes column names + mutate(name = name %>% + format_names(snake_case = TRUE)) # changes values in column + ``` + +* Use the generic function `na_replace()` to replace `NA` values in any data type. Its default replacement value is dependent on the data type that is given as input: `0` for numeric values and class `matrix`, `FALSE` for class `logical`, today for class `Date`, and `""` otherwise. + + ```r + na_replace(c(1, 2, NA, NA)) + #> [1] 1 2 0 0 + na_replace(c(1, 2, NA, NA), replacement = -1) + #> [1] 1 2 -1 -1 + na_replace(c(1, 2, NA, NA), replacement = c(0, -1)) + #> [1] 1 2 0 -1 + + na_replace(c("a", "b", NA, NA)) + #> [1] "a" "b" "" "" + ``` + + It also supports replacing `NA`s in complete data sets and supports grouped variables used by the `dplyr` package: + + ```r + library(dplyr) + starwars %>% + na_replace(hair_color) # only replace NAs in this column + + starwars %>% + na_replace() # replace NAs in all columns ("" for hair_color and 0 for birth_year) + + starwars %>% + group_by(hair_color) %>% + na_replace(hair_color, replacement = "TEST!") %>% + summarise(n = n()) + ``` ### Checking diff --git a/docs/404.html b/docs/404.html index 38d8aad..c11210d 100644 --- a/docs/404.html +++ b/docs/404.html @@ -73,7 +73,7 @@ cleaner - 1.4.0.9000 + 1.5.0 @@ -135,7 +135,7 @@

Contents

-

Site built with pkgdown 1.5.0.

+

Site built with pkgdown 1.5.1.

diff --git a/docs/authors.html b/docs/authors.html index f24d1db..8ecfa25 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -73,7 +73,7 @@ cleaner - 1.4.0.9000 + 1.5.0 @@ -134,7 +134,7 @@

Authors

-

Site built with pkgdown 1.5.0.

+

Site built with pkgdown 1.5.1.

diff --git a/docs/index.html b/docs/index.html index a8ecc8a..fc2f7a8 100644 --- a/docs/index.html +++ b/docs/index.html @@ -36,7 +36,7 @@ cleaner - 1.4.0.9000 + 1.5.0 @@ -116,7 +116,7 @@

Cleaning

-

Use clean() to clean data. It guesses what kind of data class would best fit your input data. It calls any of the following functions, that can also be used independently. They always return the class from the function name (e.g. clean_Date() always returns class Date).

+

Use clean() to clean data. It guesses what kind of data class would best fit your input data. It calls any of the following functions, that can also be used independently. They always return the class from the function name (e.g. clean_Date() always returns class Date).

  • clean_logical() for values TRUE/FALSE. You only define what should be TRUE or FALSE and it handles the rest for you. At default, it supports “Yes” and “No” in the following languages: Arabic, Bengali, Chinese (Mandarin), Dutch, English, French, German, Hindi, Indonesian, Japanese, Malay, Portuguese, Russian, Spanish, Telugu, Turkish and Urdu. This covers at least two-third of the world population (Ulrich Ammon et al., University of Düsseldorf).

    @@ -263,13 +263,54 @@

    #> 2 31.40

+
+

+Other cleaning

+
    +
  • +

    Use format_names() to quickly and easily change names of data.frame columns, lists or character vectors.

    +
    format_names(df, snake_case = TRUE)
    +format_names(df, c(old.name = "new_name", value = "measurement"))
    +
    +library(dplyr)
    +starwars %>%
    +  format_names(camelCase = TRUE) %>% # changes column names
    +  mutate(name = name %>%
    +           format_names(snake_case = TRUE)) # changes values in column
    +
  • +
  • +

    Use the generic function na_replace() to replace NA values in any data type. Its default replacement value is dependent on the data type that is given as input: 0 for numeric values and class matrix, FALSE for class logical, today for class Date, and "" otherwise.

    +
    na_replace(c(1, 2, NA, NA))
    +#> [1] 1 2 0 0
    +na_replace(c(1, 2, NA, NA), replacement = -1)
    +#> [1]  1  2 -1 -1
    +na_replace(c(1, 2, NA, NA), replacement = c(0, -1))
    +#> [1]  1  2  0 -1
    +
    +na_replace(c("a", "b", NA, NA))
    +#> [1] "a" "b" ""  ""
    +

    It also supports replacing NAs in complete data sets and supports grouped variables used by the dplyr package:

    +
    library(dplyr)
    +starwars %>%
    +  na_replace(hair_color) # only replace NAs in this column
    +
    +starwars %>%
    +  na_replace() # replace NAs in all columns ("" for hair_color and 0 for birth_year)
    +
    +starwars %>%
    +  group_by(hair_color) %>%
    +  na_replace(hair_color, replacement = "TEST!") %>%
    +  summarise(n = n())
    +
  • +
+

Checking

Any idea why in R as.numeric() and is.numeric() and as.Date() exist, but is.Date() doesn’t? Me neither, but now it does. And you probably know runif() to create random numeric values. Now rdate() exists as well, for generating random dates.

The easiest and most comprehensive way to check the data of a column/variable is to create frequency tables. Use freq() to do this. It supports a lot of different classes (types of data) and is even extendible by other packages.

-
freq(unclean$gender)
+
freq(unclean$gender)
 #> Frequency table 
 #> 
 #> Class:      character
@@ -288,7 +329,7 @@ 

#> 4 m 15 3.0% 497 99.4% #> 5 F 3 0.6% 500 100.0%

Clean it and check again:

-
freq(clean_factor(unclean$gender,
+
freq(clean_factor(unclean$gender,
                   levels = c("^m" = "Male", "^f" = "Female")))
 #> Frequency table 
 #> 
@@ -303,12 +344,12 @@ 

#> 1 Male 277 55.4% 277 55.4% #> 2 Female 223 44.6% 500 100.0%

This could also have been done with dplyr syntax, since freq() supports tidy evaluation:

-
unclean %>%
+
unclean %>%
   freq(clean_factor(gender,
                     levels = c("^m" = "Male", "^f" = "Female")))
 # or:
 unclean %>%
-  pull(gender) %>%
+  pull(gender) %>%
   clean_factor(c("^m" = "Male", "^f" = "Female")) %>%
   freq()
@@ -317,9 +358,9 @@

Speed

The cleaning functions are tremendously fast, because they rely on R’s own internal C++ libraries:

-
# Create a vector with 500,000 items
+
# Create a vector with 500,000 items
 n <- 500000
-values <- paste0(sample(c("yes", "no"), n, replace = TRUE),
+values <- paste0(sample(c("yes", "no"), n, replace = TRUE),
                  as.integer(runif(n, 0, 10000)))
 
 # data looks like:
@@ -352,7 +393,7 @@ 

Invalid regular expressions

If invalid regular expressions are used, the cleaning functions will not throw errors, but instead will show a warning and will interpret the expression as a fixed value:

-
clean_character("0123test 0123[a-b] ")
+
clean_character("0123test 0123[a-b] ")
 #> [1] "test ab"
 
 clean_character("0123test 0123[a-b] ", remove = "[a-b]")
@@ -401,7 +442,7 @@ 

Developers

-

Site built with pkgdown 1.5.0.

+

Site built with pkgdown 1.5.1.

diff --git a/docs/news/index.html b/docs/news/index.html index 74b4593..af104a2 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -73,7 +73,7 @@ cleaner - 1.4.0.9000 + 1.5.0
@@ -116,9 +116,9 @@

Changelog

Source: NEWS.md
-
-

-cleaner 1.4.0.9000 Unreleased +
+

+cleaner 1.5.0 Unreleased

  • @@ -146,22 +146,22 @@

    starwars %>% na_replace() # replace NAs in all columns ("" for hair_color and 0 for birth_year)

+
  • Support for the upcoming R 4.1.0

  • -
    +

    -cleaner 1.4.0 2020-04-01 +cleaner 1.4.0 2020-04-01

      -
    • New function rdate() to generate random dates (in analogy to e.g. runif())
    • -
    • Frequency tables (freq()): +
    • New function rdate() to generate random dates (in analogy to e.g. runif())

    • +
    • +

      Frequency tables (freq()):

        -
      • Added availability of data to header
      • -
      • Fix for using na.rm -
      • -
      • Fix for transforming to a visual histogram with hist() -
      • -
      • New method for using format() on a frequency table
      • +
      • Added availability of data to header

      • +
      • Fix for using na.rm

      • +
      • Fix for transforming to a visual histogram with hist()

      • +
      • New method for using format() on a frequency table

      • New method for transforming the values of a frequency table to a vector with as.vector(), which also supports dates

        library(dplyr)
        @@ -172,8 +172,8 @@ 

    • -
    • Fix for clean_Date() not accepting already POSIX or Date input
    • -
    • When using clean_Date(..., guess_each = TRUE) it now accepts the format parameter as a vector of options to let it choose from
    • +
    • Fix for clean_Date() not accepting already POSIX or Date input

    • +
    • When using clean_Date(..., guess_each = TRUE) it now accepts the format parameter as a vector of options to let it choose from

    • clean_Date() and clean_POSIXct gained a parameter max_date (that defaults to today), so that they will never return years beyond a specified date:

      # old
      @@ -191,19 +191,19 @@ 

    • Cleaned all code using the lintr package

    -
    +

    -cleaner 1.3.1 2020-01-31 +cleaner 1.3.1 2020-01-31

    -
    +

    -cleaner 1.3.0 2020-01-24 +cleaner 1.3.0 2020-01-24

    • Added functions clean_double() and clean_integer() @@ -213,9 +213,9 @@

    • Fixed a bug in frequency tables where sometimes the number of digits used for percentages would be astronomical
    -
    +

    -cleaner 1.2.0 2019-11-05 +cleaner 1.2.0 2019-11-05

    • DUE TO CRAN POLICY: RENAMED TO PACKAGE TO cleaner
    • @@ -227,9 +227,9 @@

    • Fix for digits in frequency tables for numeric values
    -
    +

    -clean 1.1.0 Unreleased +clean 1.1.0 Unreleased

    • Added support for currency as a new class: as.currency() and clean_currency(). They also come with ‘S3 methods’ for print, format, sum, min and max.
    • @@ -246,9 +246,9 @@

    • Fix for freq() where the precentage of NAs in the header was not calculated right
    -
    +

    -clean 1.0.0 Unreleased +clean 1.0.0 Unreleased

    • First release
    • @@ -271,7 +271,7 @@

      Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index 34b80c1..0ed49f9 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -1,6 +1,6 @@ -pandoc: 2.3.1 -pkgdown: 1.5.0 +pandoc: 2.7.3 +pkgdown: 1.5.1 pkgdown_sha: ~ articles: [] -last_built: 2020-04-23T09:34Z +last_built: 2020-06-01T13:42Z diff --git a/docs/reference/clean.html b/docs/reference/clean.html index 51c5a30..02bb47f 100644 --- a/docs/reference/clean.html +++ b/docs/reference/clean.html @@ -74,7 +74,7 @@ cleaner - 1.4.0.9000 + 1.5.0
    @@ -326,7 +326,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/currency.html b/docs/reference/currency.html index 9fc6424..64d4b12 100644 --- a/docs/reference/currency.html +++ b/docs/reference/currency.html @@ -74,7 +74,7 @@ cleaner - 1.4.0.9000 + 1.5.0
    @@ -198,7 +198,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/format_datetime.html b/docs/reference/format_datetime.html index db6a6ff..c454866 100644 --- a/docs/reference/format_datetime.html +++ b/docs/reference/format_datetime.html @@ -74,7 +74,7 @@ cleaner - 1.4.0.9000 + 1.5.0
    @@ -140,11 +140,11 @@

    Value

    Examples

    format_datetime("yyyy-mm-dd")
    #> [1] "%Y-%m-%d"
    # Very hard to remember all these characters: -format(Sys.time(), "%a %b %d %Y %X")
    #> [1] "Thu Apr 23 2020 11:34:05"
    +format(Sys.time(), "%a %b %d %Y %X")
    #> [1] "Mon Jun 01 2020 15:42:11"
    # Easy to remember and write the same as above: -format(Sys.time(), format_datetime("ddd mmm dd yyyy HH:MM:ss"))
    #> [1] "Thu Apr 23 2020 11:34:05"
    +format(Sys.time(), format_datetime("ddd mmm dd yyyy HH:MM:ss"))
    #> [1] "Mon Jun 01 2020 15:42:11"
    # seconds since the Epoch, 1970-01-01 00:00:00 -format(Sys.time(), format_datetime("epoch"))
    #> [1] "1587634445"
    +format(Sys.time(), format_datetime("epoch"))
    #> [1] "1591018931"
    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/format_names.html b/docs/reference/format_names.html index c414501..bd47316 100644 --- a/docs/reference/format_names.html +++ b/docs/reference/format_names.html @@ -74,7 +74,7 @@ cleaner - 1.4.0.9000 + 1.5.0
    @@ -162,8 +162,8 @@

    Examp name_123def = "value", This.is.a.column = "value") -format_names(df, snake_case = TRUE)
    #> name_341abc name_123def this_is_a_column -#> 1 value value value
    +format_names(df, snake_case = TRUE)
    #> name_341_abc name_123def this_is_a_column +#> 1 value value value
    format_names(df, camelCase = TRUE)
    #> name341ABC name123def thisIsAColumn #> 1 value value value
    format_names(df, letters[1:3])
    #> a b c @@ -172,20 +172,20 @@

    Examp #> 1 value value value

    rownames(mtcars) <- format_names(rownames(mtcars), snake_case = TRUE) mtcars[, 1:5]
    #> mpg cyl disp hp drat -#> mazda_rx4 21.0 6 160.0 110 3.90 -#> mazda_rx4_wag 21.0 6 160.0 110 3.90 +#> mazda_rx_4 21.0 6 160.0 110 3.90 +#> mazda_rx_4_wag 21.0 6 160.0 110 3.90 #> datsun_710 22.8 4 108.0 93 3.85 #> hornet_4_drive 21.4 6 258.0 110 3.08 #> hornet_sportabout 18.7 8 360.0 175 3.15 #> valiant 18.1 6 225.0 105 2.76 #> duster_360 14.3 8 360.0 245 3.21 -#> merc_240d 24.4 4 146.7 62 3.69 +#> merc_240_d 24.4 4 146.7 62 3.69 #> merc_230 22.8 4 140.8 95 3.92 #> merc_280 19.2 6 167.6 123 3.92 -#> merc_280c 17.8 6 167.6 123 3.92 -#> merc_450se 16.4 8 275.8 180 3.07 -#> merc_450sl 17.3 8 275.8 180 3.07 -#> merc_450slc 15.2 8 275.8 180 3.07 +#> merc_280_c 17.8 6 167.6 123 3.92 +#> merc_450_se 16.4 8 275.8 180 3.07 +#> merc_450_sl 17.3 8 275.8 180 3.07 +#> merc_450_slc 15.2 8 275.8 180 3.07 #> cadillac_fleetwood 10.4 8 472.0 205 2.93 #> lincoln_continental 10.4 8 460.0 215 3.00 #> chrysler_imperial 14.7 8 440.0 230 3.23 @@ -195,15 +195,15 @@

    Examp #> toyota_corona 21.5 4 120.1 97 3.70 #> dodge_challenger 15.5 8 318.0 150 2.76 #> amc_javelin 15.2 8 304.0 150 3.15 -#> camaro_z28 13.3 8 350.0 245 3.73 +#> camaro_z_28 13.3 8 350.0 245 3.73 #> pontiac_firebird 19.2 8 400.0 175 3.08 -#> fiat_x1_9 27.3 4 79.0 66 4.08 +#> fiat_x_1_9 27.3 4 79.0 66 4.08 #> porsche_914_2 26.0 4 120.3 91 4.43 #> lotus_europa 30.4 4 95.1 113 3.77 #> ford_pantera_l 15.8 8 351.0 264 4.22 #> ferrari_dino 19.7 6 145.0 175 3.62 #> maserati_bora 15.0 8 301.0 335 3.54 -#> volvo_142e 21.4 4 121.0 109 4.11

    +#> volvo_142_e 21.4 4 121.0 109 4.11
    format_names(list(a = 1, b = 2), c("new_1", "new_2"))
    #> $new_1 #> [1] 1 #> @@ -233,7 +233,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/freq.html b/docs/reference/freq.html index 706c803..410b805 100644 --- a/docs/reference/freq.html +++ b/docs/reference/freq.html @@ -75,7 +75,7 @@ cleaner - 1.4.0.9000 + 1.5.0 @@ -307,7 +307,7 @@

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/index.html b/docs/reference/index.html index 7fcb6b6..c29bfc5 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -73,7 +73,7 @@ cleaner - 1.4.0.9000 + 1.5.0 @@ -214,7 +214,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/na_replace.html b/docs/reference/na_replace.html index cbb6b96..e98886b 100644 --- a/docs/reference/na_replace.html +++ b/docs/reference/na_replace.html @@ -74,7 +74,7 @@ cleaner - 1.4.0.9000 + 1.5.0 @@ -174,7 +174,7 @@

    Examp #> [1,] 1 0 #> [2,] 2 3
    na_replace(c(1, 2, NA, NA))
    #> [1] 1 2 0 0
    na_replace(c(1, 2, NA, NA), replacement = -1)
    #> [1] 1 2 -1 -1
    na_replace(c(1, 2, NA, NA), replacement = c(0, -1))
    #> [1] 1 2 0 -1
    -na_replace(c(Sys.Date(), NA)) # replacement defaults to 'today'
    #> [1] "2020-04-23" "2020-04-23"
    +na_replace(c(Sys.Date(), NA)) # replacement defaults to 'today'
    #> [1] "2020-06-01" "2020-06-01"
    na_replace(c(TRUE, FALSE, NA))
    #> [1] TRUE FALSE FALSE
    na_replace(c(TRUE, FALSE, NA), replacement = TRUE)
    #> [1] TRUE FALSE TRUE
    # we're flexible, the class only remains the same if # the replacement value allows it @@ -210,7 +210,7 @@

    Examp starwars %>% group_by(hair_color) %>% na_replace(hair_color, replacement = "TEST!") %>% - summarise(n = n()) + summarise(n = n()) }

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/percentage.html b/docs/reference/percentage.html index ad2e589..1c94fd8 100644 --- a/docs/reference/percentage.html +++ b/docs/reference/percentage.html @@ -74,7 +74,7 @@ cleaner - 1.4.0.9000 + 1.5.0 @@ -177,7 +177,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/rdate.html b/docs/reference/rdate.html index bb4504c..98c2f24 100644 --- a/docs/reference/rdate.html +++ b/docs/reference/rdate.html @@ -74,7 +74,7 @@ cleaner - 1.4.0.9000 + 1.5.0 @@ -166,7 +166,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/regex_true_false.html b/docs/reference/regex_true_false.html index ffdbd03..45820f2 100644 --- a/docs/reference/regex_true_false.html +++ b/docs/reference/regex_true_false.html @@ -74,7 +74,7 @@ cleaner - 1.4.0.9000 + 1.5.0 @@ -150,7 +150,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/unclean.html b/docs/reference/unclean.html index 53b5669..ed500b5 100644 --- a/docs/reference/unclean.html +++ b/docs/reference/unclean.html @@ -74,7 +74,7 @@ cleaner - 1.4.0.9000 + 1.5.0 @@ -152,7 +152,7 @@

    Contents

    -

    Site built with pkgdown 1.5.0.

    +

    Site built with pkgdown 1.5.1.