tidyfreud v1.0.0

gygl · Jun 24, 2024 · 5912761 · 5912761
1 parent 41f742c
commit 5912761
Show file tree

Hide file tree

Showing 10 changed files with 91 additions and 92 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,11 +1,11 @@
 Package: tidyfreud
 Type: Package
 Title: Dataset with Sigmund Freud's complete work in tidy format
-Version: 0.2.0
+Version: 1.0.0
 Author: Lucien Gyger <[email protected]>
 Maintainer: Lucien Gyger <[email protected]>
 Description: Data package that contains the complete work of Sigmund Freud in 
-    a tidy format ready for NLP.
+    a tidy format tokenized by page, by sentence and by word.
 License: MIT + file LICENSE
 Encoding: UTF-8
 LazyData: true

diff --git a/R/create-sigmund-freud-complete-work-tibble.R b/R/create-sigmund-freud-complete-work-tibble.R
@@ -3,7 +3,7 @@
 #'
 #' @param path_pdf string, path to pdf
 #'
-#' @return list, contains 2 dataframe with text by page and text by line
+#' @return list, contains 3 dataframes with text by page, by line and by words
 #' @export
 #'
 create_sfreud_complete_work_tibble = function(path_pdf) {
@@ -200,23 +200,27 @@ create_sfreud_complete_work_tibble = function(path_pdf) {
   text_by_sntnce = text_by_line %>%
     group_by(pg_all, publi_yr, writ_yr, title, subtitle, signature, date_letters) %>%
     summarize(text = paste(text, collapse = " ")) %>%
-    mutate(text = map(text, ~(str_split(.x, "(?<=[\\.])")))) %>%
-    unnest(text) %>% unnest(text) %>%
-    mutate(text = trimws(text)) %>%
-    filter(nchar(text) != 0) %>%
+    mutate(sentence = map(text, ~(str_split(.x, "(?<=[\\.])")))) %>%
+    unnest(sentence) %>% unnest(sentence) %>%
+    mutate(sentence = trimws(sentence)) %>%
+    filter(nchar(sentence) != 0) %>%
     group_by(pg_all) %>% mutate(sntce_nb = 1:n()) %>%
     ungroup() %>%
     mutate(pg_title = pg_all - min(pg_all) + 1) %>%
-    select(pg_all, publi_yr, writ_yr, title, pg_title, sntce_nb, subtitle, text, signature, date_letters)
+    select(pg_all, publi_yr, writ_yr, title, pg_title, sntce_nb, subtitle, sentence, signature, date_letters)
 
 
   # by word
-  text_by_word = text_by_line %>%
+  text_by_word = text_by_sntnce %>%
     # tokenize to words
-    unnest_tokens(word, text) %>%
-    anti_join(stop_words) %>%
+    unnest_tokens(word, sentence) %>%
+    left_join(
+      stop_words %>% select(-lexicon) %>%
+        mutate(is_stop_word = TRUE) %>%
+        distinct) %>%
     # remove digits only
-    filter(str_detect(word, "^\\d+$", negate=TRUE))
+    filter(str_detect(word, "^\\d+$", negate=TRUE)) %>%
+    mutate(is_stop_word = ifelse(is.na(is_stop_word), FALSE, is_stop_word))
 
   # setdiff(text_by_line$title %>% unique, titles_tbl$titles)
   # setdiff(titles_tbl$titles, text_by_line$title %>% unique)

diff --git a/R/data.R b/R/data.R
@@ -1,50 +1,48 @@
-#' Sigmund Freud data
+#' Tibble with Sigmund Freud's complete work by page
 #'
 #' Sigmund Freud complete work by pages
 #'
-#' @format # `freud_page`
-#' A data frame with 5,102 rows and 2 columns:
+#' @format
+#' `freud_page` a data frame with 5,102 rows and 2 columns:
 #' \describe{
-#'   \item{pg_all}{page of original PDF file}
-#'   \item{text}{raw text from each page}
+#'   \item{pg_all}{integer, page of original PDF file}
+#'   \item{text}{character, raw text from each page}
 #'   }
 #'
-#' @source <https://www.valas.fr/IMG/pdf/Freud_Complete_Works.pdf>
+#' @source \href{https://www.valas.fr/IMG/pdf/Freud_Complete_Works.pdf}{Text source}
 "freud_page"
 
 
-#' Sigmund Freud complete work by sentences
+#' Tibble with Sigmund Freud's complete work by sentences
 #'
-#' @format # `freud_sntnce`
-#' A data frame with 125,571 rows and 6 columns:
+#' @format `freud_sntnce` a data frame with 125,571 rows and 6 columns:
 #'
 #' \describe{
-#'   \item{pg_all}{page number in original PDF file}
-#'   \item{publi_yr}{year of book / article publication}
-#'   \item{writ_yr}{year of book / article writing}
-#'   \item{title}{title of book / article}
-#'   \item{pg_title}{page number in book / article}
-#'   \item{text}{raw text from each sentence}
+#'   \item{pg_all}{integer, page number in original PDF file}
+#'   \item{publi_yr}{integer, year of book / article publication}
+#'   \item{writ_yr}{integer, year of book / article writing}
+#'   \item{title}{factor, title of book / article}
+#'   \item{pg_title}{integer, page number in book / article}
+#'   \item{sentence}{character, raw text from each sentence}
 #' }
-#' @source <https://www.valas.fr/IMG/pdf/Freud_Complete_Works.pdf>
+#' @source \href{https://www.valas.fr/IMG/pdf/Freud_Complete_Works.pdf}{Text source}
 "freud_sntnce"
 
-#' #' Sigmund Freud complete work by words
+#' Tibble with Sigmund Freud's complete work by words
 #'
 #' Sigmund Freud complete work by word. Stop words are kept and flagged in
 #' `is_stop_word` using tidytext::stop_words (lexicons onix, SMART, and snowball)
 #'
-#' @format ## `freud_word`
-#' A data frame with 125,571 rows and 6 columns:
+#' @format `freud_word` a data frame with 125,571 rows and 6 columns:
 #'
 #' \describe{
-#'   \item{pg_all}{page number in original PDF file}
-#'   \item{publi_yr}{year of book / article publication}
-#'   \item{writ_yr}{year of book / article writing}
-#'   \item{title}{title of book / article}
-#'   \item{pg_title}{page number in book / article}
-#'   \item{wor}{word from each page, without stop words}
-#'   ...
+#'   \item{pg_all}{integer, page number in original PDF file}
+#'   \item{publi_yr}{integer, year of book / article publication}
+#'   \item{writ_yr}{integer, year of book / article writing}
+#'   \item{title}{factor, title of book / article}
+#'   \item{pg_title}{integer, page number in book / article}
+#'   \item{word}{character, word tokenized from each sentence (stop words are kept, see `is_stop_word`)}
+#'   \item{is_stop_word}{logical, TRUE if word is a stop word (as defined in [tidytext::stop_words])}
 #' }
-#' @source <https://www.valas.fr/IMG/pdf/Freud_Complete_Works.pdf>
+#' @source \href{https://www.valas.fr/IMG/pdf/Freud_Complete_Works.pdf}{Text source}
 "freud_word"
diff --git a/_targets/meta/meta b/_targets/meta/meta
@@ -1,14 +1,14 @@
 name|type|data|command|depend|seed|path|time|size|bytes|format|repository|iteration|parent|children|seconds|warnings|error
-.Random.seed|object|58631355bc8bf2c9|||||||||||||||
+.Random.seed|object|e59d697c8306747b|||||||||||||||
 concatenate_consecutive_subtitles|function|7b5298075293f0df|||||||||||||||
-create_sfreud_complete_work_tibble|function|b6b358b548738e39|||||||||||||||
+create_sfreud_complete_work_tibble|function|8ed48f5b9be18afc|||||||||||||||
 display_a_page|function|73d666217acaa204|||||||||||||||
-freud|stem|5f3efc2a2fa5258e|41ec31a8c2c983ed|feb72ff9af80a592|846140273||t19897.8683082789s|dcbf51980bfd6b9d|11683207|rds|local|vector|||9.47||
+freud|stem|a83b346c44b565ab|41ec31a8c2c983ed|f554b6e75c2dd5e4|846140273||t19898.7225816573s|13707e6fd0ad92cf|14694398|rds|local|vector|||13.39||
 freud_line|stem|d731cf8310cd6ccb|e554bd5f0e19d38c|f3cdc296bcd6e5ce|1988123539||t19897.8683247404s|a1c456b03adbfbf8|44|rds|local|vector|||0.01||
-freud_page|stem|050b7706c2384cc0|f1373dc45e14ca7d|f3cdc296bcd6e5ce|-1579811211||t19897.8683167805s|d77aeeb0b024b5e1|4385126|rds|local|vector|||0||
-freud_sntnce|stem|4572337043a747e5|e0674643793b09f3|f3cdc296bcd6e5ce|1172296846||t19898.2480053923s|f56930cc4dc64680|4381489|rds|local|vector|||0||
-freud_word|stem|77f6b4dd3b6b830a|bf48ec9bd6c5b994|f3cdc296bcd6e5ce|-567062330||t19897.868324316s|33dd81572a3a3f1d|2896546|rds|local|vector|||0.01||
+freud_page|stem|050b7706c2384cc0|f1373dc45e14ca7d|89065a06d940e101|-1579811211||t19898.7226062273s|d77aeeb0b024b5e1|4385126|rds|local|vector|||0||
+freud_sntnce|stem|3b8c071005d4369d|e0674643793b09f3|89065a06d940e101|1172296846||t19898.7225945735s|f56930cc4dc64680|4381489|rds|local|vector|||0||
+freud_word|stem|1e4df77e0e6993a9|bf48ec9bd6c5b994|89065a06d940e101|-567062330||t19898.7226329591s|38463dbccd45d0e5|5921453|rds|local|vector|||0||
 path_pdf|stem|5015d9729add2bf6|322cb8b91b3c70b3|ef46db3751d8e999|-399995618|data/Freud_Complete_Works.pdf|t19889.7373840556s|c56f380039ea53ff|11187482|file|local|vector|||0||
 simplify_title|function|d53ae6a8b5c530cd|||||||||||||||
 use_data|function|c168c15a8a661f1e|||||||||||||||
-use_data_for_pkg|stem|d731cf8310cd6ccb|1aab59c92ade7c53|86e09299daf397fa|-2061958436||t19898.2481214103s|a1c456b03adbfbf8|44|rds|local|vector|||9.39||
+use_data_for_pkg|stem|d731cf8310cd6ccb|1aab59c92ade7c53|244f6411b296eda6|-2061958436||t19898.7228958076s|a1c456b03adbfbf8|44|rds|local|vector|||22.67||
diff --git a/data/freud_sntnce.rda b/data/freud_sntnce.rda
diff --git a/data/freud_word.rda b/data/freud_word.rda
diff --git a/man/freud_line.Rd b/man/freud_line.Rd
diff --git a/man/freud_page.Rd b/man/freud_page.Rd
diff --git a/man/freud_sntnce.Rd b/man/freud_sntnce.Rd
diff --git a/man/freud_word.Rd b/man/freud_word.Rd