Skip to content

Commit

Permalink
tidyfreud v1.0.0
Browse files Browse the repository at this point in the history
  • Loading branch information
gygl committed Jun 24, 2024
1 parent 41f742c commit 5912761
Show file tree
Hide file tree
Showing 10 changed files with 91 additions and 92 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
Package: tidyfreud
Type: Package
Title: Dataset with Sigmund Freud's complete work in tidy format
Version: 0.2.0
Version: 1.0.0
Author: Lucien Gyger <[email protected]>
Maintainer: Lucien Gyger <[email protected]>
Description: Data package that contains the complete work of Sigmund Freud in
a tidy format ready for NLP.
a tidy format tokenized by page, by sentence and by word.
License: MIT + file LICENSE
Encoding: UTF-8
LazyData: true
Expand Down
24 changes: 14 additions & 10 deletions R/create-sigmund-freud-complete-work-tibble.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#'
#' @param path_pdf string, path to pdf
#'
#' @return list, contains 2 dataframe with text by page and text by line
#' @return list, contains 3 dataframes with text by page, by sentence and by word
#' @export
#'
create_sfreud_complete_work_tibble = function(path_pdf) {
Expand Down Expand Up @@ -200,23 +200,27 @@ create_sfreud_complete_work_tibble = function(path_pdf) {
text_by_sntnce = text_by_line %>%
group_by(pg_all, publi_yr, writ_yr, title, subtitle, signature, date_letters) %>%
summarize(text = paste(text, collapse = " ")) %>%
mutate(text = map(text, ~(str_split(.x, "(?<=[\\.])")))) %>%
unnest(text) %>% unnest(text) %>%
mutate(text = trimws(text)) %>%
filter(nchar(text) != 0) %>%
mutate(sentence = map(text, ~(str_split(.x, "(?<=[\\.])")))) %>%
unnest(sentence) %>% unnest(sentence) %>%
mutate(sentence = trimws(sentence)) %>%
filter(nchar(sentence) != 0) %>%
group_by(pg_all) %>% mutate(sntce_nb = 1:n()) %>%
ungroup() %>%
mutate(pg_title = pg_all - min(pg_all) + 1) %>%
select(pg_all, publi_yr, writ_yr, title, pg_title, sntce_nb, subtitle, text, signature, date_letters)
select(pg_all, publi_yr, writ_yr, title, pg_title, sntce_nb, subtitle, sentence, signature, date_letters)


# by word
text_by_word = text_by_line %>%
text_by_word = text_by_sntnce %>%
# tokenize to words
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
unnest_tokens(word, sentence) %>%
left_join(
stop_words %>% select(-lexicon) %>%
mutate(is_stop_word = TRUE) %>%
distinct) %>%
# remove digits only
filter(str_detect(word, "^\\d+$", negate=TRUE))
filter(str_detect(word, "^\\d+$", negate=TRUE)) %>%
mutate(is_stop_word = ifelse(is.na(is_stop_word), FALSE, is_stop_word))

# setdiff(text_by_line$title %>% unique, titles_tbl$titles)
# setdiff(titles_tbl$titles, text_by_line$title %>% unique)
Expand Down
52 changes: 25 additions & 27 deletions R/data.R
Original file line number Diff line number Diff line change
@@ -1,50 +1,48 @@
#' Sigmund Freud data
#' Tibble with Sigmund Freud's complete work by page
#'
#' Sigmund Freud complete work by pages
#'
#' @format # `freud_page`
#' A data frame with 5,102 rows and 2 columns:
#' @format
#' `freud_page` a data frame with 5,102 rows and 2 columns:
#' \describe{
#' \item{pg_all}{page of original PDF file}
#' \item{text}{raw text from each page}
#' \item{pg_all}{integer, page of original PDF file}
#' \item{text}{character, raw text from each page}
#' }
#'
#' @source <https://www.valas.fr/IMG/pdf/Freud_Complete_Works.pdf>
#' @source \href{https://www.valas.fr/IMG/pdf/Freud_Complete_Works.pdf}{Text source}
"freud_page"


#' Sigmund Freud complete work by sentences
#' Tibble with Sigmund Freud's complete work by sentences
#'
#' @format # `freud_sntnce`
#' A data frame with 125,571 rows and 6 columns:
#' @format `freud_sntnce` a data frame with 125,571 rows and 6 columns:
#'
#' \describe{
#' \item{pg_all}{page number in original PDF file}
#' \item{publi_yr}{year of book / article publication}
#' \item{writ_yr}{year of book / article writing}
#' \item{title}{title of book / article}
#' \item{pg_title}{page number in book / article}
#' \item{text}{raw text from each sentence}
#' \item{pg_all}{integer, page number in original PDF file}
#' \item{publi_yr}{integer, year of book / article publication}
#' \item{writ_yr}{integer, year of book / article writing}
#' \item{title}{factor, title of book / article}
#' \item{pg_title}{integer, page number in book / article}
#' \item{sentence}{character, raw text from each sentence}
#' }
#' @source <https://www.valas.fr/IMG/pdf/Freud_Complete_Works.pdf>
#' @source \href{https://www.valas.fr/IMG/pdf/Freud_Complete_Works.pdf}{Text source}
"freud_sntnce"

#' #' Sigmund Freud complete work by words
#' Tibble with Sigmund Freud's complete work by words
#'
#' Sigmund Freud complete work by word. Stop words are not removed; they are
#' flagged in `is_stop_word` using tidytext::stop_words (lexicons onix, SMART,
#' and snowball). Tokens made only of digits are dropped.
#'
#' @format ## `freud_word`
#' A data frame with 125,571 rows and 6 columns:
#' @format `freud_word` a data frame with 125,571 rows and 6 columns:
#'
#' \describe{
#' \item{pg_all}{page number in original PDF file}
#' \item{publi_yr}{year of book / article publication}
#' \item{writ_yr}{year of book / article writing}
#' \item{title}{title of book / article}
#' \item{pg_title}{page number in book / article}
#' \item{wor}{word from each page, without stop words}
#' ...
#' \item{pg_all}{integer, page number in original PDF file}
#' \item{publi_yr}{integer, year of book / article publication}
#' \item{writ_yr}{integer, year of book / article writing}
#' \item{title}{factor, title of book / article}
#' \item{pg_title}{integer, page number in book / article}
#' \item{word}{character, word token from each sentence (stop words included)}
#' \item{is_stop_word}{logical, TRUE if word is a stop word (as defined in [tidytext::stop_words])}
#' }
#' @source <https://www.valas.fr/IMG/pdf/Freud_Complete_Works.pdf>
#' @source \href{https://www.valas.fr/IMG/pdf/Freud_Complete_Works.pdf}{Text source}
"freud_word"
14 changes: 7 additions & 7 deletions _targets/meta/meta
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
name|type|data|command|depend|seed|path|time|size|bytes|format|repository|iteration|parent|children|seconds|warnings|error
.Random.seed|object|58631355bc8bf2c9|||||||||||||||
.Random.seed|object|e59d697c8306747b|||||||||||||||
concatenate_consecutive_subtitles|function|7b5298075293f0df|||||||||||||||
create_sfreud_complete_work_tibble|function|b6b358b548738e39|||||||||||||||
create_sfreud_complete_work_tibble|function|8ed48f5b9be18afc|||||||||||||||
display_a_page|function|73d666217acaa204|||||||||||||||
freud|stem|5f3efc2a2fa5258e|41ec31a8c2c983ed|feb72ff9af80a592|846140273||t19897.8683082789s|dcbf51980bfd6b9d|11683207|rds|local|vector|||9.47||
freud|stem|a83b346c44b565ab|41ec31a8c2c983ed|f554b6e75c2dd5e4|846140273||t19898.7225816573s|13707e6fd0ad92cf|14694398|rds|local|vector|||13.39||
freud_line|stem|d731cf8310cd6ccb|e554bd5f0e19d38c|f3cdc296bcd6e5ce|1988123539||t19897.8683247404s|a1c456b03adbfbf8|44|rds|local|vector|||0.01||
freud_page|stem|050b7706c2384cc0|f1373dc45e14ca7d|f3cdc296bcd6e5ce|-1579811211||t19897.8683167805s|d77aeeb0b024b5e1|4385126|rds|local|vector|||0||
freud_sntnce|stem|4572337043a747e5|e0674643793b09f3|f3cdc296bcd6e5ce|1172296846||t19898.2480053923s|f56930cc4dc64680|4381489|rds|local|vector|||0||
freud_word|stem|77f6b4dd3b6b830a|bf48ec9bd6c5b994|f3cdc296bcd6e5ce|-567062330||t19897.868324316s|33dd81572a3a3f1d|2896546|rds|local|vector|||0.01||
freud_page|stem|050b7706c2384cc0|f1373dc45e14ca7d|89065a06d940e101|-1579811211||t19898.7226062273s|d77aeeb0b024b5e1|4385126|rds|local|vector|||0||
freud_sntnce|stem|3b8c071005d4369d|e0674643793b09f3|89065a06d940e101|1172296846||t19898.7225945735s|f56930cc4dc64680|4381489|rds|local|vector|||0||
freud_word|stem|1e4df77e0e6993a9|bf48ec9bd6c5b994|89065a06d940e101|-567062330||t19898.7226329591s|38463dbccd45d0e5|5921453|rds|local|vector|||0||
path_pdf|stem|5015d9729add2bf6|322cb8b91b3c70b3|ef46db3751d8e999|-399995618|data/Freud_Complete_Works.pdf|t19889.7373840556s|c56f380039ea53ff|11187482|file|local|vector|||0||
simplify_title|function|d53ae6a8b5c530cd|||||||||||||||
use_data|function|c168c15a8a661f1e|||||||||||||||
use_data_for_pkg|stem|d731cf8310cd6ccb|1aab59c92ade7c53|86e09299daf397fa|-2061958436||t19898.2481214103s|a1c456b03adbfbf8|44|rds|local|vector|||9.39||
use_data_for_pkg|stem|d731cf8310cd6ccb|1aab59c92ade7c53|244f6411b296eda6|-2061958436||t19898.7228958076s|a1c456b03adbfbf8|44|rds|local|vector|||22.67||
Binary file modified data/freud_sntnce.rda
Binary file not shown.
Binary file modified data/freud_word.rda
Binary file not shown.
29 changes: 0 additions & 29 deletions man/freud_line.Rd

This file was deleted.

11 changes: 5 additions & 6 deletions man/freud_page.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 28 additions & 0 deletions man/freud_sntnce.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 10 additions & 11 deletions man/freud_word.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 5912761

Please sign in to comment.