feat: Add option to remove default stopwords from word summary
CMG203 authored and committed Nov 16, 2024
1 parent 6388644 commit f9e599a
Showing 4 changed files with 66 additions and 17 deletions.
1 change: 1 addition & 0 deletions requirements-dev.txt
@@ -9,3 +9,4 @@ sphinx_rtd_theme>=0.4.3
sphinx-autodoc-typehints>=1.10.3
sphinx-multiversion>=0.2.3
autodoc_pydantic
nltk
3 changes: 2 additions & 1 deletion requirements-test.txt
@@ -6,4 +6,5 @@ pytest-spark
nbval
pyarrow
twine>=3.1.1
kaggle
kaggle
nltk
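The new nltk requirement needs its English stopwords corpus at runtime. The module changed below downloads it on import via nltk.download('stopwords'); to fetch it ahead of time (for example in a CI step), a minimal sketch:

import nltk

# One-time download of the English stopwords corpus used by word_summary_vc.
nltk.download("stopwords")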
42 changes: 32 additions & 10 deletions src/ydata_profiling/model/pandas/describe_categorical_pandas.py
@@ -5,6 +5,8 @@

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import nltk

from ydata_profiling.config import Settings
from ydata_profiling.model.pandas.imbalance_pandas import column_imbalance_score
@@ -18,6 +20,9 @@
)


nltk.download('stopwords')


def get_character_counts_vc(vc: pd.Series) -> pd.Series:
    series = pd.Series(vc.index, index=vc)
    characters = series[series != ""].apply(list)
@@ -151,41 +156,58 @@ def unicode_summary_vc(vc: pd.Series) -> dict:
    return summary


def word_summary_vc(vc: pd.Series, stop_words: List[str] = []) -> dict:
def word_summary_vc(
    vc: pd.Series,
    stop_words: List[str] = [],
    remove_default_stopwords: bool = True,
    keep_stopwords: List[str] = []
) -> dict:
"""Count the number of occurrences of each individual word across
all lines of the data Series, then sort from the word with the most
occurrences to the word with the least occurrences. If a list of
stop words is given, they will be ignored.
stop words is given, they will be ignored, along with default
English stopwords if remove_default_stopwords is True.
Args:
vc: Series containing all unique categories as index and their
frequency as value. Sorted from the most frequent down.
stop_words: List of stop words to ignore, empty by default.
remove_default_stopwords: Boolean flag to decide if default
English stopwords should be removed, default is True.
keep_stopwords: List of stop words to keep, even if they are
part of the default or custom stop words.
Returns:
A dict containing the results as a Series with unique words as
index and the computed frequency as value
index and the computed frequency as value.
"""
# TODO: configurable lowercase/punctuation etc.
# TODO: remove punctuation in words
# Convert custom stop words to lowercase
stop_words = {word.lower() for word in stop_words}

# Merge default stop words if enabled
if remove_default_stopwords:
default_stop_words = set(stopwords.words('english'))
stop_words = stop_words.union(default_stop_words)

# Remove any words specified in keep_stopwords
stop_words -= set(word.lower() for word in keep_stopwords)

# Prepare series for word count
series = pd.Series(vc.index, index=vc)
word_lists = series.str.lower().str.split()
words = word_lists.explode().str.strip(string.punctuation + string.whitespace)
word_counts = pd.Series(words.index, index=words)
# fix for pandas 1.0.5
word_counts = word_counts[word_counts.index.notnull()]
word_counts = word_counts.groupby(level=0, sort=False).sum()
word_counts = word_counts.sort_values(ascending=False)

# Remove stop words
if len(stop_words) > 0:
stop_words = [x.lower() for x in stop_words]
word_counts = word_counts.loc[~word_counts.index.isin(stop_words)]
# Exclude stop words
word_counts = word_counts.loc[~word_counts.index.isin(stop_words)]

return {"word_counts": word_counts} if not word_counts.empty else {}



def length_summary_vc(vc: pd.Series) -> dict:
    series = pd.Series(vc.index, index=vc)
    length = series.str.len()
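A minimal usage sketch of the revised word_summary_vc (the series below mirrors the one used in the tests; the outputs assume NLTK's English stopwords, which include "the" and "is", have been downloaded):

import pandas as pd

from ydata_profiling.model.pandas.describe_categorical_pandas import word_summary_vc

# Value counts as the profiler produces them: unique values as index, frequencies as data.
vc = pd.Series(index=["The dog", "is hungry"], data=[2, 1])

# Default behaviour: custom stop words plus NLTK's English stopwords are removed.
word_summary_vc(vc=vc)["word_counts"].to_dict()
# {'dog': 2, 'hungry': 1}

# Drop "dog" explicitly, but keep "is" even though it is a default English stopword.
word_summary_vc(vc=vc, stop_words=["dog"], keep_stopwords=["is"])["word_counts"].to_dict()
# {'is': 1, 'hungry': 1}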
37 changes: 31 additions & 6 deletions tests/unit/test_pandas/test_describe_categorical_pandas.py
@@ -1,23 +1,48 @@
import pandas as pd
import pytest

from ydata_profiling.model.pandas.describe_categorical_pandas import word_summary_vc

value_counts_w_words = pd.Series(index=["The dog", "is hungry"], data=[2, 1])


# Test the basic word summary function
def test_word_summary_vc():
    assert (
        word_summary_vc(vc=value_counts_w_words)["word_counts"].to_dict()
        word_summary_vc(vc=value_counts_w_words, remove_default_stopwords=False)["word_counts"].to_dict()
        == pd.Series(index=["the", "dog", "is", "hungry"], data=[2, 2, 1, 1]).to_dict()
    )


@pytest.mark.parametrize("stop_words", [["The"], ["the", "a"]])
# Test word summary function with custom stop words
@pytest.mark.parametrize("stop_words", [["the"], ["the", "a"]])
def test_word_summary_vc_with_stop_words(stop_words):
    assert (
        word_summary_vc(vc=value_counts_w_words, stop_words=stop_words)[
        word_summary_vc(vc=value_counts_w_words, stop_words=stop_words, remove_default_stopwords=False)[
            "word_counts"
        ].to_dict()
        == pd.Series(index=["dog", "is", "hungry"], data=[2, 1, 1]).to_dict()
    )

# Test word summary function with default stopwords removed
def test_word_summary_vc_with_default_stopwords():
    assert (
        word_summary_vc(vc=value_counts_w_words, remove_default_stopwords=True)["word_counts"].to_dict()
        == pd.Series(index=["dog", "hungry"], data=[2, 1]).to_dict()
    )

# Test word summary function with both custom and default stop words
@pytest.mark.parametrize(
"stop_words, expected",
[
(["dog"], {"hungry": 1}), # Custom stop word "dog", "is" removed as a default stopword
(["the", "is"], {"dog": 2, "hungry": 1}), # Custom stop words "the" and "is"
],
)
def test_word_summary_vc_with_custom_and_default_stop_words(stop_words, expected):
    result = word_summary_vc(vc=value_counts_w_words, stop_words=stop_words, remove_default_stopwords=True)["word_counts"].to_dict()
    assert result == expected

# Test word summary function with keep_stopwords
def test_word_summary_vc_with_keep_stopwords():
    assert (
        word_summary_vc(vc=value_counts_w_words, remove_default_stopwords=True, keep_stopwords=["is"])["word_counts"].to_dict()
        == pd.Series(index=["dog", "is", "hungry"], data=[2, 1, 1]).to_dict()
    )

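To run just the cases added here, a hypothetical local invocation that filters on the shared word_summary name prefix:

import pytest

# Select only the word_summary_vc tests from this file.
pytest.main(["tests/unit/test_pandas/test_describe_categorical_pandas.py", "-k", "word_summary"])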