-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Add option to remove default stopwords from word summary
- Loading branch information
CMG203
authored and
CMG203
committed
Nov 16, 2024
1 parent
6388644
commit f9e599a
Showing
4 changed files
with
66 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,3 +9,4 @@ sphinx_rtd_theme>=0.4.3 | |
sphinx-autodoc-typehints>=1.10.3 | ||
sphinx-multiversion>=0.2.3 | ||
autodoc_pydantic | ||
nltk |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,4 +6,5 @@ pytest-spark | |
nbval | ||
pyarrow | ||
twine>=3.1.1 | ||
kaggle | ||
kaggle | ||
nltk |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
37 changes: 31 additions & 6 deletions
37
tests/unit/test_pandas/test_describe_categorical_pandas.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,23 +1,48 @@ | ||
import pandas as pd | ||
import pytest | ||
|
||
from ydata_profiling.model.pandas.describe_categorical_pandas import word_summary_vc | ||
|
||
value_counts_w_words = pd.Series(index=["The dog", "is hungry"], data=[2, 1]) | ||
|
||
|
||
# Test the basic word summary function | ||
def test_word_summary_vc(): | ||
assert ( | ||
word_summary_vc(vc=value_counts_w_words)["word_counts"].to_dict() | ||
word_summary_vc(vc=value_counts_w_words, remove_default_stopwords=False)["word_counts"].to_dict() | ||
== pd.Series(index=["the", "dog", "is", "hungry"], data=[2, 2, 1, 1]).to_dict() | ||
) | ||
|
||
|
||
@pytest.mark.parametrize("stop_words", [["The"], ["the", "a"]]) | ||
# Test word summary function with custom stop words | ||
@pytest.mark.parametrize("stop_words", [["the"], ["the", "a"]]) | ||
def test_word_summary_vc_with_stop_words(stop_words): | ||
assert ( | ||
word_summary_vc(vc=value_counts_w_words, stop_words=stop_words)[ | ||
word_summary_vc(vc=value_counts_w_words, stop_words=stop_words, remove_default_stopwords=False)[ | ||
"word_counts" | ||
].to_dict() | ||
== pd.Series(index=["dog", "is", "hungry"], data=[2, 1, 1]).to_dict() | ||
) | ||
|
||
# Test word summary function with default stopwords removed | ||
def test_word_summary_vc_with_default_stopwords(): | ||
assert ( | ||
word_summary_vc(vc=value_counts_w_words, remove_default_stopwords=True)["word_counts"].to_dict() | ||
== pd.Series(index=["dog", "hungry"], data=[2, 1]).to_dict() | ||
) | ||
|
||
# Test word summary function with both custom and default stop words | ||
@pytest.mark.parametrize( | ||
"stop_words, expected", | ||
[ | ||
(["dog"], {"hungry": 1}), # Custom stop word "dog"; "the" and "is" are removed as default stopwords | |
(["the", "is"], {"dog": 2, "hungry": 1}), # Custom stop words "the" and "is" | ||
], | ||
) | ||
def test_word_summary_vc_with_custom_and_default_stop_words(stop_words, expected): | ||
result = word_summary_vc(vc=value_counts_w_words, stop_words=stop_words, remove_default_stopwords=True)["word_counts"].to_dict() | ||
assert result == expected | ||
|
||
# Test word summary function with keep_stopwords | ||
def test_word_summary_vc_with_keep_stopwords(): | ||
assert ( | ||
word_summary_vc(vc=value_counts_w_words, remove_default_stopwords=True, keep_stopwords=["is"])["word_counts"].to_dict() | ||
== pd.Series(index=["dog", "is", "hungry"], data=[2, 1, 1]).to_dict() | ||
) |