fuzzy_dedupe filter addition
Signed-off-by: Rucha Apte <[email protected]>
ruchaa-apte committed Oct 24, 2024
1 parent 7d7767b commit e177052
Showing 4 changed files with 63 additions and 14 deletions.
8 changes: 5 additions & 3 deletions tutorials/dapt-curation/README.md
@@ -45,8 +45,10 @@ The tutorial follows the steps below:<br>

After installing the NeMo Curator package, install the remaining dependencies and run the pipeline:

```bash
pip install -r code/requirements.txt
cd code
python main.py
```

This will download the chip-design-related datasets and start the data curation pipeline.
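For orientation, the sketch below shows roughly what the new fuzzy-deduplication stage in `code/main.py` does with each dataset. It is a simplified illustration, not the committed script: `dedupe_on_gpu` and `kind` are hypothetical names introduced here, while `DocumentDataset`, `to_backend`, and the new `fuzzy_dedupe` helper in `code/utils.py` come from the changes shown below. A GPU-backed Dask (cuDF) environment is assumed.

```python
# Simplified sketch of the new fuzzy-dedupe stage (see code/main.py below).
from nemo_curator.datasets import DocumentDataset
from utils import fuzzy_dedupe  # helper added to code/utils.py in this commit

def dedupe_on_gpu(dataset: DocumentDataset, kind: str) -> DocumentDataset:
    # Fuzzy deduplication runs on the GPU, so move the Dask partitions to cuDF first.
    gpu_dataset = DocumentDataset(dataset.df.to_backend("cudf"))
    deduped = fuzzy_dedupe(dataset=gpu_dataset, type=kind)
    # Return to a pandas backend for the CPU-side steps (writing JSON, metadata split).
    return DocumentDataset(deduped.df.to_backend("pandas"))
```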
37 changes: 29 additions & 8 deletions tutorials/dapt-curation/code/main.py
@@ -31,6 +31,7 @@
filter_code,
filter_text,
redact_code,
fuzzy_dedupe,
)

import nemo_curator as nc
@@ -168,16 +169,30 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None:
)

dataset_text = curation_steps_text(orig_dataset_text)
dataset_text = dataset_text.persist()
gpu_dataset_text = DocumentDataset(dataset_text.df.to_backend("cudf"))

dataset_code = curation_steps_code(orig_dataset_code)
gpu_dataset_code = DocumentDataset(dataset_code.df.to_backend("cudf"))

print("Executing the fuzzy dedupe pipeline...")
fuzzy_dataset_text = fuzzy_dedupe(dataset=gpu_dataset_text, type='text')
fuzzy_dataset_code = fuzzy_dedupe(dataset=gpu_dataset_code, type='code')

# Move the deduplicated datasets back to the pandas (CPU) backend for the remaining steps.
fuzzy_dataset_text = DocumentDataset(fuzzy_dataset_text.df.to_backend("pandas"))
fuzzy_dataset_code = DocumentDataset(fuzzy_dataset_code.df.to_backend("pandas"))

fuzzy_dataset_text = fuzzy_dataset_text.persist()
fuzzy_dataset_code = fuzzy_dataset_code.persist()

print(f"Original dataset length for text files: {len(orig_dataset_text.df)}")
print(f"After dataprep: {len(dataset_text.df)}")

dataset_code = curation_steps_code(orig_dataset_code)
dataset_code = dataset_code.persist()
print(f"After fuzzy dedupe: {len(fuzzy_dataset_text.df)}")

print(f"Original dataset length for code files: {len(orig_dataset_code.df)}")
print(f"After dataprep: {len(dataset_code.df)}")
print(f"After fuzzy dedupe: {len(fuzzy_dataset_code.df)}")

print("Writing the results to disk...")

# Overwrite existing files in the curated directory.
out_path = os.path.join(DATA_DIR, "curated")
@@ -186,15 +201,18 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None:
shutil.rmtree(out_path)

os.makedirs(out_path)
fuzzy_dataset_text.to_json(out_path, write_to_filename=True)
fuzzy_dataset_code.to_json(out_path, write_to_filename=True)

print('Writing results to disk completed')

# Split the dataset by file category and save curated files (optional - to create blended datasets)
print('Split dataset by metadata')
separated_data_text = separate_by_metadata(
fuzzy_dataset_text.df, out_path, "category"
).compute()
separated_data_code = separate_by_metadata(
fuzzy_dataset_code.df, out_path, "category"
).compute()

client.close()
@@ -234,6 +252,7 @@ def main():
args = ArgumentHelper(parser).add_distributed_args().parse_args()
# Limit the total number of workers to ensure we don't run out of memory.
args.n_workers = min(args.n_workers, 8)
args.device = "gpu"  # Fuzzy deduplication requires a GPU (cuDF-backed) Dask cluster.
print("Args: ", args)

# Download all the sources and get the list of text and code files.
@@ -250,7 +269,9 @@
]
dataset_weights = [1.0, 4.0, 4.0, 1.0]
target_size = 20
print('Data Curation completed')
blend_and_shuffle(args, dataset_paths, dataset_weights, target_size)
print('Data Blending completed')


if __name__ == "__main__":
3 changes: 2 additions & 1 deletion tutorials/dapt-curation/code/requirements.txt
@@ -1,6 +1,7 @@
arxiv==2.1.0
arxiv-downloader
cchardet
poppler-utils
unstructured[all-docs]==0.14.5
unstructured[pdf]
nltk==3.8.1
29 changes: 27 additions & 2 deletions tutorials/dapt-curation/code/utils.py
@@ -18,7 +18,7 @@
import dask.dataframe as dd
import pandas as pd

from nemo_curator import ExactDuplicates, Modify, ScoreFilter, Sequential, FuzzyDuplicates, FuzzyDuplicatesConfig
from nemo_curator.datasets import DocumentDataset
from nemo_curator.filters import (
DocumentFilter,
@@ -39,7 +39,6 @@
from nemo_curator.utils.distributed_utils import get_client
from nemo_curator.utils.file_utils import get_all_files_paths_under


class QuotationUnifier(DocumentModifier):
"""
A simple modifier that unifies the quotation marks in the documents.
@@ -281,6 +280,32 @@ def dedupe(dataset: DocumentDataset) -> DocumentDataset:
deduped = dataset_df[~dataset_df.id.isin(duplicate_ids)]
return DocumentDataset(deduped)

def fuzzy_dedupe(dataset: DocumentDataset, type: str = 'text') -> DocumentDataset:
    """Remove near-duplicate documents using GPU-based fuzzy (MinHash + LSH) deduplication."""
cache_dir = f"./workspace/{type}"
fuzzy_dedup_config = FuzzyDuplicatesConfig(
cache_dir=cache_dir,
id_field="id",
text_field="text",
seed=42,
char_ngrams=20,
num_buckets=20,
hashes_per_bucket=13,
use_64_bit_hash=False,
buckets_per_shuffle=5,
false_positive_check=False,
num_anchors=2,
jaccard_threshold=0.8,
)
fuzzy_dup = FuzzyDuplicates(config=fuzzy_dedup_config)
duplicates = fuzzy_dup(dataset)

docs_to_remove = duplicates.df.map_partitions(lambda x: x[x.group.duplicated(keep="first")])

# When there are few duplicates we can compute the results to a list and use `isin`.
duplicate_ids = docs_to_remove.compute().id.to_arrow().to_pylist()
dataset_df = dataset.df
deduped = dataset_df[~dataset_df.id.isin(duplicate_ids)]
return DocumentDataset(deduped)
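# Example usage (illustrative only; it mirrors the calls added in main.py). Here `docs` is a
# hypothetical pandas-backed DocumentDataset; fuzzy deduplication itself requires the cuDF backend.
#
#   gpu_docs = DocumentDataset(docs.df.to_backend("cudf"))
#   deduped_docs = fuzzy_dedupe(dataset=gpu_docs, type="text")
#   print(f"Removed {len(docs.df) - len(deduped_docs.df)} near-duplicate documents")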

class TextLineCountFilter(DocumentFilter):
"""