
Commit

[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
pre-commit-ci[bot] committed Oct 24, 2024
1 parent 769db8c commit 914cc06
Showing 3 changed files with 39 additions and 27 deletions.
18 changes: 9 additions & 9 deletions tutorials/dapt-curation/code/main.py
@@ -30,8 +30,8 @@
    dedupe,
    filter_code,
    filter_text,
+    fuzzy_dedupe,
    redact_code,
-    fuzzy_dedupe
)

import nemo_curator as nc
@@ -175,8 +175,8 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None:
    gpu_dataset_code = DocumentDataset(dataset_code.df.to_backend("cudf"))

    print("Executing the fuzzy dedupe pipeline...")
-    fuzzy_dataset_text = fuzzy_dedupe(dataset=gpu_dataset_text, type='text')
-    fuzzy_dataset_code = fuzzy_dedupe(dataset=gpu_dataset_code, type='code')
+    fuzzy_dataset_text = fuzzy_dedupe(dataset=gpu_dataset_text, type="text")
+    fuzzy_dataset_code = fuzzy_dedupe(dataset=gpu_dataset_code, type="code")

    gpu_dataset_text = fuzzy_dataset_text.df.to_backend("pandas")
    gpu_dataset_code = fuzzy_dataset_code.df.to_backend("pandas")
@@ -204,10 +204,10 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None:
    fuzzy_dataset_text.to_json(out_path, write_to_filename=True)
    fuzzy_dataset_code.to_json(out_path, write_to_filename=True)

-    print('Writing results to disk completed')
+    print("Writing results to disk completed")

    # Split the dataset by file category and save curated files (optional - to create blended datasets)
-    print('Split dataset by metadata')
+    print("Split dataset by metadata")
    separated_data_text = separate_by_metadata(
        fuzzy_dataset_text.df, out_path, "category"
    ).compute()
@@ -252,7 +252,7 @@ def main():
    args = ArgumentHelper(parser).add_distributed_args().parse_args()
    # Limit the total number of workers to ensure we don't run out of memory.
    args.n_workers = min(args.n_workers, 8)
-    args.device='gpu'
+    args.device = "gpu"
    print("Args: ", args)

    # Download all the sources and get the list of text and code files.
@@ -269,9 +269,9 @@
    ]
    dataset_weights = [1.0, 4.0, 4.0, 1.0]
    target_size = 20
-    print('Data Curation completed')
+    print("Data Curation completed")
    blend_and_shuffle(args, dataset_paths, dataset_weights, target_size)
-    print('Data Blending completed')
+    print("Data Blending completed")


if __name__ == "__main__":
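For context on the hunks above: run_curation_pipeline moves each dataset to the cuDF backend before fuzzy deduplication and back to pandas afterwards, since the fuzzy-dedup path works on cuDF-backed Dask dataframes while the surrounding steps stay on pandas. A minimal sketch of that round trip (not part of this commit), assuming a DocumentDataset named `dataset_text` is already loaded on the pandas backend and the tutorial's fuzzy_dedupe helper from utils.py is importable:

```python
# Minimal sketch (not part of the commit): `dataset_text` is assumed to be an
# existing DocumentDataset on the pandas backend; fuzzy_dedupe is the helper
# defined in tutorials/dapt-curation/code/utils.py.
from nemo_curator.datasets import DocumentDataset

from utils import fuzzy_dedupe

# Move the underlying Dask dataframe to cuDF so fuzzy dedup runs on the GPU.
gpu_dataset_text = DocumentDataset(dataset_text.df.to_backend("cudf"))

# Drop near-duplicate documents.
fuzzy_dataset_text = fuzzy_dedupe(dataset=gpu_dataset_text, type="text")

# Return to pandas for the remaining CPU-side steps (writing, splitting, blending).
dataset_text = DocumentDataset(fuzzy_dataset_text.df.to_backend("pandas"))
```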
2 changes: 1 addition & 1 deletion tutorials/dapt-curation/code/requirements.txt
@@ -1,7 +1,7 @@
arxiv==2.1.0
arxiv-downloader
cchardet
+nltk==3.8.1
poppler-utils
unstructured[all-docs]==0.14.5
unstructured[pdf]
-nltk==3.8.1
46 changes: 29 additions & 17 deletions tutorials/dapt-curation/code/utils.py
@@ -18,7 +18,14 @@
import dask.dataframe as dd
import pandas as pd

-from nemo_curator import ExactDuplicates, Modify, ScoreFilter, Sequential, FuzzyDuplicates, FuzzyDuplicatesConfig
+from nemo_curator import (
+    ExactDuplicates,
+    FuzzyDuplicates,
+    FuzzyDuplicatesConfig,
+    Modify,
+    ScoreFilter,
+    Sequential,
+)
from nemo_curator.datasets import DocumentDataset
from nemo_curator.filters import (
    DocumentFilter,
@@ -39,6 +46,7 @@
from nemo_curator.utils.distributed_utils import get_client
from nemo_curator.utils.file_utils import get_all_files_paths_under

+
class QuotationUnifier(DocumentModifier):
    """
    A simple modifier that unifies the quotation marks in the documents.
@@ -280,33 +288,37 @@ def dedupe(dataset: DocumentDataset) -> DocumentDataset:
    deduped = dataset_df[~dataset_df.id.isin(duplicate_ids)]
    return DocumentDataset(deduped)

-def fuzzy_dedupe(dataset: DocumentDataset, type: str = 'text') -> DocumentDataset:
+
+def fuzzy_dedupe(dataset: DocumentDataset, type: str = "text") -> DocumentDataset:
    cache_dir = f"./workspace/{type}"
    fuzzy_dedup_config = FuzzyDuplicatesConfig(
-            cache_dir=cache_dir,
-            id_field="id",
-            text_field="text",
-            seed=42,
-            char_ngrams=20,
-            num_buckets=20,
-            hashes_per_bucket=13,
-            use_64_bit_hash=False,
-            buckets_per_shuffle=5,
-            false_positive_check=False,
-            num_anchors=2,
-            jaccard_threshold=0.8,
-        )
+        cache_dir=cache_dir,
+        id_field="id",
+        text_field="text",
+        seed=42,
+        char_ngrams=20,
+        num_buckets=20,
+        hashes_per_bucket=13,
+        use_64_bit_hash=False,
+        buckets_per_shuffle=5,
+        false_positive_check=False,
+        num_anchors=2,
+        jaccard_threshold=0.8,
+    )
    fuzzy_dup = FuzzyDuplicates(config=fuzzy_dedup_config)
    duplicates = fuzzy_dup(dataset)

-    docs_to_remove = duplicates.df.map_partitions(lambda x: x[x.group.duplicated(keep="first")])
+    docs_to_remove = duplicates.df.map_partitions(
+        lambda x: x[x.group.duplicated(keep="first")]
+    )

    # When there are few duplicates we can compute the results to a list and use `isin`.
-    duplicate_ids=docs_to_remove.compute().id.to_arrow().to_pylist()
+    duplicate_ids = docs_to_remove.compute().id.to_arrow().to_pylist()
    dataset_df = dataset.df
    deduped = dataset_df[~dataset_df.id.isin(duplicate_ids)]
    return DocumentDataset(deduped)

+
class TextLineCountFilter(DocumentFilter):
    """
    Discard text files based on number of lines.
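For reference, a minimal usage sketch of the fuzzy_dedupe helper reformatted above. The paths, the cuDF backend choice, and the read options are illustrative assumptions, and a GPU Dask client is assumed to be running (as main.py sets up via get_client); only fuzzy_dedupe itself comes from this file:

```python
# Sketch only (not part of the commit): input/output paths and read options are
# assumptions for illustration; fuzzy_dedupe is the helper defined above.
from nemo_curator.datasets import DocumentDataset
from nemo_curator.utils.file_utils import get_all_files_paths_under

from utils import fuzzy_dedupe

# Load JSONL documents (expects "id" and "text" fields) onto the GPU backend.
files = get_all_files_paths_under("./data/curated/")
dataset = DocumentDataset.read_json(files, backend="cudf", add_filename=True)

# MinHash-based near-duplicate removal: similar documents are grouped and only
# the first member of each duplicate group is kept, per fuzzy_dedupe above.
deduped = fuzzy_dedupe(dataset=dataset, type="text")

# Write the deduplicated documents back to JSONL.
deduped.to_json("./data/deduped/", write_to_filename=True)
```

Note that with false_positive_check=False in the config above, bucketed candidates are treated as duplicates without an exact Jaccard verification pass, trading some precision for speed.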
