Skip to content

Commit

Permalink
Changes in curator for handling empty values
Browse files Browse the repository at this point in the history
Signed-off-by: tanishq-ids <[email protected]>
  • Loading branch information
tanishq-ids committed Oct 22, 2024
1 parent 0559e04 commit c5ecb81
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 42 deletions.
2 changes: 1 addition & 1 deletion demo/curation/local_cuartion_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class AnnotationData(BaseModel):
extract_json=annotation_data.extract_json,
kpi_mapping_path=annotation_data.kpi_mapping_path,
create_neg_samples=True,
neg_pos_ratio=1,
neg_sample_rate=1,
).create_curator_df()

curator.to_csv(output_file_path_main, index=False)
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import re
from typing import List, Tuple, Optional
import pandas as pd
from Levenshtein import distance as levenshtein_distance
from pathlib import Path
from pydantic import BaseModel, FilePath

Expand All @@ -28,7 +29,8 @@ class Curator:
annotation_folder (str): path to the folder containing annotation files
extract_json (str): path to the JSON file containing extracted content
kpi_mapping_path (str): path to KPI Mapping csv
neg_pos_ratio (int): ratio of negative to positive examples
neg_sample_rate (int): number of negative samples to positive examples
create_neg_samples (bool): whether to create negative samples
"""
Expand All @@ -38,15 +40,15 @@ def __init__(
annotation_folder: str,
extract_json: Path,
kpi_mapping_path: str,
neg_pos_ratio: int = 1,
neg_sample_rate: int = 1,
create_neg_samples: bool = False,
) -> None:
"""Initialize the constructor for Curator object."""
self.annotation_folder = annotation_folder
self.extract_json = extract_json
self.json_file_name = os.path.basename(extract_json).replace("_output", "")
self.kpi_mapping_path = kpi_mapping_path
self.neg_pos_ratio = neg_pos_ratio
self.neg_sample_rate = neg_sample_rate
self.create_neg_samples = create_neg_samples

self.pdf_content = self.load_pdf_content()
Expand Down Expand Up @@ -105,7 +107,8 @@ def create_pos_examples(
) -> Tuple[List[Tuple[Optional[str], str]], bool]:
"""Create positive examples based on the provided row from a DataFrame.
Returns a list of matching sentences or an empty list, along with a flag indicating if sentences were found in the JSON.
Returns a list of matching sentences or an empty list, along with a flag
indicating if sentences were found in the JSON.
"""
value: str = row["relevant_paragraphs"]
cleaned_value: str = self.clean_text(value)
Expand All @@ -128,9 +131,11 @@ def create_pos_examples(
return ([(None, "")], False) # Return with in_json_flag as False

source_page = str(row["source_page"])

match = re.search(r"\d+", source_page)
page_number = match.group() if match else None
if match:
page_number = match.group()
else:
return ([(None, "")], False)

if page_number in self.pdf_content:
matching_sentences = [
Expand All @@ -140,19 +145,43 @@ def create_pos_examples(
if any(sentence in para for sentence in sentences)
]

# Flag to know if sentence is available in json or not
in_json_flag = bool(matching_sentences)
return (
matching_sentences
if matching_sentences
else [(None, sentence) for sentence in sentences],
in_json_flag,
)
if matching_sentences:
# If matching sentences found, return them with in_json_flag as True
return matching_sentences, True

# If no exact match found, find the closest paragraph and its ID
closest_para, para_id = self._get_closest_paragraph(sentences, page_number)

if closest_para:
return [(para_id, closest_para)], True

return ([(None, "")], False) # Return with in_json_flag as False
# If no relevant paragraph found, return with in_json_flag as False
return ([(None, "")], False)

def _get_closest_paragraph(
    self, sentences: List[str], page_number: str
) -> Tuple[Optional[str], Optional[str]]:
    """Return the paragraph on *page_number* nearest to any target sentence.

    Nearness is measured by Levenshtein edit distance between each target
    sentence and each candidate paragraph on the page. The winning
    paragraph is returned together with its unique paragraph ID; if the
    page holds no paragraphs or *sentences* is empty, (None, None) is
    returned.
    """
    best_para: Optional[str] = None
    best_id: Optional[str] = None
    best_dist = float("inf")

    # Scan every paragraph on the page; the first strict improvement wins,
    # so ties keep the earliest-seen paragraph (same as a plain min-scan).
    page_entries = self.pdf_content[page_number]
    for para_id in page_entries:
        candidate = page_entries[para_id]["paragraph"]

        # Compare this candidate against every target sentence.
        for target in sentences:
            current = levenshtein_distance(target, candidate)
            if current < best_dist:
                best_dist = current
                best_para = candidate
                best_id = para_id  # remember the unique paragraph ID

    return best_para, best_id

def create_neg_examples(self, row: pd.Series) -> List[str]:
"""Create negative examples based on the provided row from a DataFrame.
"""Create negative examples excluding relevant paragraphs or close ones.
Returns a list of context paragraphs or an empty list.
"""
Expand All @@ -164,13 +193,32 @@ def create_neg_examples(self, row: pd.Series) -> List[str]:
):
return [""]

# Flatten all paragraphs from the PDF content
paragraphs = [
self.pdf_content[key_outer][key_inner]["paragraph"]
for key_outer in self.pdf_content
for key_inner in self.pdf_content[key_outer]
]

context = random.choices(paragraphs[1:], k=self.neg_pos_ratio)
relevant_paragraphs = row["relevant_paragraphs"]

# Filter out paragraphs that are identical or too similar
def is_similar(paragraph):
return any(
levenshtein_distance(paragraph, rel_para)
<= 5 # Adjust threshold if needed
for rel_para in relevant_paragraphs
)

negative_paragraphs = [p for p in paragraphs if not is_similar(p)]

# Randomly select `neg_sample_rate` paragraphs from the filtered list
context = (
random.choices(negative_paragraphs, k=self.neg_sample_rate)
if negative_paragraphs
else [""]
)

return context

def create_examples_annotate(self) -> List[pd.DataFrame]:
Expand Down Expand Up @@ -209,18 +257,21 @@ def create_examples_annotate(self) -> List[pd.DataFrame]:
(para, unique_para_id) for unique_para_id, para in pos_examples
]

contexts = [
(pos_contexts, 1),
(
[
(neg_example, None)
for neg_example in self.create_neg_examples(row.copy())
]
if self.create_neg_samples
else [("", None)],
0,
),
]
if self.create_neg_samples:
contexts = [
(pos_contexts, 1),
(
[
(neg_example, None)
for neg_example in self.create_neg_examples(row.copy())
]
if self.create_neg_samples
else [("", None)],
0,
),
]
else:
contexts = [(pos_contexts, 1)]

for context, label in contexts:
if (
Expand Down Expand Up @@ -278,17 +329,23 @@ def create_curator_df(self) -> pd.DataFrame:
kpi_df = pd.read_csv(
self.kpi_mapping_path, usecols=["kpi_id", "question"]
)

merged_df = pd.merge(new_df, kpi_df, on="kpi_id", how="left")

result_df = merged_df.rename(columns={"answer": "annotation_answer"})

result_df.loc[result_df["label"] == 0, "in_extraction_data_flag"] = (
bool(0)
)
# result_df.loc[result_df["label"] == 0, "in_extraction_data_flag"] = (
# bool(0)
# )
result_df.loc[
result_df["in_extraction_data_flag"] == 0, "unique_paragraph_id"
] = None

result_df["annotation_file_name"] = Path(self.annotation_folder).name
result_df["annotation_file_row"] += 2
result_df = result_df[
result_df["context"].notna() & (result_df["context"] != "")
]

# Reorder columns as specified in columns_order
result_df = result_df[columns_order]
Expand Down
25 changes: 16 additions & 9 deletions src/osc_transformer_presteps/run_local_relevance_curation.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,17 +70,20 @@ def run_local_curation(
kpi_mapping_file_path: str = typer.Argument(
help="This is the path to kpi_mapping.csv file"
),
output_path: str = typer.Argument(
help="Path to directory to save the output curated file.",
),
create_neg_samples: bool = typer.Option(
False,
"--create_neg_samples",
show_default=True,
help="Boolean to declare if you want to include negative samples in your dataset.",
),
neg_pos_ratio: int = typer.Option(
neg_sample_rate: int = typer.Option(
1,
"--neg_pos_ratio",
"--neg_sample_rate",
show_default=True,
help="Ratio of number of negative samples you want per positive samples.",
help="Number of negative samples you want per positive samples.",
),
logs_folder: str = typer.Option(
default=None,
Expand Down Expand Up @@ -152,15 +155,19 @@ def resolve_path(path_name: str, cwd: Path) -> Path:
annotation_file_path=annotation_temp,
kpi_mapping_file_path=kpi_mapping_temp,
create_neg_samples=create_neg_samples,
neg_pos_ratio=neg_pos_ratio,
neg_sample_rate=neg_sample_rate,
)
curated_data.to_csv("Curated_dataset.csv", index=False)
_logger.info(
f"Added info from file {extracted_json_temp.stem}.json to the curation file."
)

elif extracted_json_temp.is_dir():
files = [f for f in extracted_json_temp.iterdir() if f.is_file()]
files = [
f
for f in extracted_json_temp.iterdir()
if f.is_file() and f.name.endswith(".json")
]
curator_df = pd.DataFrame()

for file in files:
Expand All @@ -170,13 +177,13 @@ def resolve_path(path_name: str, cwd: Path) -> Path:
annotation_file_path=annotation_temp,
kpi_mapping_file_path=kpi_mapping_temp,
create_neg_samples=create_neg_samples,
neg_pos_ratio=neg_pos_ratio,
neg_sample_rate=neg_sample_rate,
)
curator_df = pd.concat([curator_df, temp_df], ignore_index=True)
_logger.info(f"Added info from file {file.stem}.json to the curation file.")

timestamp = datetime.now().strftime("%d%m%Y_%H%M")
csv_filename = f"Curated_dataset_{timestamp}.csv"
csv_filename = Path(output_path) / f"Curated_dataset_{timestamp}.csv"
curator_df.to_csv(csv_filename, index=False)

_logger.info("Curation ended.")
Expand All @@ -187,7 +194,7 @@ def curate_one_file(
annotation_file_path: Path,
kpi_mapping_file_path: Path,
create_neg_samples: bool,
neg_pos_ratio: int,
neg_sample_rate: int,
):
"""Curate data for a given file to a given folder for a specific setting.
Expand All @@ -198,7 +205,7 @@ def curate_one_file(
extract_json=dir_extracted_json_name,
kpi_mapping_path=kpi_mapping_file_path,
create_neg_samples=create_neg_samples,
neg_pos_ratio=neg_pos_ratio,
neg_sample_rate=neg_sample_rate,
).create_curator_df()


Expand Down

0 comments on commit c5ecb81

Please sign in to comment.