Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

changes in curator for handling empty value #155

Merged
merged 1 commit into from
Oct 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion demo/curation/local_cuartion_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class AnnotationData(BaseModel):
extract_json=annotation_data.extract_json,
kpi_mapping_path=annotation_data.kpi_mapping_path,
create_neg_samples=True,
neg_pos_ratio=1,
neg_sample_rate=1,
).create_curator_df()

curator.to_csv(output_file_path_main, index=False)
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import re
from typing import List, Tuple, Optional
import pandas as pd
from Levenshtein import distance as levenshtein_distance
from pathlib import Path
from pydantic import BaseModel, FilePath

Expand All @@ -28,7 +29,8 @@ class Curator:
annotation_folder (str): path to the folder containing annotation files
extract_json (str): path to the JSON file containing extracted content
kpi_mapping_path (str): path to KPI Mapping csv
neg_pos_ratio (int): ratio of negative to positive examples

neg_sample_rate (int): number of negative samples to create per positive example
create_neg_samples (bool): whether to create negative samples

"""
Expand All @@ -38,15 +40,15 @@ def __init__(
annotation_folder: str,
extract_json: Path,
kpi_mapping_path: str,
neg_pos_ratio: int = 1,
neg_sample_rate: int = 1,
create_neg_samples: bool = False,
) -> None:
"""Initialize the constructor for Curator object."""
self.annotation_folder = annotation_folder
self.extract_json = extract_json
self.json_file_name = os.path.basename(extract_json).replace("_output", "")
self.kpi_mapping_path = kpi_mapping_path
self.neg_pos_ratio = neg_pos_ratio
self.neg_sample_rate = neg_sample_rate
self.create_neg_samples = create_neg_samples

self.pdf_content = self.load_pdf_content()
Expand Down Expand Up @@ -105,7 +107,8 @@ def create_pos_examples(
) -> Tuple[List[Tuple[Optional[str], str]], bool]:
"""Create positive examples based on the provided row from a DataFrame.

Returns a list of matching sentences or an empty list, along with a flag indicating if sentences were found in the JSON.
Returns a list of matching sentences or an empty list, along with a flag
indicating if sentences were found in the JSON.
"""
value: str = row["relevant_paragraphs"]
cleaned_value: str = self.clean_text(value)
Expand All @@ -128,9 +131,11 @@ def create_pos_examples(
return ([(None, "")], False) # Return with in_json_flag as False

source_page = str(row["source_page"])

match = re.search(r"\d+", source_page)
page_number = match.group() if match else None
if match:
page_number = match.group()
else:
return ([(None, "")], False)

if page_number in self.pdf_content:
matching_sentences = [
Expand All @@ -140,19 +145,43 @@ def create_pos_examples(
if any(sentence in para for sentence in sentences)
]

# Flag to know if sentence is available in json or not
in_json_flag = bool(matching_sentences)
return (
matching_sentences
if matching_sentences
else [(None, sentence) for sentence in sentences],
in_json_flag,
)
if matching_sentences:
# If matching sentences found, return them with in_json_flag as True
return matching_sentences, True

# If no exact match found, find the closest paragraph and its ID
closest_para, para_id = self._get_closest_paragraph(sentences, page_number)

if closest_para:
return [(para_id, closest_para)], True

return ([(None, "")], False) # Return with in_json_flag as False
# If no relevant paragraph found, return with in_json_flag as False
return ([(None, "")], False)

def _get_closest_paragraph(
    self, sentences: List[str], page_number: str
) -> Tuple[Optional[str], Optional[str]]:
    """Return the paragraph on ``page_number`` nearest to any target sentence.

    Nearness is the Levenshtein edit distance between a sentence and the
    paragraph text. The result is ``(paragraph_text, paragraph_id)``, where
    ``paragraph_id`` is the paragraph's key inside
    ``self.pdf_content[page_number]``. ``(None, None)`` is returned when the
    page holds no paragraphs or ``sentences`` is empty. On equal distances
    the paragraph encountered first is kept.
    """
    page = self.pdf_content[page_number]
    best_text = None
    best_id = None
    best_score = float("inf")

    for para_id in page:
        text = page[para_id]["paragraph"]

        # Best (smallest) distance from any target sentence to this paragraph;
        # None when there are no sentences to compare against.
        score = min(
            (levenshtein_distance(sentence, text) for sentence in sentences),
            default=None,
        )

        # Strict comparison so an earlier paragraph wins a tie.
        if score is not None and score < best_score:
            best_score = score
            best_text = text
            best_id = para_id

    return best_text, best_id

def create_neg_examples(self, row: pd.Series) -> List[str]:
"""Create negative examples based on the provided row from a DataFrame.
"""Create negative examples excluding relevant paragraphs or close ones.

Returns a list of context paragraphs or an empty list.
"""
Expand All @@ -164,13 +193,32 @@ def create_neg_examples(self, row: pd.Series) -> List[str]:
):
return [""]

# Flatten all paragraphs from the PDF content
paragraphs = [
self.pdf_content[key_outer][key_inner]["paragraph"]
for key_outer in self.pdf_content
for key_inner in self.pdf_content[key_outer]
]

context = random.choices(paragraphs[1:], k=self.neg_pos_ratio)
relevant_paragraphs = row["relevant_paragraphs"]

# Filter out paragraphs that are identical or too similar
def is_similar(paragraph):
return any(
levenshtein_distance(paragraph, rel_para)
<= 5 # Adjust threshold if needed
for rel_para in relevant_paragraphs
)

negative_paragraphs = [p for p in paragraphs if not is_similar(p)]

# Randomly select `neg_sample_rate` paragraphs from the filtered list
context = (
random.choices(negative_paragraphs, k=self.neg_sample_rate)
if negative_paragraphs
else [""]
)

return context

def create_examples_annotate(self) -> List[pd.DataFrame]:
Expand Down Expand Up @@ -209,18 +257,21 @@ def create_examples_annotate(self) -> List[pd.DataFrame]:
(para, unique_para_id) for unique_para_id, para in pos_examples
]

contexts = [
(pos_contexts, 1),
(
[
(neg_example, None)
for neg_example in self.create_neg_examples(row.copy())
]
if self.create_neg_samples
else [("", None)],
0,
),
]
if self.create_neg_samples:
contexts = [
(pos_contexts, 1),
(
[
(neg_example, None)
for neg_example in self.create_neg_examples(row.copy())
]
if self.create_neg_samples
else [("", None)],
0,
),
]
else:
contexts = [(pos_contexts, 1)]

for context, label in contexts:
if (
Expand Down Expand Up @@ -278,17 +329,23 @@ def create_curator_df(self) -> pd.DataFrame:
kpi_df = pd.read_csv(
self.kpi_mapping_path, usecols=["kpi_id", "question"]
)

merged_df = pd.merge(new_df, kpi_df, on="kpi_id", how="left")

result_df = merged_df.rename(columns={"answer": "annotation_answer"})

result_df.loc[result_df["label"] == 0, "in_extraction_data_flag"] = (
bool(0)
)
# result_df.loc[result_df["label"] == 0, "in_extraction_data_flag"] = (
# bool(0)
# )
result_df.loc[
result_df["in_extraction_data_flag"] == 0, "unique_paragraph_id"
] = None

result_df["annotation_file_name"] = Path(self.annotation_folder).name
result_df["annotation_file_row"] += 2
result_df = result_df[
result_df["context"].notna() & (result_df["context"] != "")
]

# Reorder columns as specified in columns_order
result_df = result_df[columns_order]
Expand Down
25 changes: 16 additions & 9 deletions src/osc_transformer_presteps/run_local_relevance_curation.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,17 +70,20 @@ def run_local_curation(
kpi_mapping_file_path: str = typer.Argument(
help="This is the path to kpi_mapping.csv file"
),
output_path: str = typer.Argument(
help="Path to directory to save the output curated file.",
),
create_neg_samples: bool = typer.Option(
False,
"--create_neg_samples",
show_default=True,
help="Boolean to declare if you want to include negative samples in your dataset.",
),
neg_pos_ratio: int = typer.Option(
neg_sample_rate: int = typer.Option(
1,
"--neg_pos_ratio",
"--neg_sample_rate",
show_default=True,
help="Ratio of number of negative samples you want per positive samples.",
help="Number of negative samples you want per positive samples.",
),
logs_folder: str = typer.Option(
default=None,
Expand Down Expand Up @@ -152,15 +155,19 @@ def resolve_path(path_name: str, cwd: Path) -> Path:
annotation_file_path=annotation_temp,
kpi_mapping_file_path=kpi_mapping_temp,
create_neg_samples=create_neg_samples,
neg_pos_ratio=neg_pos_ratio,
neg_sample_rate=neg_sample_rate,
)
curated_data.to_csv("Curated_dataset.csv", index=False)
_logger.info(
f"Added info from file {extracted_json_temp.stem}.json to the curation file."
)

elif extracted_json_temp.is_dir():
files = [f for f in extracted_json_temp.iterdir() if f.is_file()]
files = [
f
for f in extracted_json_temp.iterdir()
if f.is_file() and f.name.endswith(".json")
]
curator_df = pd.DataFrame()

for file in files:
Expand All @@ -170,13 +177,13 @@ def resolve_path(path_name: str, cwd: Path) -> Path:
annotation_file_path=annotation_temp,
kpi_mapping_file_path=kpi_mapping_temp,
create_neg_samples=create_neg_samples,
neg_pos_ratio=neg_pos_ratio,
neg_sample_rate=neg_sample_rate,
)
curator_df = pd.concat([curator_df, temp_df], ignore_index=True)
_logger.info(f"Added info from file {file.stem}.json to the curation file.")

timestamp = datetime.now().strftime("%d%m%Y_%H%M")
csv_filename = f"Curated_dataset_{timestamp}.csv"
csv_filename = Path(output_path) / f"Curated_dataset_{timestamp}.csv"
curator_df.to_csv(csv_filename, index=False)

_logger.info("Curation ended.")
Expand All @@ -187,7 +194,7 @@ def curate_one_file(
annotation_file_path: Path,
kpi_mapping_file_path: Path,
create_neg_samples: bool,
neg_pos_ratio: int,
neg_sample_rate: int,
):
"""Curate data for a given file to a given folder for a specific setting.

Expand All @@ -198,7 +205,7 @@ def curate_one_file(
extract_json=dir_extracted_json_name,
kpi_mapping_path=kpi_mapping_file_path,
create_neg_samples=create_neg_samples,
neg_pos_ratio=neg_pos_ratio,
neg_sample_rate=neg_sample_rate,
).create_curator_df()


Expand Down
Loading