-
Notifications
You must be signed in to change notification settings - Fork 130
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added Hellaswag HF's preprocessor logic as custom script. (#1631)
* Added pre-processor component and it's associated functionalities and methods for AML Benchmark * Added pre-processor component and it's associated functionalities and methods for AML Benchmark * Added pre-processor component and it's associated functionalities and methods for AML Benchmark * Added dataset pre-processor and inference post-processor (3P) components and it's associated functionalities and methods for AML Benchmark * Added dataset pre-processor and inference post-processor (3P) components and it's associated functionalities and methods for AML Benchmark * Added dataset pre-processor and inference post-processor (3P) components and it's associated functionalities and methods for AML Benchmark, 'safe to test' * Changed the import from test_utils instead of utils. * Added the modification to resolve the suggested the comments. * Added the modification to resolve the suggested the comments. * Updated the component version. * Added the modification and tests cases to resolve the suggested the comments. Added two new inputs to post processor component. * Added the modification and tests cases to resolve the suggested the comments. Added two new inputs to post processor component. * Added revised components and required functionalties in post-processor component to get shared as package. * Added revised components and required functionalties in post-processor component to get shared as package. * Added revised components and required functionalties in post-processor component to get shared as package. * Added revised components and required functionalties in post-processor component to get shared as package. * Added revised components and required functionalties in post-processor component to get shared as package. * branch containing changes to clone in devbox * branch containing changes to clone in devbox * Post processor component converged * Removed the unused files. * Removed the unused files. * Fixed the overwritten changes by git pull. 
* Changed the output path to comply with pipeline schema change. * Added the suggestions provided in PR review comments. * Added the suggestions provided in PR review comments. * removed unused files. * Fixed the custom script post processor method in the inference postprocessor component. * 1. Fixed custom script method call in post processor. 2. Added truthful_qa dataset custom preprocessor. * 1. Fixed custom script method call in post processor. 2. Added truthful_qa dataset custom preprocessor. * Added the review comments provided by reviewers. * Added the review comments provided by reviewers. * Added Hellaswag HF's preprocessor logic as custom script. * Added the suggestions provided by reviewer. --------- Co-authored-by: [email protected] <[email protected]>
- Loading branch information
1 parent
68c6220
commit c82da79
Showing
4 changed files
with
96 additions
and
1 deletion.
There are no files selected for viewing
83 changes: 83 additions & 0 deletions
83
assets/aml-benchmark/scripts/custom_dataset_preprocessors/hellaswag_hf.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
# --------------------------------------------------------- | ||
# Copyright (c) Microsoft Corporation. | ||
# Licensed under the MIT License. | ||
# --------------------------------------------------------- | ||
|
||
"""Custom preprocessor script for dataset:Hellaswag, source:HF.""" | ||
|
||
from typing import Any, Dict, List, Union | ||
import argparse | ||
import json | ||
import pandas as pd | ||
import re | ||
|
||
|
||
def _parse_args(): | ||
"""Parse the arguments.""" | ||
parser = argparse.ArgumentParser() | ||
parser.add_argument('--input_path', type=str, required=True) | ||
parser.add_argument('--output_path', type=str, required=True) | ||
args = parser.parse_args() | ||
return args | ||
|
||
|
||
def _read_jsonl_file(file_path: str) -> List[Dict[str, Any]]: | ||
"""Read `.jsonl` file and return a list of dictionaries.""" | ||
if not file_path.endswith(".jsonl"): | ||
mssg = f"Input file '{file_path}' is not a .jsonl file." | ||
raise ValueError(mssg) | ||
data_dicts = [] | ||
with open(file_path, 'r', encoding="utf8") as file: | ||
for i, line in enumerate(file): | ||
data_dicts.append(json.loads(line)) | ||
return data_dicts | ||
|
||
|
||
def _write_to_jsonl_file(data, file_path: str) -> None: | ||
"""Write the processed output to jsonl file.""" | ||
if isinstance(data, pd.DataFrame): | ||
data.to_json(file_path, lines=True, orient='records') | ||
return | ||
if isinstance(data, List): | ||
with open(file_path, 'w') as writer: | ||
for example in data: | ||
writer.write(json.dumps(example) + "\n") | ||
return | ||
|
||
|
||
def _run(input_path: str, output_path: str) -> None:
    """Read the dataset, apply the processor, and persist the result.

    :param input_path: Path to the input ``.jsonl`` dataset.
    :param output_path: Path where the processed ``.jsonl`` output is written.
    """
    _write_to_jsonl_file(run_processor(_read_jsonl_file(input_path)), output_path)
|
||
|
||
def preprocess(text: str) -> str:
    """Clean a HellaSwag text field.

    Strips surrounding whitespace, rewrites WikiHow ``[title]`` markers as
    sentence breaks, drops any other bracketed artifacts, and collapses
    double spaces left behind by the removals.

    :param text: Raw text from the dataset.
    :return: Cleaned text.
    """
    text = text.strip()
    # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
    text = text.replace(" [title]", ". ")
    # Remove any remaining text enclosed in square brackets (raw string
    # instead of the original double-escaped pattern).
    text = re.sub(r"\[.*?\]", "", text)
    # BUG FIX: the original called `text.replace(" ", " ")`, a no-op replacing
    # a single space with itself; the intent is to collapse double spaces.
    text = text.replace("  ", " ")
    return text
|
||
|
||
def run_processor(data: List[Dict[str, Any]]) -> Union[pd.DataFrame, List[Dict[str, Any]]]:
    """Convert raw HellaSwag records into query/choices/gold examples.

    :param data: Raw dataset rows loaded from the input ``.jsonl`` file.
    :return: A list of processed examples, one per input record.
    """
    processed = []
    for record in data:
        # Join the two context fragments, capitalizing the continuation.
        context = " ".join([record["ctx_a"], record["ctx_b"].capitalize()])
        processed.append({
            "query": preprocess(record["activity_label"] + ": " + context),
            "choices": [preprocess(option) for option in record["endings"]],
            "gold": str(record["label"]),
        })
    return processed
|
||
|
||
if __name__ == '__main__':
    # Script entry point: parse CLI paths and run the preprocessing pipeline.
    args = _parse_args()
    _run(args.input_path, args.output_path)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.