From 95c714d577c0e487f0d80727e60c6a56de085994 Mon Sep 17 00:00:00 2001 From: babu-namburi Date: Thu, 2 Jan 2025 17:45:00 +0530 Subject: [PATCH 1/2] Exception handling when reading the jsonl file. --- .../distillation/src/validate_pipeline.py | 38 +++++++++++++++++-- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/assets/training/distillation/src/validate_pipeline.py b/assets/training/distillation/src/validate_pipeline.py index 8172fdd764..0ff56cc55d 100644 --- a/assets/training/distillation/src/validate_pipeline.py +++ b/assets/training/distillation/src/validate_pipeline.py @@ -7,7 +7,7 @@ import pandas as pd import json from argparse import Namespace - +from pathlib import Path from azureml.acft.contrib.hf import VERSION, PROJECT_NAME from azureml.acft.contrib.hf.nlp.constants.constants import ( LOGS_TO_BE_FILTERED_IN_APPINSIGHTS, @@ -93,9 +93,39 @@ def __init__(self, args: Namespace) -> None: self._validate_data_generation_inputs() def _get_dataframe(self, file_path: str): - return pd.read_json( - file_path, lines=True, chunksize=self._args.request_batch_size - ) + if not Path(file_path).is_file(): + raise ACFTValidationException._with_error( + AzureMLError.create( + ACFTUserError, + pii_safe_message=( + f"File not found at {file_path}. Please provide a valid file path." + ), + ) + ) + try: + return pd.read_json( + file_path, lines=True, chunksize=self._args.request_batch_size + ) + except ValueError as e: + # If the file is not present pandas will read it as jsonl string and raises a ValueError if it is not a valid jsonl string. + # also raises value error if it is not a valid jsonl file. + raise ACFTValidationException._with_error( + AzureMLError.create( + ACFTUserError, + pii_safe_message=( + f"Error while reading JSON file. Make sure the file is a valid jsonl file. Error: {e}" + ), + ) + ) + except Exception as e: + raise ACFTValidationException._with_error( + AzureMLError.create( + ACFTUserError, + pii_safe_message=( + f"An unexpected error occurred while reading the file: {e}" + ), + ) + ) def _get_inference_request_headers(self) -> dict: key = self._args.teacher_model_endpoint_key From 3635bdb311e379fc0ec1bedca0781e7b67bdb625 Mon Sep 17 00:00:00 2001 From: babu-namburi Date: Thu, 2 Jan 2025 18:38:18 +0530 Subject: [PATCH 2/2] Fix flake errors --- assets/training/distillation/src/validate_pipeline.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/assets/training/distillation/src/validate_pipeline.py b/assets/training/distillation/src/validate_pipeline.py index 0ff56cc55d..a916ba7dea 100644 --- a/assets/training/distillation/src/validate_pipeline.py +++ b/assets/training/distillation/src/validate_pipeline.py @@ -107,7 +107,8 @@ def _get_dataframe(self, file_path: str): file_path, lines=True, chunksize=self._args.request_batch_size ) except ValueError as e: - # If the file is not present pandas will read it as jsonl string and raises a ValueError if it is not a valid jsonl string. + # If the file is not present pandas will read it as jsonl string + # raises a ValueError if it is not a valid jsonl string. # also raises value error if it is not a valid jsonl file. raise ACFTValidationException._with_error( AzureMLError.create(