From a634d6716d523be9371f1146411c02a308b98d36 Mon Sep 17 00:00:00 2001
From: Kamil Tagowski
Date: Thu, 24 Aug 2023 11:16:39 +0200
Subject: [PATCH] feat(qa_evaluator): Improve QA evaluator

---
 embeddings/evaluator/evaluation_results.py         |  2 ++
 .../evaluator/question_answering_evaluator.py      | 21 +++++++++++--
 .../pipeline/lightning_question_answering.py       | 17 +++++++++++
 ...uestion_answering_output_transformation.py      |  2 +-
 embeddings/utils/utils.py                          | 30 +++++++++++++++++++
 5 files changed, 69 insertions(+), 3 deletions(-)

diff --git a/embeddings/evaluator/evaluation_results.py b/embeddings/evaluator/evaluation_results.py
index ef160b18..b8cc08ed 100644
--- a/embeddings/evaluator/evaluation_results.py
+++ b/embeddings/evaluator/evaluation_results.py
@@ -110,3 +110,5 @@ class QuestionAnsweringEvaluationResults(EvaluationResults):
     NoAns_f1: Optional[float] = None
     NoAns_total: Optional[float] = None
     data: Optional[Data] = None
+    golds_text: Optional[Union[List[List[str]], List[str]]] = None
+    predictions_text: Optional[List[str]] = None
diff --git a/embeddings/evaluator/question_answering_evaluator.py b/embeddings/evaluator/question_answering_evaluator.py
index c37b90d1..69614be1 100644
--- a/embeddings/evaluator/question_answering_evaluator.py
+++ b/embeddings/evaluator/question_answering_evaluator.py
@@ -26,9 +26,22 @@ def __init__(self, no_answer_threshold: float = 1.0):
 
     def metrics(
         self,
-    ) -> Dict[str, Metric[Union[List[Any], nptyping.NDArray[Any], torch.Tensor], Dict[Any, Any]]]:
+    ) -> Dict[str, Metric[Union[List[Any], nptyping.NDArray[Any], torch.Tensor], Dict[Any, Any]],]:
         return {}
 
+    @staticmethod
+    def get_golds_text(references: List[QA_GOLD_ANSWER_TYPE]) -> Union[List[List[str]], List[str]]:
+        golds_text = []
+        for ref in references:
+            answers = ref["answers"]
+            assert isinstance(answers, dict)
+            golds_text.append(answers["text"])
+        return golds_text
+
+    @staticmethod
+    def get_predictions_text(predictions: List[QA_PREDICTED_ANSWER_TYPE]) -> List[str]:
+        return [str(it["prediction_text"]) for it in predictions]
+
     def evaluate(
         self, data: Union[Dict[str, nptyping.NDArray[Any]], Predictions, Dict[str, Any]]
     ) -> QuestionAnsweringEvaluationResults:
@@ -51,5 +64,9 @@ def evaluate(
             {"id": it_id, **it["predicted_answer"]} for it_id, it in enumerate(outputs)
         ]
         metrics = SQUADv2Metric().calculate(predictions=predictions, references=references)
+        gold_texts = QuestionAnsweringEvaluator.get_golds_text(references)
+        predictions_text = QuestionAnsweringEvaluator.get_predictions_text(predictions)
 
-        return QuestionAnsweringEvaluationResults(data=outputs, **metrics)
+        return QuestionAnsweringEvaluationResults(
+            data=outputs, golds_text=gold_texts, predictions_text=predictions_text, **metrics
+        )
diff --git a/embeddings/pipeline/lightning_question_answering.py b/embeddings/pipeline/lightning_question_answering.py
index e5a9e2f0..2681b989 100644
--- a/embeddings/pipeline/lightning_question_answering.py
+++ b/embeddings/pipeline/lightning_question_answering.py
@@ -2,6 +2,8 @@
 from typing import Any, Dict, List, Optional, Union
 
 import datasets
+import pandas as pd
+import yaml
 from pytorch_lightning.accelerators import Accelerator
 
 from embeddings.config.lightning_config import LightningQABasicConfig, LightningQAConfig
@@ -14,6 +16,7 @@
 from embeddings.pipeline.lightning_pipeline import LightningPipeline
 from embeddings.task.lightning_task import question_answering as qa
 from embeddings.utils.loggers import LightningLoggingConfig
+from embeddings.utils.utils import convert_qa_df_to_bootstrap_html
 
 
 class LightningQuestionAnsweringPipeline(
@@ -86,3 +89,17 @@ def __init__(
             logging_config,
             pipeline_kwargs=pipeline_kwargs,
         )
+
+    def _save_metrics(self) -> None:
+        metrics = getattr(self.result, "metrics")
+        with open(self.output_path / "metrics.yaml", "w") as f:
+            yaml.dump(metrics, stream=f)
+
+        predictions_text = getattr(self.result, "predictions_text")
+        golds_text = getattr(self.result, "golds_text")
+        with open(self.output_path / "predictions.html", "w") as f:
+            f.write(
+                convert_qa_df_to_bootstrap_html(
+                    pd.DataFrame({"predictions": predictions_text, "golds": golds_text})
+                )
+            )
diff --git a/embeddings/transformation/lightning_transformation/question_answering_output_transformation.py b/embeddings/transformation/lightning_transformation/question_answering_output_transformation.py
index 5a5cff05..46be5952 100644
--- a/embeddings/transformation/lightning_transformation/question_answering_output_transformation.py
+++ b/embeddings/transformation/lightning_transformation/question_answering_output_transformation.py
@@ -127,7 +127,7 @@ def _get_predicted_text_from_context(
     def _get_softmax_scores_with_sort(predictions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         scores = torch.from_numpy(np.array([pred.pop("score") for pred in predictions]))
         # Module torch.functional does not explicitly export attritube "F"
-        softmax_scores = torch.functional.F.softmax(scores)  # type: ignore[attr-defined]
+        softmax_scores = torch.functional.F.softmax(scores, dim=0)  # type: ignore[attr-defined]
         for prob, pred in zip(softmax_scores, predictions):
             pred["softmax_score"] = prob
         # mypy thinks the function only returns Any
diff --git a/embeddings/utils/utils.py b/embeddings/utils/utils.py
index 923cd487..0bb8774f 100644
--- a/embeddings/utils/utils.py
+++ b/embeddings/utils/utils.py
@@ -9,6 +9,7 @@
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
+import pandas as pd
 import pkg_resources
 import requests
 import yaml
@@ -152,3 +153,32 @@ def compress_and_remove(filepath: T_path) -> None:
         ) as arc:
             arc.write(filepath, arcname=filepath.name)
         filepath.unlink()
+
+
+def convert_qa_df_to_bootstrap_html(df: pd.DataFrame) -> str:
+    boostrap_cdn = (
+        ''
+    )
+
+    output = (
+        ""
+        + "\n"
+        + ""
+        + "\n"
+        + ""
+        + "\n"
+        + boostrap_cdn
+        + "\n"
+        + ''
+        + "\n"
+        + ""
+        + "\n"
+        + ""
+        + "\n"
+        + df.to_html(classes=["table table-bordered table-striped table-hover"])
+        + "\n"
+        + ""
+    )
+    assert isinstance(output, str)
+    return output
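
Note: the HTML string literals inside convert_qa_df_to_bootstrap_html were stripped when this patch was captured, leaving only empty "" placeholders in the hunk above; the exact tags and the Bootstrap CDN URL used in the commit are unknown. The sketch below is not part of the patch, only a plausible reconstruction of what the helper builds, assuming a generic Bootstrap 5 stylesheet link and a minimal html/head/body wrapper around the rendered table.

    # Hypothetical reconstruction -- not the committed code. The CDN URL and the
    # wrapper tags are assumptions; only the table classes come from the patch.
    import pandas as pd


    def convert_qa_df_to_bootstrap_html(df: pd.DataFrame) -> str:
        # Assumed stylesheet link; any Bootstrap 5 CSS build served from a CDN would do.
        boostrap_cdn = (
            '<link rel="stylesheet" '
            'href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.1/dist/css/bootstrap.min.css">'
        )

        # Wrap the DataFrame's HTML table in a minimal document so the Bootstrap
        # classes passed to to_html() are actually styled when the file is opened.
        output = (
            "<!DOCTYPE html>"
            + "\n"
            + "<html>"
            + "\n"
            + "<head>"
            + "\n"
            + boostrap_cdn
            + "\n"
            + '<meta charset="utf-8">'
            + "\n"
            + "</head>"
            + "\n"
            + "<body>"
            + "\n"
            + df.to_html(classes=["table table-bordered table-striped table-hover"])
            + "\n"
            + "</body></html>"
        )
        return output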