From 6096feb3e57d6fc0b2ef3605d669434f2f6a9f3a Mon Sep 17 00:00:00 2001
From: Jeffrey Ip
Date: Wed, 27 Nov 2024 12:13:28 +0800
Subject: [PATCH] .

---
 c.py                                          |  13 --
 .../json_correctness/json_correctness.py      | 143 ++++++++++++++----
 deepeval/metrics/json_correctness/schema.py   |   5 +
 deepeval/metrics/json_correctness/template.py |   6 +-
 4 files changed, 124 insertions(+), 43 deletions(-)
 delete mode 100644 c.py
 create mode 100644 deepeval/metrics/json_correctness/schema.py

diff --git a/c.py b/c.py
deleted file mode 100644
index 9c4ba5831..000000000
--- a/c.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from deepeval.metrics import JsonCorrectnessMetric
-
-from deepeval.metrics.faithfulness.schema import FaithfulnessVerdict, Verdicts
-from deepeval.test_case import LLMTestCase
-
-metric = JsonCorrectnessMetric(expected_schema=Verdicts, verbose_mode=True)
-
-answer = """{\n"verdicts": [\n{\n"verdict": "yes"\n},\n{\n "verdict": "no",\n "reason": "blah blah"\n},'
-    '\n{\n "verdict": "yes",\n "reason":null \n}\n]\n}"""
-
-test_case = LLMTestCase(input="...", actual_output=answer)
-
-metric.measure(test_case=test_case)
\ No newline at end of file
diff --git a/deepeval/metrics/json_correctness/json_correctness.py b/deepeval/metrics/json_correctness/json_correctness.py
index bea354aa3..ffc09ad77 100644
--- a/deepeval/metrics/json_correctness/json_correctness.py
+++ b/deepeval/metrics/json_correctness/json_correctness.py
@@ -1,4 +1,4 @@
-from typing import List, Union
+from typing import List, Optional, Union
 import json
 
 from pydantic import BaseModel, ValidationError
@@ -11,9 +11,16 @@ from deepeval.metrics.utils import (
     construct_verbose_logs,
     check_llm_test_case_params,
+    initialize_model,
+    trimAndLoadJson,
 )
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.json_correctness.template import JsonCorrectnessTemplate
+from deepeval.metrics.json_correctness.schema import Reason
+from deepeval.utils import get_or_create_event_loop
 
+DEFAULT_CORRERCT_REASON = "The generated Json matches and is syntactically correct to the expected schema."
 
 required_params: List[LLMTestCaseParams] = [
     LLMTestCaseParams.INPUT,
     LLMTestCaseParams.ACTUAL_OUTPUT,
@@ -25,14 +32,18 @@ class JsonCorrectnessMetric(BaseMetric):
     def __init__(
         self,
         expected_schema: BaseModel,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         threshold: float = 0.5,
+        async_mode: bool = True,
         include_reason: bool = True,
-        strict_mode: bool = False,
+        strict_mode: bool = True,
         verbose_mode: bool = False,
     ):
         self.threshold = 1 if strict_mode else threshold
+        self.model, self.using_native_model = initialize_model(model)
         self.include_reason = include_reason
         self.strict_mode = strict_mode
+        self.async_mode = async_mode
         self.verbose_mode = verbose_mode
         self.expected_schema = expected_schema
 
@@ -45,8 +56,49 @@ def measure(
             test_case = test_case.turns[0]
         check_llm_test_case_params(test_case, required_params, self)
 
-        self.evaluation_cost = 0
+        self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(self, _show_indicator=_show_indicator):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_measure(test_case, _show_indicator=False)
+                )
+            else:
+                valid_json = True
+                try:
+                    self.expected_schema.model_validate_json(
+                        test_case.actual_output
+                    )
+                except ValidationError as e:
+                    valid_json = False
+
+                self.score = 1 if valid_json else 0
+                self.reason = self.generate_reason(test_case.actual_output)
+                self.success = self.score >= self.threshold
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"LLM outputed Json:\n{test_case.actual_output}",
+                        # f"Expected Json Schema:\n{json.dumps(self.expected_schema.model_json_schema(), indent=4)}",
+                        f"Score: {self.score}\nReason: {self.reason}",
+                    ],
+                )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: Union[LLMTestCase, ConversationalTestCase],
+        _show_indicator: bool = True,
+    ) -> float:
+        if isinstance(test_case, ConversationalTestCase):
+            test_case = test_case.turns[0]
+        check_llm_test_case_params(test_case, required_params, self)
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self, async_mode=True, _show_indicator=_show_indicator
+        ):
             valid_json = True
             try:
                 self.expected_schema.model_validate_json(
@@ -54,43 +106,80 @@
                 )
             except ValidationError as e:
                 valid_json = False
-                if self.include_reason:
-                    self.reason = self.generate_friendly_error_message(e)
 
             self.score = 1 if valid_json else 0
+            self.reason = await self.a_generate_reason(test_case.actual_output)
             self.success = self.score >= self.threshold
             self.verbose_logs = construct_verbose_logs(
                 self,
                 steps=[
                     f"LLM outputed Json:\n{test_case.actual_output}",
-                    f"Expected Json Schema:\n{json.dumps(self.expected_schema.model_json_schema(), indent=4)}",
+                    # f"Expected Json Schema:\n{json.dumps(self.expected_schema.model_json_schema(), indent=4)}",
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
 
             return self.score
 
-    async def a_measure(
-        self,
-        test_case: Union[LLMTestCase, ConversationalTestCase],
-        _show_indicator: bool = True,
-    ) -> float:
-        return self.measure(test_case, _show_indicator=_show_indicator)
-
-    def generate_friendly_error_message(self, error: ValidationError) -> str:
-        error_messages = []
-        for err in error.errors():
-            # Extract error location, message, and type
-            loc = " -> ".join(map(str, err.get("loc", [])))
-            msg = err.get("msg", "Unknown error")
-            error_type = err.get("type", "Unknown type")
-
-            # Format each error message in a readable way
-            error_message = f"Error in '{loc}': {msg} (Type: {error_type})"
-            error_messages.append(error_message)
-
-        # Join all error messages into a single formatted string
-        return "\n".join(error_messages)
+    async def a_generate_reason(self, actual_output: str) -> str:
+        if self.include_reason is False:
+            return None
+
+        is_valid_json = self.score == 1
+        if is_valid_json:
+            return DEFAULT_CORRERCT_REASON
+
+        prompt: dict = JsonCorrectnessTemplate.generate_reason(
+            actual_output=actual_output,
+            expected_schema=json.dumps(
+                self.expected_schema.model_json_schema(), indent=4
+            ),
+            is_valid_json=is_valid_json,
+        )
+
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt)
+            self.evaluation_cost += cost
+            data = trimAndLoadJson(res, self)
+            return data["reason"]
+        else:
+            try:
+                res: Reason = await self.model.a_generate(prompt, schema=Reason)
+                return res.reason
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["reason"]
+
+    def generate_reason(self, actual_output: str) -> str:
+        if self.include_reason is False:
+            return None
+
+        is_valid_json = self.score == 1
+        if is_valid_json:
+            return DEFAULT_CORRERCT_REASON
+
+        prompt: dict = JsonCorrectnessTemplate.generate_reason(
+            actual_output=actual_output,
+            expected_schema=json.dumps(
+                self.expected_schema.model_json_schema(), indent=4
+            ),
+            is_valid_json=is_valid_json,
+        )
+
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt)
+            self.evaluation_cost += cost
+            data = trimAndLoadJson(res, self)
+            return data["reason"]
+        else:
+            try:
+                res: Reason = self.model.generate(prompt, schema=Reason)
+                return res.reason
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["reason"]
 
     def is_successful(self) -> bool:
         if self.error is not None:
diff --git a/deepeval/metrics/json_correctness/schema.py b/deepeval/metrics/json_correctness/schema.py
new file mode 100644
index 000000000..941e00473
--- /dev/null
+++ b/deepeval/metrics/json_correctness/schema.py
@@ -0,0 +1,5 @@
+from pydantic import BaseModel
+
+
+class Reason(BaseModel):
+    reason: str
diff --git a/deepeval/metrics/json_correctness/template.py b/deepeval/metrics/json_correctness/template.py
index c4aa02868..7a5932a40 100644
--- a/deepeval/metrics/json_correctness/template.py
+++ b/deepeval/metrics/json_correctness/template.py
@@ -4,7 +4,7 @@ class JsonCorrectnessTemplate:
     @staticmethod
     def generate_reason(
-        generated_json: str, expected_schema: str, is_valid_json: bool
+        actual_output: str, expected_schema: str, is_valid_json: bool
     ):
         return f"""Based on the given generated json, generated by an LLM, and a boolean stating whether it is a valid JSON based on the expected json schema, give a reason why it is OR is not a valid Json.
     "reason": "The generated Json is because ."
 }}
 
-If the json is not a valid one, your reason MUST compare `Expected Json Schema` and `Generated Json` in your reason.
+If the json is not a valid one, your reason MUST compare `Expected Json Schema` and `Generated Json` in your reason. Keep it SHORT and CONCISE while being very FACTUAL and ACTIONABLE.
 
 **
 
 Generated Json:
-{generated_json}
+{actual_output}
 
 Expected Json Schema:
 {expected_schema}
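
For reference, below is a minimal usage sketch of the JsonCorrectnessMetric API after this patch, mirroring the deleted c.py script. The ExampleSchema model, the "gpt-4o" model string, and the sample output are illustrative assumptions and not part of the patch; async_mode=False exercises the new synchronous path, and a valid output returns the default reason without an extra LLM call.

    from pydantic import BaseModel

    from deepeval.metrics import JsonCorrectnessMetric
    from deepeval.test_case import LLMTestCase


    # Illustrative expected schema; any pydantic BaseModel subclass works.
    class ExampleSchema(BaseModel):
        name: str
        age: int


    metric = JsonCorrectnessMetric(
        expected_schema=ExampleSchema,
        model="gpt-4o",    # assumption: any model name string or DeepEvalBaseLLM instance
        async_mode=False,  # run the synchronous validation path added in this patch
        include_reason=True,
        verbose_mode=True,
    )

    test_case = LLMTestCase(
        input="Return the user as JSON.",
        actual_output='{"name": "Ada", "age": 36}',  # illustrative LLM output
    )

    metric.measure(test_case=test_case)
    print(metric.score, metric.reason)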