diff --git a/a.py b/a.py
index 44946d4f3..e43247d75 100644
--- a/a.py
+++ b/a.py
@@ -1,34 +1,58 @@
-from deepeval import evaluate
-from deepeval.metrics import RoleAdherenceMetric
-from deepeval.test_case import ConversationalTestCase, LLMTestCase
+from deepeval.test_case import LLMTestCase, ConversationalTestCase
+from deepeval.metrics import KnowledgeRetentionMetric
 
-test_case = ConversationalTestCase(
-    chatbot_role="You are a cheerful robot who’s always optimistic, even when things are going wrong. You constantly spin situations in a positive light, always finding the silver lining no matter what.",
-    turns=[
-        LLMTestCase(
-            input="Hey, are you there?",
-            actual_output="Hi there! Yep, I’m right here! Ready to brighten up your day!",
-        ),
-        LLMTestCase(
-            input="I lost my keys... again.",
-            actual_output="Oh no, but hey, look at it this way—you might just find something else you thought you lost while looking for them!",
-        ),
-        LLMTestCase(
-            input="Yeah, maybe... but it still sucks.",
-            actual_output="I understand, but it's a great opportunity to tidy up a bit! And who knows, those keys will probably show up when you least expect it!",
-        ),
-        LLMTestCase(
-            input="I stepped in a puddle on my way home.",
-            actual_output="Oh yikes, soggy socks! But think of it as a refreshing little foot bath! And now you’re home, warm and dry!",
-        ),
-        LLMTestCase(
-            input="Now my phone's not working.",
-            actual_output="Ugh, seriously? Your phone's broken again? That's the worst, dude! I'd chuck it out the window if I were you.",
-        ),
-    ],
-)
-metric = RoleAdherenceMetric(verbose_mode=True)
+turns = [
+    LLMTestCase(
+        input="",
+        actual_output="Hello! I'm here to assist you with opening a new bank account. To start, may I have your full name, please?",
+    ),
+    LLMTestCase(
+        input="Sure, it's Alex Johnson. I also have a dog called Jacky.",
+        actual_output="Great, Alex! Now, could you please provide your current address?",
+    ),
+    LLMTestCase(
+        input="123 Maple Street, Springfield.", actual_output="Is that all?"
+    ),
+    LLMTestCase(
+        input="I also have another address at 123 Broadway, NYC.",
+        actual_output="Wonderful. Next, I'll need your bank account number where we can link this new account.",
+    ),
+    LLMTestCase(
+        input="456789123",
+        actual_output="This account number seems invalid, can you please double-check?",
+    ),
+    LLMTestCase(
+        input="Sorry you're right, its 456789124.",
+        actual_output="Thank you. And for our next step, could you tell me your date of birth?",
+    ),
+    LLMTestCase(input="It's July 9th.", actual_output="What about the year?"),
+    LLMTestCase(
+        input="1990",
+        actual_output="Got it. Now, for security purposes, could you share your mother's maiden name?",
+    ),
+    LLMTestCase(
+        input="It's Smith.",
+        actual_output="Excellent. Just a few more details. What is your phone number?",
+    ),
+    LLMTestCase(
+        input="555-0102",
+        actual_output="Great, we're almost done. Could you remind me of your full name for the account documentation?",
+    ),
+    LLMTestCase(
+        input="Didn't I tell you already? It's Alex Johnson.",
+        actual_output="What is your bank account number?",
+    ),
+    LLMTestCase(
+        input="Yes, I did... It's 456789124. Are you not keeping track of this?",
+        actual_output="One last question, what is the country code of your phone number?",
+    ),
+    LLMTestCase(
+        input="+44",
+        actual_output="Thank you, Alex, for bearing with me. We now have all the information we need to proceed with opening your new bank account. I appreciate your cooperation and patience throughout this process.",
+    ),
+]
 
-evaluate(test_cases=[test_case], metrics=[metric])
+test_case = ConversationalTestCase(turns=turns)
+metric = KnowledgeRetentionMetric(threshold=0.5, verbose_mode=True)
 
-# metric.measure(test_case=test_case)
+metric.measure(test_case)
diff --git a/deepeval/metrics/knowledge_retention/knowledge_retention.py b/deepeval/metrics/knowledge_retention/knowledge_retention.py
index adcdbbda6..6971106bd 100644
--- a/deepeval/metrics/knowledge_retention/knowledge_retention.py
+++ b/deepeval/metrics/knowledge_retention/knowledge_retention.py
@@ -1,9 +1,14 @@
-from typing import Optional, Union, Dict, List
-from pydantic import BaseModel, Field
+from typing import Optional, Union, List
 
-from deepeval.test_case import ConversationalTestCase
+from deepeval.test_case import (
+    ConversationalTestCase,
+    LLMTestCase,
+    LLMTestCaseParams,
+)
 from deepeval.metrics import BaseConversationalMetric
 from deepeval.metrics.utils import (
+    check_conversational_test_case_params,
+    construct_verbose_logs,
     trimAndLoadJson,
     initialize_model,
 )
@@ -12,7 +17,17 @@
     KnowledgeRetentionTemplate,
 )
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.knowledge_retention.schema import *
+from deepeval.metrics.knowledge_retention.schema import (
+    Knowledge,
+    KnowledgeRetentionVerdict,
+    Reason,
+)
+from deepeval.utils import get_or_create_event_loop, prettify_list
+
+required_params: List[LLMTestCaseParams] = [
+    LLMTestCaseParams.INPUT,
+    LLMTestCaseParams.ACTUAL_OUTPUT,
+]
 
 
 class KnowledgeRetentionMetric(BaseConversationalMetric):
@@ -21,32 +36,81 @@ def __init__(
         threshold: float = 0.5,
         model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         include_reason: bool = True,
+        async_mode: bool = True,
         strict_mode: bool = False,
+        verbose_mode: bool = False,
     ):
         self.threshold = 1 if strict_mode else threshold
         self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.include_reason = include_reason
+        self.async_mode = async_mode
         self.strict_mode = strict_mode
+        self.verbose_mode = verbose_mode
 
     def measure(
         self, test_case: ConversationalTestCase, _show_indicator: bool = True
     ):
+        check_conversational_test_case_params(test_case, required_params, self)
+
+        self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(self, _show_indicator=_show_indicator):
-            self.knowledges: List[Knowledge] = self._generate_knowledges(
-                test_case
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_measure(test_case, _show_indicator=False)
+                )
+            else:
+                self.knowledges: List[Knowledge] = self._generate_knowledges(
+                    test_case.turns
+                )
+                self.verdicts: List[KnowledgeRetentionVerdict] = (
+                    self._generate_verdicts(test_case.turns)
+                )
+                self.score = self._calculate_score()
+                self.reason = self._generate_reason()
+                self.success = self.score >= self.threshold
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"Knowledges:\n{prettify_list(self.knowledges)}",
+                        f"Verdicts:\n{prettify_list(self.verdicts)}",
+                        f"Score: {self.score}\nReason: {self.reason}",
+                    ],
+                )
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: ConversationalTestCase,
+        _show_indicator: bool = True,
+    ) -> float:
+        check_conversational_test_case_params(test_case, required_params, self)
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self, async_mode=True, _show_indicator=_show_indicator
+        ):
+            self.knowledges: List[Knowledge] = (
+                await self._a_generate_knowledges(test_case.turns)
             )
             self.verdicts: List[KnowledgeRetentionVerdict] = (
-                self._generate_verdicts(test_case)
+                await self._a_generate_verdicts(test_case.turns)
+            )
+            self.score = self._calculate_score()
+            self.reason = await self._a_generate_reason()
+            self.success = self.score >= self.threshold
+            self.verbose_logs = construct_verbose_logs(
+                self,
+                steps=[
+                    f"Knowledges:\n{prettify_list(self.knowledges)}",
+                    f"Verdicts:\n{prettify_list(self.verdicts)}",
+                    f"Score: {self.score}\nReason: {self.reason}",
+                ],
             )
-            knowledge_retention_score = self._calculate_score()
-            self.reason = self._generate_reason(knowledge_retention_score)
-
-            self.success = knowledge_retention_score >= self.threshold
-            self.score = knowledge_retention_score
             return self.score
 
-    def _generate_reason(self, score: float) -> str:
+    async def _a_generate_reason(self) -> str:
         if self.include_reason is False:
             return None
 
@@ -57,33 +121,86 @@ def _generate_reason(self, score: float) -> str:
         prompt: dict = KnowledgeRetentionTemplate.generate_reason(
             attritions=attritions,
-            score=format(score, ".2f"),
+            score=format(self.score, ".2f"),
         )
 
         if self.using_native_model:
-            res, _ = self.model.generate(prompt)
+            res, cost = await self.model.a_generate(prompt)
+            self.evaluation_cost += cost
+            data = trimAndLoadJson(res, self)
+            return data["reason"]
         else:
-            res = self.model.generate(prompt)
-        return res
+            try:
+                res: Reason = await self.model.a_generate(prompt, schema=Reason)
+                return res.reason
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["reason"]
 
-    def _calculate_score(self) -> float:
-        number_of_verdicts = len(self.verdicts)
-        if number_of_verdicts == 0:
-            return 0
+    def _generate_reason(self) -> str:
+        if self.include_reason is False:
+            return None
 
-        retention_count = 0
+        attritions = []
         for verdict in self.verdicts:
-            if verdict.verdict.strip().lower() == "no":
-                retention_count += 1
+            if verdict.verdict.strip().lower() == "yes":
+                attritions.append(verdict.reason)
 
-        score = retention_count / number_of_verdicts
+        prompt: dict = KnowledgeRetentionTemplate.generate_reason(
+            attritions=attritions,
+            score=format(self.score, ".2f"),
+        )
 
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt)
+            self.evaluation_cost += cost
+            data = trimAndLoadJson(res, self)
+            return data["reason"]
+        else:
+            try:
+                res: Reason = self.model.generate(prompt, schema=Reason)
+                return res.reason
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["reason"]
-        return 0 if self.strict_mode and score < self.threshold else score
 
+    async def _a_generate_verdicts(
+        self, llm_test_cases: List[LLMTestCase]
+    ) -> List[KnowledgeRetentionVerdict]:
+        verdicts: List[KnowledgeRetentionVerdict] = []
+        for index, llm_test_case in enumerate(llm_test_cases):
+            previous_knowledge = self.knowledges[index].data
+
+            prompt = KnowledgeRetentionTemplate.generate_verdict(
+                llm_message=llm_test_case.actual_output,
+                previous_knowledge=previous_knowledge,
+            )
+            if self.using_native_model:
+                res, cost = await self.model.a_generate(prompt)
+                self.evaluation_cost += cost
+                data = trimAndLoadJson(res, self)
+                verdict = KnowledgeRetentionVerdict(index=index, **data)
+            else:
+                try:
+                    verdict: KnowledgeRetentionVerdict = (
+                        await self.model.a_generate(
+                            prompt, schema=KnowledgeRetentionVerdict
+                        )
+                    )
+                    verdict.index = index
+                except TypeError:
+                    res = await self.model.a_generate(prompt)
+                    data = trimAndLoadJson(res, self)
+                    verdict = KnowledgeRetentionVerdict(index=index, **data)
+            verdicts.append(verdict)
+
+        return verdicts
 
     def _generate_verdicts(
-        self, test_case: ConversationalTestCase
+        self, llm_test_cases: List[LLMTestCase]
     ) -> List[KnowledgeRetentionVerdict]:
         verdicts: List[KnowledgeRetentionVerdict] = []
-        for index, llm_test_case in enumerate(test_case.turns):
+        for index, llm_test_case in enumerate(llm_test_cases):
             previous_knowledge = self.knowledges[index].data
 
             prompt = KnowledgeRetentionTemplate.generate_verdict(
@@ -91,23 +208,66 @@ def _generate_verdicts(
                 previous_knowledge=previous_knowledge,
             )
             if self.using_native_model:
-                res, _ = self.model.generate(prompt)
+                res, cost = self.model.generate(prompt)
+                self.evaluation_cost += cost
+                data = trimAndLoadJson(res, self)
+                verdict = KnowledgeRetentionVerdict(index=index, **data)
             else:
-                res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            verdict = KnowledgeRetentionVerdict(index=index, **data)
+                try:
+                    verdict: KnowledgeRetentionVerdict = self.model.generate(
+                        prompt, schema=KnowledgeRetentionVerdict
+                    )
+                    verdict.index = index
+                except TypeError:
+                    res = self.model.generate(prompt)
+                    data = trimAndLoadJson(res, self)
+                    verdict = KnowledgeRetentionVerdict(index=index, **data)
             verdicts.append(verdict)
 
         return verdicts
 
+    async def _a_generate_knowledges(
+        self, llm_test_cases: List[LLMTestCase]
+    ) -> List[Knowledge]:
+        knowledges: List[Knowledge] = []
+        for index, llm_test_case in enumerate(llm_test_cases):
+            previous_knowledge = knowledges[-1].data if knowledges else {}
+            llm_turn = (
+                llm_test_cases[index - 1].actual_output if index > 0 else ""
+            )
+
+            prompt = KnowledgeRetentionTemplate.extract_data(
+                llm_message=llm_turn,
+                user_message=llm_test_case.input,
+                previous_knowledge=previous_knowledge,
+            )
+
+            if self.using_native_model:
+                res, cost = await self.model.a_generate(prompt)
+                self.evaluation_cost += cost
+                data = trimAndLoadJson(res, self)
+                knowledge: Knowledge = Knowledge(data=data)
+            else:
+                try:
+                    knowledge: Knowledge = await self.model.a_generate(
+                        prompt, schema=Knowledge
+                    )
+                except TypeError:
+                    res = await self.model.a_generate(prompt)
+                    data = trimAndLoadJson(res, self)
+                    knowledge = Knowledge(data=data)
+            knowledges.append(knowledge)
+
+        return knowledges
+
     def _generate_knowledges(
-        self, test_case: ConversationalTestCase
+        self, llm_test_cases: List[LLMTestCase]
     ) -> List[Knowledge]:
         knowledges: List[Knowledge] = []
-        for index, llm_test_case in enumerate(test_case.turns):
+        for index, llm_test_case in enumerate(llm_test_cases):
             previous_knowledge = knowledges[-1].data if knowledges else {}
             llm_turn = (
-                test_case.turns[index - 1].actual_output if index > 0 else ""
+                llm_test_cases[index - 1].actual_output if index > 0 else ""
             )
 
             prompt = KnowledgeRetentionTemplate.extract_data(
@@ -117,15 +277,37 @@ def _generate_knowledges(
             )
 
             if self.using_native_model:
-                res, _ = self.model.generate(prompt)
+                res, cost = self.model.generate(prompt)
+                self.evaluation_cost += cost
+                data = trimAndLoadJson(res, self)
+                knowledge: Knowledge = Knowledge(data=data)
             else:
-                res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            knowledge = Knowledge(data=data)
+                try:
+                    knowledge: Knowledge = self.model.generate(
+                        prompt, schema=Knowledge
+                    )
+                except TypeError:
+                    res = self.model.generate(prompt)
+                    data = trimAndLoadJson(res, self)
+                    knowledge = Knowledge(data=data)
             knowledges.append(knowledge)
 
         return knowledges
 
+    def _calculate_score(self) -> float:
+        number_of_verdicts = len(self.verdicts)
+        if number_of_verdicts == 0:
+            return 0
+
+        retention_count = 0
+        for verdict in self.verdicts:
+            if verdict.verdict.strip().lower() == "no":
+                retention_count += 1
+
+        score = retention_count / number_of_verdicts
+
+        return 0 if self.strict_mode and score < self.threshold else score
+
     def is_successful(self) -> bool:
         if self.error is not None:
             self.success = False
diff --git a/deepeval/metrics/knowledge_retention/schema.py b/deepeval/metrics/knowledge_retention/schema.py
index 8156bf25c..bcbf66e0e 100644
--- a/deepeval/metrics/knowledge_retention/schema.py
+++ b/deepeval/metrics/knowledge_retention/schema.py
@@ -1,5 +1,5 @@
 from typing import List, Dict, Union, Optional
-from pydantic import BaseModel, Field
+from pydantic import BaseModel
 
 
 class Knowledge(BaseModel):
@@ -7,6 +7,10 @@ class Knowledge(BaseModel):
 
 
 class KnowledgeRetentionVerdict(BaseModel):
-    index: int
     verdict: str
-    reason: Optional[str] = Field(default=None)
+    index: Optional[int] = None
+    reason: Optional[str] = None
+
+
+class Reason(BaseModel):
+    reason: str
diff --git a/deepeval/metrics/role_adherence/role_adherence.py b/deepeval/metrics/role_adherence/role_adherence.py
index 20a300829..0e9201e69 100644
--- a/deepeval/metrics/role_adherence/role_adherence.py
+++ b/deepeval/metrics/role_adherence/role_adherence.py
@@ -1,5 +1,4 @@
-import asyncio
-from typing import Optional, Union, Dict, List
+from typing import Optional, Union, List
 
 from deepeval.metrics import BaseConversationalMetric
 from deepeval.metrics.role_adherence.schema import (
diff --git a/docs/sidebars.js b/docs/sidebars.js
index a820ed629..c974da91c 100644
--- a/docs/sidebars.js
+++ b/docs/sidebars.js
@@ -33,23 +33,23 @@ module.exports = {
         "metrics-toxicity",
         "metrics-ragas",
         "metrics-custom",
-        {
-          type: "category",
-          label: "Multimodal Metrics",
-          items: ["metrics-viescore"],
-          collapsed: true,
-        },
         {
           type: "category",
           label: "Conversational Metrics",
           items: [
             "metrics-role-adherence",
+            "metrics-knowledge-retention",
             "metrics-conversation-completeness",
             "metrics-conversation-relevancy",
-            "metrics-knowledge-retention",
           ],
           collapsed: true,
         },
+        {
+          type: "category",
+          label: "Multimodal Metrics",
+          items: ["metrics-viescore"],
+          collapsed: true,
+        },
       ],
       collapsed: false,
     },