Merge pull request #945 from confident-ai/release-v0.21.75
Fix tool correctness
penguine-ip authored Aug 9, 2024
2 parents 1b48c66 + 8c139a0 commit fe2f6f1
Showing 4 changed files with 89 additions and 29 deletions.
75 changes: 51 additions & 24 deletions deepeval/metrics/tool_correctness/tool_correctness.py
@@ -1,14 +1,13 @@
from typing import Set, List, Union
from typing import List

from deepeval.utils import prettify_list
from deepeval.utils import prettify_list, get_lcs
from deepeval.metrics.utils import (
construct_verbose_logs,
check_llm_test_case_params,
)
from deepeval.test_case import (
LLMTestCase,
LLMTestCaseParams,
ConversationalTestCase,
)
from deepeval.metrics import BaseMetric

@@ -27,28 +26,28 @@ def __init__(
include_reason: bool = True,
strict_mode: bool = False,
verbose_mode: bool = False,
should_exact_match: bool = False,
should_consider_ordering: bool = False,
):
self.threshold = 1 if strict_mode else threshold
self.include_reason = include_reason
self.strict_mode = strict_mode
self.verbose_mode = verbose_mode
self.should_exact_match = should_exact_match
self.should_consider_ordering = should_consider_ordering

def measure(self, test_case: LLMTestCase) -> float:
check_llm_test_case_params(test_case, required_params, self)

self.tools_used: Set[str] = set(test_case.tools_used)
self.expected_tools: Set[str] = set(test_case.expected_tools)
self.expected_tools_used = self.tools_used.intersection(
self.expected_tools
)
self.tools_used: List[str] = test_case.tools_used
self.expected_tools: List[str] = test_case.expected_tools
self.score = self._calculate_score()
self.reason = self._generate_reason()
self.success = self.score >= self.threshold
self.verbose_logs = construct_verbose_logs(
self,
steps=[
f"Expected Tools:\n{prettify_list(list(self.expected_tools))}",
f"Expected Tools Used:\n{prettify_list(list(self.expected_tools_used))}",
f"Expected Tools:\n{prettify_list(self.expected_tools)}",
f"Tools Used:\n{prettify_list(self.tools_used)}",
f"Score: {self.score}\nReason: {self.reason}",
],
)
@@ -58,23 +57,51 @@ async def a_measure(self, test_case: LLMTestCase) -> float:
return self.measure(test_case)

def _generate_reason(self):
reason = f"The score is {self.score} because {len(self.expected_tools_used)} out of {len(self.expected_tools)} expected tools were used. "
tools_unused = list(self.expected_tools - self.expected_tools_used)
if len(tools_unused) > 0:
reason += f""
reason += (
f"Tool {tools_unused} was "
if len(tools_unused) == 1
else f"Tools {tools_unused} were "
if self.should_exact_match:
return f"{'Exact match' if self.tools_used == self.expected_tools else 'Not an exact match'}: expected {self.expected_tools}, called {self.tools_used}."

elif self.should_consider_ordering:
lcs = get_lcs(self.expected_tools, self.tools_used)
missing = set(self.expected_tools) - set(self.tools_used)
out_of_order = set(self.expected_tools) - set(lcs)

if len(lcs) == len(self.expected_tools):
return f"Correct ordering: all expected tools {self.expected_tools} were called in the correct order."
else:
issues = []
if missing:
issues.append(f"missing tools {list(missing)}")
if out_of_order:
issues.append(f"out-of-order tools {list(out_of_order)}")
return f"Incorrect tool usage: {' and '.join(issues)}; expected {self.expected_tools}, called {self.tools_used}."

else:
used_expected = set(self.tools_used).intersection(
set(self.expected_tools)
)
reason += "expected but not used."
missing = set(self.expected_tools) - used_expected

return reason
if len(used_expected) == len(self.expected_tools):
return f"All expected tools {self.expected_tools} were called (order not considered)."
else:
return f"Incomplete tool usage: missing tools {list(missing)}; expected {self.expected_tools}, called {self.tools_used}."

def _calculate_score(self):
number_of_expected_tools_used = len(self.expected_tools_used)
number_of_expected_tools = len(self.expected_tools)
score = number_of_expected_tools_used / number_of_expected_tools
if self.should_exact_match:
return 1.0 if self.tools_used == self.expected_tools else 0.0

elif self.should_consider_ordering:
longest_common_subsequence = get_lcs(
self.expected_tools, self.tools_used
)
score = len(longest_common_subsequence) / len(self.expected_tools)

else:
used_expected_tools = set(self.tools_used).intersection(
set(self.expected_tools)
)
score = len(used_expected_tools) / len(self.expected_tools)

return 0 if self.strict_mode and score < self.threshold else score

def is_successful(self) -> bool:
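For reference, a minimal sketch (not part of the diff) of how the three scoring paths in `_calculate_score` above compare on a single call sequence, assuming tools are plain strings as in this PR:

```python
# Illustrative only: mirrors the logic of _calculate_score above.
expected = ["WebSearch", "ToolQuery", "WebSearch"]
used = ["WebSearch", "WebSearch"]

# should_exact_match=True: all-or-nothing list equality.
exact_score = 1.0 if used == expected else 0.0                  # -> 0.0

# should_consider_ordering=True: longest common subsequence length over the
# number of expected tool calls; get_lcs(expected, used) == ["WebSearch", "WebSearch"].
ordering_score = 2 / len(expected)                              # -> 2/3

# Default: distinct expected tools called at least once, over expected tool calls.
default_score = len(set(used) & set(expected)) / len(expected)  # -> 1/3
```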
27 changes: 27 additions & 0 deletions deepeval/utils.py
@@ -20,6 +20,33 @@
from deepeval.key_handler import KeyValues, KEY_FILE_HANDLER


def get_lcs(seq1, seq2):
m, n = len(seq1), len(seq2)
dp = [[0] * (n + 1) for _ in range(m + 1)]

for i in range(1, m + 1):
for j in range(1, n + 1):
if seq1[i - 1] == seq2[j - 1]:
dp[i][j] = dp[i - 1][j - 1] + 1
else:
dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])

# Reconstruct the LCS
lcs = []
i, j = m, n
while i > 0 and j > 0:
if seq1[i - 1] == seq2[j - 1]:
lcs.append(seq1[i - 1])
i -= 1
j -= 1
elif dp[i - 1][j] > dp[i][j - 1]:
i -= 1
else:
j -= 1

return lcs[::-1]


def camel_to_snake(name: str) -> str:
s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()
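A quick sanity check of the `get_lcs` helper added above (illustrative; the input sequences are made up):

```python
from deepeval.utils import get_lcs

# The unique longest common subsequence of these two sequences is ["b", "d", "e"].
assert get_lcs(["a", "b", "c", "d", "e"], ["b", "d", "e", "f"]) == ["b", "d", "e"]

# The case the tool correctness metric cares about: order-preserving overlap of tool calls.
assert get_lcs(["WebSearch", "ToolQuery", "WebSearch"],
               ["WebSearch", "WebSearch"]) == ["WebSearch", "WebSearch"]
```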
4 changes: 2 additions & 2 deletions docs/docs/metrics-introduction.mdx
@@ -22,9 +22,9 @@ In `deepeval`, a metric serves as a standard of measurement for evaluating the p

`deepeval` also offers conversational metrics, which are metrics used to evaluate conversations instead of individual, granular LLM interactions. These include:

- Knowledge Retention
- Conversation Completeness
- Conversation Relevancy
- Conversation Coherence
- Knowledge Retention

You can also easily develop your own custom evaluation metrics in `deepeval`. All metrics are measured on a test case. Visit the [test cases section](evaluation-test-cases) to learn how to apply any metric on test cases for evaluation.

12 changes: 9 additions & 3 deletions docs/docs/metrics-tool-correctness.mdx
@@ -31,9 +31,9 @@ from deepeval.test_case import LLMTestCase
metric = ToolCorrectnessMetric()
test_case = LLMTestCase(
input="What if these shoes don't fit?",
actual_output="We offer a 30-day full refund at no extra cost."
actual_output="We offer a 30-day full refund at no extra cost.",
# Replace this with the tools that were actually used by your LLM agent
tools_used=["WebSearch"]
tools_used=["WebSearch"],
expected_tools=["WebSearch", "ToolQuery"]
)

@@ -48,6 +48,12 @@ There are four optional parameters when creating a `ToolCorrectnessMetric`:
- [Optional] `include_reason`: a boolean which when set to `True`, will include a reason for its evaluation score. Defaulted to `True`.
- [Optional] `strict_mode`: a boolean which when set to `True`, enforces a binary metric score: 1 for perfection, 0 otherwise. It also overrides the current threshold and sets it to 1. Defaulted to `False`.
- [Optional] `verbose_mode`: a boolean which when set to `True`, prints the intermediate steps used to calculate said metric to the console, as outlined in the [How Is It Calculated](#how-is-it-calculated) section. Defaulted to `False`.
- [Optional] `should_consider_ordering`: a boolean which when set to `True`, will consider the order in which the tools were called. For example, if `expected_tools=["WebSearch", "ToolQuery", "WebSearch"]` and `tools_used=["WebSearch", "WebSearch"]`, the metric will consider the tool calling to be correct. Defaulted to `False`.
- [Optional] `should_exact_match`: a boolean which when set to `True`, will require `tools_used` and `expected_tools` to be exactly the same. Defaulted to `False`. A short usage sketch of both flags follows the note below.

:::note
Since `should_exact_match` is a stricter criterion than `should_consider_ordering`, setting `should_consider_ordering` has no effect when `should_exact_match` is set to `True`.
:::
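A sketch of how the two new flags might be used with the test case from the earlier snippet (the values are illustrative, and `ToolCorrectnessMetric` is assumed to be importable from `deepeval.metrics` as in the existing docs example):

```python
from deepeval.metrics import ToolCorrectnessMetric
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra cost.",
    tools_used=["WebSearch", "WebSearch"],
    expected_tools=["WebSearch", "ToolQuery", "WebSearch"],
)

# Order-aware: scored by longest common subsequence, here 2 of 3 expected calls -> 2/3.
ordering_metric = ToolCorrectnessMetric(should_consider_ordering=True)
ordering_metric.measure(test_case)

# Exact match: the two lists differ, so the score is 0.
exact_metric = ToolCorrectnessMetric(should_exact_match=True)
exact_metric.measure(test_case)
```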

## How Is It Calculated?

@@ -62,4 +68,4 @@ The **tool correctness metric** score is calculated according to the following e
"
/>

This metric assesses the accuracy of your agent's tool usage by comparing the `tools_used` by your LLM agent to the list of `expected_tools`. A score of 1 indicates that every tool utilized by your LLM agent matches the expected tools, while a score of 0 signifies that none of the used tools were among the expected tools.
This metric assesses the accuracy of your agent's tool usage by comparing the `tools_used` by your LLM agent to the list of `expected_tools`. A score of 1 indicates that every tool used by your LLM agent was called correctly according to the list of `expected_tools`, `should_consider_ordering`, and `should_exact_match`, while a score of 0 signifies that none of the `tools_used` were called correctly.
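Since the `<Equation />` formula itself sits outside the diff context shown above, here is a sketch of what the three modes reduce to, read from `_calculate_score` in this PR (notation mine, not the rendered docs formula):

```latex
% Default (neither flag set): distinct expected tools that were called,
% over the number of expected tool calls.
\text{score} = \frac{\left|\,\mathrm{set}(\texttt{tools\_used}) \cap \mathrm{set}(\texttt{expected\_tools})\,\right|}{\mathrm{len}(\texttt{expected\_tools})}

% should_consider_ordering=True: longest common subsequence instead of set overlap.
\text{score} = \frac{\mathrm{len}\left(\mathrm{LCS}(\texttt{expected\_tools},\ \texttt{tools\_used})\right)}{\mathrm{len}(\texttt{expected\_tools})}

% should_exact_match=True: all-or-nothing list equality.
\text{score} = \begin{cases} 1 & \text{if } \texttt{tools\_used} = \texttt{expected\_tools} \\ 0 & \text{otherwise} \end{cases}
```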
