Merge pull request #945 from confident-ai/release-v0.21.75
Fix tool correctness
penguine-ip authored Aug 9, 2024
2 parents 1b48c66 + 8c139a0 commit fe2f6f1
Showing 4 changed files with 89 additions and 29 deletions.
75 changes: 51 additions & 24 deletions deepeval/metrics/tool_correctness/tool_correctness.py
@@ -1,14 +1,13 @@
from typing import Set, List, Union
from typing import List

from deepeval.utils import prettify_list
from deepeval.utils import prettify_list, get_lcs
from deepeval.metrics.utils import (
construct_verbose_logs,
check_llm_test_case_params,
)
from deepeval.test_case import (
LLMTestCase,
LLMTestCaseParams,
ConversationalTestCase,
)
from deepeval.metrics import BaseMetric

@@ -27,28 +26,28 @@ def __init__(
include_reason: bool = True,
strict_mode: bool = False,
verbose_mode: bool = False,
should_exact_match: bool = False,
should_consider_ordering: bool = False,
):
self.threshold = 1 if strict_mode else threshold
self.include_reason = include_reason
self.strict_mode = strict_mode
self.verbose_mode = verbose_mode
self.should_exact_match = should_exact_match
self.should_consider_ordering = should_consider_ordering

def measure(self, test_case: LLMTestCase) -> float:
check_llm_test_case_params(test_case, required_params, self)

self.tools_used: Set[str] = set(test_case.tools_used)
self.expected_tools: Set[str] = set(test_case.expected_tools)
self.expected_tools_used = self.tools_used.intersection(
self.expected_tools
)
self.tools_used: List[str] = test_case.tools_used
self.expected_tools: List[str] = test_case.expected_tools
self.score = self._calculate_score()
self.reason = self._generate_reason()
self.success = self.score >= self.threshold
self.verbose_logs = construct_verbose_logs(
self,
steps=[
f"Expected Tools:\n{prettify_list(list(self.expected_tools))}",
f"Expected Tools Used:\n{prettify_list(list(self.expected_tools_used))}",
f"Expected Tools:\n{prettify_list(self.expected_tools)}",
f"Tools Used:\n{prettify_list(self.tools_used)}",
f"Score: {self.score}\nReason: {self.reason}",
],
)
@@ -58,23 +57,51 @@ async def a_measure(self, test_case: LLMTestCase) -> float:
return self.measure(test_case)

def _generate_reason(self):
reason = f"The score is {self.score} because {len(self.expected_tools_used)} out of {len(self.expected_tools)} expected tools were used. "
tools_unused = list(self.expected_tools - self.expected_tools_used)
if len(tools_unused) > 0:
reason += f""
reason += (
f"Tool {tools_unused} was "
if len(tools_unused) == 1
else f"Tools {tools_unused} were "
if self.should_exact_match:
return f"{'Exact match' if self.tools_used == self.expected_tools else 'Not an exact match'}: expected {self.expected_tools}, called {self.tools_used}."

elif self.should_consider_ordering:
lcs = get_lcs(self.expected_tools, self.tools_used)
missing = set(self.expected_tools) - set(self.tools_used)
out_of_order = set(self.expected_tools) - set(lcs)

if len(lcs) == len(self.expected_tools):
return f"Correct ordering: all expected tools {self.expected_tools} were called in the correct order."
else:
issues = []
if missing:
issues.append(f"missing tools {list(missing)}")
if out_of_order:
issues.append(f"out-of-order tools {list(out_of_order)}")
return f"Incorrect tool usage: {' and '.join(issues)}; expected {self.expected_tools}, called {self.tools_used}."

else:
used_expected = set(self.tools_used).intersection(
set(self.expected_tools)
)
reason += "expected but not used."
missing = set(self.expected_tools) - used_expected

return reason
if len(used_expected) == len(self.expected_tools):
return f"All expected tools {self.expected_tools} were called (order not considered)."
else:
return f"Incomplete tool usage: missing tools {list(missing)}; expected {self.expected_tools}, called {self.tools_used}."

def _calculate_score(self):
number_of_expected_tools_used = len(self.expected_tools_used)
number_of_expected_tools = len(self.expected_tools)
score = number_of_expected_tools_used / number_of_expected_tools
if self.should_exact_match:
return 1.0 if self.tools_used == self.expected_tools else 0.0

elif self.should_consider_ordering:
longest_common_subsequence = get_lcs(
self.expected_tools, self.tools_used
)
score = len(longest_common_subsequence) / len(self.expected_tools)

else:
used_expected_tools = set(self.tools_used).intersection(
set(self.expected_tools)
)
score = len(used_expected_tools) / len(self.expected_tools)

return 0 if self.strict_mode and score < self.threshold else score

def is_successful(self) -> bool:
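For reference, a minimal sketch (not part of the diff) of how the three scoring paths in `_calculate_score` above compare on a single call sequence, assuming tools are plain strings as in this PR:

```python
# Illustrative only: mirrors the logic of _calculate_score above.
expected = ["WebSearch", "ToolQuery", "WebSearch"]
used = ["WebSearch", "WebSearch"]

# should_exact_match=True: all-or-nothing list equality.
exact_score = 1.0 if used == expected else 0.0                  # -> 0.0

# should_consider_ordering=True: longest common subsequence length over the
# number of expected tool calls; get_lcs(expected, used) == ["WebSearch", "WebSearch"].
ordering_score = 2 / len(expected)                              # -> 2/3

# Default: distinct expected tools called at least once, over expected tool calls.
default_score = len(set(used) & set(expected)) / len(expected)  # -> 1/3
```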
27 changes: 27 additions & 0 deletions deepeval/utils.py
@@ -20,6 +20,33 @@
from deepeval.key_handler import KeyValues, KEY_FILE_HANDLER


def get_lcs(seq1, seq2):
m, n = len(seq1), len(seq2)
dp = [[0] * (n + 1) for _ in range(m + 1)]

for i in range(1, m + 1):
for j in range(1, n + 1):
if seq1[i - 1] == seq2[j - 1]:
dp[i][j] = dp[i - 1][j - 1] + 1
else:
dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])

# Reconstruct the LCS
lcs = []
i, j = m, n
while i > 0 and j > 0:
if seq1[i - 1] == seq2[j - 1]:
lcs.append(seq1[i - 1])
i -= 1
j -= 1
elif dp[i - 1][j] > dp[i][j - 1]:
i -= 1
else:
j -= 1

return lcs[::-1]


def camel_to_snake(name: str) -> str:
s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()
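A quick sanity check of the `get_lcs` helper added above (illustrative; the input sequences are made up):

```python
from deepeval.utils import get_lcs

# The unique longest common subsequence of these two sequences is ["b", "d", "e"].
assert get_lcs(["a", "b", "c", "d", "e"], ["b", "d", "e", "f"]) == ["b", "d", "e"]

# The case the tool correctness metric cares about: order-preserving overlap of tool calls.
assert get_lcs(["WebSearch", "ToolQuery", "WebSearch"],
               ["WebSearch", "WebSearch"]) == ["WebSearch", "WebSearch"]
```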
4 changes: 2 additions & 2 deletions docs/docs/metrics-introduction.mdx
@@ -22,9 +22,9 @@ In `deepeval`, a metric serves as a standard of measurement for evaluating the p

`deepeval` also offers conversational metrics, which are metrics used to evaluate conversations instead of individual, granular LLM interactions. These include:

- Knowledge Retention
- Conversation Completeness
- Conversation Relevancy
- Conversation Coherence
- Knowledge Retention

You can also easily develop your own custom evaluation metrics in `deepeval`. All metrics are measured on a test case. Visit the [test cases section](evaluation-test-cases) to learn how to apply any metric on test cases for evaluation.

12 changes: 9 additions & 3 deletions docs/docs/metrics-tool-correctness.mdx
@@ -31,9 +31,9 @@ from deepeval.test_case import LLMTestCase
metric = ToolCorrectnessMetric()
test_case = LLMTestCase(
input="What if these shoes don't fit?",
actual_output="We offer a 30-day full refund at no extra cost."
actual_output="We offer a 30-day full refund at no extra cost.",
# Replace this with the tools that were actually used by your LLM agent
tools_used=["WebSearch"]
tools_used=["WebSearch"],
expected_tools=["WebSearch", "ToolQuery"]
)

@@ -48,6 +48,12 @@ There are four optional parameters when creating a `ToolCorrectnessMetric`:
- [Optional] `include_reason`: a boolean which when set to `True`, will include a reason for its evaluation score. Defaulted to `True`.
- [Optional] `strict_mode`: a boolean which when set to `True`, enforces a binary metric score: 1 for perfection, 0 otherwise. It also overrides the current threshold and sets it to 1. Defaulted to `False`.
- [Optional] `verbose_mode`: a boolean which when set to `True`, prints the intermediate steps used to calculate said metric to the console, as outlined in the [How Is It Calculated](#how-is-it-calculated) section. Defaulted to `False`.
- [Optional] `should_consider_ordering`: a boolean which when set to `True`, will consider the order in which the tools were called. For example, if `expected_tools=["WebSearch", "ToolQuery", "WebSearch"]` and `tools_used=["WebSearch", "WebSearch"]`, the metric will consider the tool calling to be correct. Defaulted to `False`.
- [Optional] `should_exact_match`: a boolean which when set to `True`, will require `tools_used` and `expected_tools` to be exactly the same. Defaulted to `False`. A short usage sketch of both flags follows the note below.

:::note
Since `should_exact_match` is a stricter criterion than `should_consider_ordering`, setting `should_consider_ordering` has no effect when `should_exact_match` is set to `True`.
:::
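A sketch of how the two new flags might be used with the test case from the earlier snippet (the values are illustrative, and `ToolCorrectnessMetric` is assumed to be importable from `deepeval.metrics` as in the existing docs example):

```python
from deepeval.metrics import ToolCorrectnessMetric
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra cost.",
    tools_used=["WebSearch", "WebSearch"],
    expected_tools=["WebSearch", "ToolQuery", "WebSearch"],
)

# Order-aware: scored by longest common subsequence, here 2 of 3 expected calls -> 2/3.
ordering_metric = ToolCorrectnessMetric(should_consider_ordering=True)
ordering_metric.measure(test_case)

# Exact match: the two lists differ, so the score is 0.
exact_metric = ToolCorrectnessMetric(should_exact_match=True)
exact_metric.measure(test_case)
```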

## How Is It Calculated?

@@ -62,4 +68,4 @@ The **tool correctness metric** score is calculated according to the following e
"
/>

This metric assesses the accuracy of your agent's tool usage by comparing the `tools_used` by your LLM agent to the list of `expected_tools`. A score of 1 indicates that every tool utilized by your LLM agent matches the expected tools, while a score of 0 signifies that none of the used tools were among the expected tools.
This metric assesses the accuracy of your agent's tool usage by comparing the `tools_used` by your LLM agent to the list of `expected_tools`. A score of 1 indicates that every tool used by your LLM agent was called correctly according to the list of `expected_tools`, `should_consider_ordering`, and `should_exact_match`, while a score of 0 signifies that none of the `tools_used` were called correctly.
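Since the `<Equation />` formula itself sits outside the diff context shown above, here is a sketch of what the three modes reduce to, read from `_calculate_score` in this PR (notation mine, not the rendered docs formula):

```latex
% Default (neither flag set): distinct expected tools that were called,
% over the number of expected tool calls.
\text{score} = \frac{\left|\,\mathrm{set}(\texttt{tools\_used}) \cap \mathrm{set}(\texttt{expected\_tools})\,\right|}{\mathrm{len}(\texttt{expected\_tools})}

% should_consider_ordering=True: longest common subsequence instead of set overlap.
\text{score} = \frac{\mathrm{len}\left(\mathrm{LCS}(\texttt{expected\_tools},\ \texttt{tools\_used})\right)}{\mathrm{len}(\texttt{expected\_tools})}

% should_exact_match=True: all-or-nothing list equality.
\text{score} = \begin{cases} 1 & \text{if } \texttt{tools\_used} = \texttt{expected\_tools} \\ 0 & \text{otherwise} \end{cases}
```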
