Commit 3f64aa6

rename

penguine-ip committed Aug 30, 2024
1 parent be310d5 commit 3f64aa6
Showing 7 changed files with 44 additions and 43 deletions.
2 changes: 1 addition & 1 deletion deepeval/evaluate.py
@@ -146,7 +146,7 @@ def create_api_test_case(
             expectedOutput=test_case.expected_output,
             context=test_case.context,
             retrievalContext=test_case.retrieval_context,
-            toolsUsed=test_case.tools_used,
+            toolsCalled=test_case.tools_called,
             expectedTools=test_case.expected_tools,
             success=success,
             metricsData=None,
24 changes: 12 additions & 12 deletions deepeval/metrics/tool_correctness/tool_correctness.py
@@ -15,7 +15,7 @@
     required_params: List[LLMTestCaseParams] = [
         LLMTestCaseParams.INPUT,
         LLMTestCaseParams.ACTUAL_OUTPUT,
-        LLMTestCaseParams.TOOLS_USED,
+        LLMTestCaseParams.TOOLS_CALLED,
         LLMTestCaseParams.EXPECTED_TOOLS,
     ]

@@ -42,7 +42,7 @@ def measure(
     ) -> float:
         with metric_progress_indicator(self, _show_indicator=_show_indicator):
             check_llm_test_case_params(test_case, required_params, self)
-            self.tools_used: List[str] = test_case.tools_used
+            self.tools_called: List[str] = test_case.tools_called
             self.expected_tools: List[str] = test_case.expected_tools
             self.score = self._calculate_score()
             self.reason = self._generate_reason()
@@ -51,7 +51,7 @@ def measure(
                 self,
                 steps=[
                     f"Expected Tools:\n{prettify_list(self.expected_tools)}",
-                    f"Tools Used:\n{prettify_list(self.tools_used)}",
+                    f"Tools Called:\n{prettify_list(self.tools_called)}",
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
@@ -64,11 +64,11 @@ async def a_measure(

     def _generate_reason(self):
         if self.should_exact_match:
-            return f"{'Exact match' if self.tools_used == self.expected_tools else 'Not an exact match'}: expected {self.expected_tools}, called {self.tools_used}."
+            return f"{'Exact match' if self.tools_called == self.expected_tools else 'Not an exact match'}: expected {self.expected_tools}, called {self.tools_called}."

         elif self.should_consider_ordering:
-            lcs = get_lcs(self.expected_tools, self.tools_used)
-            missing = set(self.expected_tools) - set(self.tools_used)
+            lcs = get_lcs(self.expected_tools, self.tools_called)
+            missing = set(self.expected_tools) - set(self.tools_called)
             out_of_order = set(self.expected_tools) - set(lcs)

             if len(lcs) == len(self.expected_tools):
@@ -79,31 +79,31 @@ def _generate_reason(self):
                     issues.append(f"missing tools {list(missing)}")
                 if out_of_order:
                     issues.append(f"out-of-order tools {list(out_of_order)}")
-                return f"Incorrect tool usage: {' and '.join(issues)}; expected {self.expected_tools}, called {self.tools_used}."
+                return f"Incorrect tool usage: {' and '.join(issues)}; expected {self.expected_tools}, called {self.tools_called}."

         else:
-            used_expected = set(self.tools_used).intersection(
+            used_expected = set(self.tools_called).intersection(
                 set(self.expected_tools)
             )
             missing = set(self.expected_tools) - used_expected

             if len(used_expected) == len(self.expected_tools):
                 return f"All expected tools {self.expected_tools} were called (order not considered)."
             else:
-                return f"Incomplete tool usage: missing tools {list(missing)}; expected {self.expected_tools}, called {self.tools_used}."
+                return f"Incomplete tool usage: missing tools {list(missing)}; expected {self.expected_tools}, called {self.tools_called}."

     def _calculate_score(self):
         if self.should_exact_match:
-            return 1.0 if self.tools_used == self.expected_tools else 0.0
+            return 1.0 if self.tools_called == self.expected_tools else 0.0

         elif self.should_consider_ordering:
             longest_common_subsequence = get_lcs(
-                self.expected_tools, self.tools_used
+                self.expected_tools, self.tools_called
             )
             score = len(longest_common_subsequence) / len(self.expected_tools)

         else:
-            used_expected_tools = set(self.tools_used).intersection(
+            used_expected_tools = set(self.tools_called).intersection(
                 set(self.expected_tools)
             )
             score = len(used_expected_tools) / len(self.expected_tools)
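For context only (this snippet is not part of the commit): a minimal, self-contained sketch of the three scoring modes that the renamed `tools_called` field feeds into, mirroring the `_calculate_score` logic above. The helper names `lcs_length` and `tool_correctness` are hypothetical; the library itself uses `get_lcs` inside `ToolCorrectnessMetric`.

```python
from typing import List


def lcs_length(expected: List[str], called: List[str]) -> int:
    # Dynamic-programming longest-common-subsequence length, the same idea
    # the metric relies on for its ordering-aware mode.
    m, n = len(expected), len(called)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if expected[i - 1] == called[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
    return dp[m][n]


def tool_correctness(
    expected_tools: List[str],
    tools_called: List[str],
    should_exact_match: bool = False,
    should_consider_ordering: bool = False,
) -> float:
    if should_exact_match:
        # Strictest mode: the call sequence must match exactly.
        return 1.0 if tools_called == expected_tools else 0.0
    if should_consider_ordering:
        # Ordering-aware mode: longest common subsequence over expected tools.
        return lcs_length(expected_tools, tools_called) / len(expected_tools)
    # Default mode: order-insensitive overlap over expected tools.
    return len(set(tools_called) & set(expected_tools)) / len(expected_tools)


print(tool_correctness(["WebSearch", "ToolQuery"], ["WebSearch"]))  # 0.5
print(tool_correctness(["WebSearch", "ToolQuery"], ["ToolQuery", "WebSearch"],
                       should_consider_ordering=True))              # 0.5
print(tool_correctness(["WebSearch", "ToolQuery"], ["WebSearch", "ToolQuery"],
                       should_exact_match=True))                    # 1.0
```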
14 changes: 7 additions & 7 deletions deepeval/test_case/llm_test_case.py
@@ -10,7 +10,7 @@ class LLMTestCaseParams(Enum):
     EXPECTED_OUTPUT = "expected_output"
     CONTEXT = "context"
     RETRIEVAL_CONTEXT = "retrieval_context"
-    TOOLS_USED = "tools_used"
+    TOOLS_CALLED = "tools_called"
     EXPECTED_TOOLS = "expected_tools"
     REASONING = "reasoning"

@@ -24,7 +24,7 @@ class LLMTestCase:
     retrieval_context: Optional[List[str]] = None
     additional_metadata: Optional[Dict] = None
     comments: Optional[str] = None
-    tools_used: Optional[List[str]] = None
+    tools_called: Optional[List[str]] = None
     expected_tools: Optional[List[str]] = None
     reasoning: Optional[str] = None
     name: Optional[str] = field(default=None)
@@ -49,13 +49,13 @@ def __post_init__(self):
                     "'retrieval_context' must be None or a list of strings"
                 )

-        # Ensure `tools_used` is None or a list of strings
-        if self.tools_used is not None:
-            if not isinstance(self.tools_used, list) or not all(
-                isinstance(item, str) for item in self.tools_used
+        # Ensure `tools_called` is None or a list of strings
+        if self.tools_called is not None:
+            if not isinstance(self.tools_called, list) or not all(
+                isinstance(item, str) for item in self.tools_called
             ):
                 raise TypeError(
-                    "'tools_used' must be None or a list of strings"
+                    "'tools_called' must be None or a list of strings"
                 )

         # Ensure `expected_tools` is None or a list of strings
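As a quick illustration (not part of this diff) of how the renamed validation behaves, assuming the documented `from deepeval.test_case import LLMTestCase` import:

```python
from deepeval.test_case import LLMTestCase

# Valid: tools_called is a list of strings, the shape __post_init__ expects.
test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra cost.",
    tools_called=["WebSearch"],
    expected_tools=["WebSearch", "QueryDatabase"],
)

# Invalid: a bare string is rejected by the renamed check above.
try:
    LLMTestCase(
        input="Same question",
        actual_output="Same answer",
        tools_called="WebSearch",  # should be ["WebSearch"]
    )
except TypeError as err:
    print(err)  # 'tools_called' must be None or a list of strings
```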
2 changes: 1 addition & 1 deletion deepeval/test_run/api.py
@@ -22,7 +22,7 @@ class LLMApiTestCase(BaseModel):
     expected_output: Optional[str] = Field(None, alias="expectedOutput")
     context: Optional[list] = Field(None)
     retrieval_context: Optional[list] = Field(None, alias="retrievalContext")
-    tools_used: Optional[list] = Field(None, alias="toolsUsed")
+    tools_called: Optional[list] = Field(None, alias="toolsCalled")
     expected_tools: Optional[list] = Field(None, alias="expectedTools")
     # make optional, not all test cases in a conversation will be evaluated
     success: Union[bool, None] = Field(None)
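To show why the snake_case field and its camelCase alias change together here, a hedged sketch with a hypothetical stand-in model (Pydantic v2 syntax; v1 would use `.dict(by_alias=True)` instead of `.model_dump(by_alias=True)`):

```python
from typing import List, Optional

from pydantic import BaseModel, Field


class ApiPayload(BaseModel):
    # Hypothetical stand-in for LLMApiTestCase: Python code reads and writes
    # tools_called, while the wire format uses the camelCase alias.
    tools_called: Optional[List[str]] = Field(None, alias="toolsCalled")


payload = ApiPayload(toolsCalled=["WebSearch"])  # populated via the alias
print(payload.tools_called)               # ['WebSearch']
print(payload.model_dump(by_alias=True))  # {'toolsCalled': ['WebSearch']}
```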
16 changes: 8 additions & 8 deletions docs/docs/evaluation-test-cases.mdx
@@ -19,7 +19,7 @@ While a `ConversationalTestCase` is a list of messages represented by `LLMTestCa
 - [Optional] `expected_output`
 - [Optional] `context`
 - [Optional] `retrieval_context`
-- [Optional] `tools_used`
+- [Optional] `tools_called`
 - [Optional] `expected_tools`

 Here's an example implementation of a test case:
@@ -33,15 +33,15 @@ test_case = LLMTestCase(
     actual_output="We offer a 30-day full refund at no extra cost.",
     context=["All customers are eligible for a 30 day full refund at no extra cost."],
     retrieval_context=["Only shoes can be refunded."],
-    tools_used=["WebSearch"],
+    tools_called=["WebSearch"],
     expected_tools=["WebSearch", "QueryDatabase"]
 )
 ```

 :::info
 Since `deepeval` is an LLM evaluation framework, the ** `input` and `actual_output` are always mandatory.** However, this does not mean they are necessarily used for evaluation.

-Additionally, depending on the specific metric you're evaluating your test cases on, you may or may not require a `retrieval_context`, `expected_output`, `context`, `tools_used`, and/or `expected_tools` as additional parameters. For example, you won't need `expected_output`, `context`, `tools_used`, and `expected_tools` if you're just measuring answer relevancy, but if you're evaluating hallucination you'll have to provide `context` in order for `deepeval` to know what the **ground truth** is.
+Additionally, depending on the specific metric you're evaluating your test cases on, you may or may not require a `retrieval_context`, `expected_output`, `context`, `tools_called`, and/or `expected_tools` as additional parameters. For example, you won't need `expected_output`, `context`, `tools_called`, and `expected_tools` if you're just measuring answer relevancy, but if you're evaluating hallucination you'll have to provide `context` in order for `deepeval` to know what the **ground truth** is.
 :::

 ## LLM Test Case
@@ -193,9 +193,9 @@ test_case = LLMTestCase(
 Remember, `context` is the ideal retrieval results for a given input and typically come from your evaluation dataset, whereas `retrieval_context` is your LLM application's actual retrieval results. So, while they might look similar at times, they are not the same.
 :::

-### Tools Used
+### Tools Called

-The `tools_used` parameter is an **optional** parameter that represents the tools your LLM agent actually invoked during execution. By providing `tools_used`, you can evaluate how effectively your LLM agent utilized the tools available to it.
+The `tools_called` parameter is an **optional** parameter that represents the tools your LLM agent actually invoked during execution. By providing `tools_called`, you can evaluate how effectively your LLM agent utilized the tools available to it.

 ```python
 # A hypothetical LLM application example
@@ -206,12 +206,12 @@ test_case = LLMTestCase(
     input="Why did the chicken cross the road?",
     actual_output=chatbot.run(input),
     # Replace this with the tools that were actually used
-    tools_used=["WebSearch", "DatabaseQuery"]
+    tools_called=["WebSearch", "DatabaseQuery"]
 )
 ```

 :::note
-`tools_used` and `expected_tools` are LLM test case parameters that are utilized only in **agentic evaluation metrics**. These parameters allow you to assess the [tool usage correctness](metrics-tool-correctness) of your LLM application and ensure that it meets the expected tool usage standards.
+`tools_called` and `expected_tools` are LLM test case parameters that are utilized only in **agentic evaluation metrics**. These parameters allow you to assess the [tool usage correctness](metrics-tool-correctness) of your LLM application and ensure that it meets the expected tool usage standards.
 :::

 ### Expected Tools
@@ -228,7 +228,7 @@ test_case = LLMTestCase(
     input=input,
     actual_output=chatbot.run(input),
     # Replace this with the tools that were actually used
-    tools_used=["WebSearch", "DatabaseQuery"],
+    tools_called=["WebSearch", "DatabaseQuery"],
     expected_tools=["DatabaseQuery"]
 )
 ```
14 changes: 7 additions & 7 deletions docs/docs/metrics-tool-correctness.mdx
@@ -18,7 +18,7 @@ To use the `ToolCorrectnessMetric`, you'll have to provide the following argumen

 - `input`
 - `actual_output`
-- `tools_used`
+- `tools_called`
 - `expected_tools`

 ## Example
@@ -33,7 +33,7 @@ test_case = LLMTestCase(
     input="What if these shoes don't fit?",
     actual_output="We offer a 30-day full refund at no extra cost.",
     # Replace this with the tools that was actually used by your LLM agent
-    tools_used=["WebSearch"],
+    tools_called=["WebSearch"],
     expected_tools=["WebSearch", "ToolQuery"]
 )

@@ -48,8 +48,8 @@ There are four optional parameters when creating a `ToolCorrectnessMetric`:
 - [Optional] `include_reason`: a boolean which when set to `True`, will include a reason for its evaluation score. Defaulted to `True`.
 - [Optional] `strict_mode`: a boolean which when set to `True`, enforces a binary metric score: 1 for perfection, 0 otherwise. It also overrides the current threshold and sets it to 1. Defaulted to `False`.
 - [Optional] `verbose_mode`: a boolean which when set to `True`, prints the intermediate steps used to calculate said metric to the console, as outlined in the [How Is It Calculated](#how-is-it-calculated) section. Defaulted to `False`.
-- [Optional] `should_consider_ordering`: a boolean which when set to `True`, will consider the ordering in which the tools were called in. For example, if `expected_tools=["WebSearch", "ToolQuery", "WebSearch"]` and `tools_used=["WebSearch", "WebSearch"]`, the metric will consider the tool calling to be correct. Defaulted to `False`.
-- [Optional] `should_exact_match`: a boolean which when set to `True`, will required the `tools_used` and `expected_tools` to be exactly the same. Defaulted to `False`.
+- [Optional] `should_consider_ordering`: a boolean which when set to `True`, will consider the ordering in which the tools were called in. For example, if `expected_tools=["WebSearch", "ToolQuery", "WebSearch"]` and `tools_called=["WebSearch", "WebSearch"]`, the metric will consider the tool calling to be correct. Defaulted to `False`.
+- [Optional] `should_exact_match`: a boolean which when set to `True`, will required the `tools_called` and `expected_tools` to be exactly the same. Defaulted to `False`.

 :::note
 Since `should_exact_match` is a stricter criteria than `should_consider_ordering`, setting `should_consider_ordering` will have no effect when `should_exact_match` is set to `True`.
@@ -58,14 +58,14 @@ Since `should_exact_match` is a stricter criteria than `should_consider_ordering
 ## How Is It Calculated?

 :::note
-The `ToolCorrectnessMetric`, unlike all other `deepeval` metrics, are not calculated using any models or LLMs, and instead via exact matching between the `expected_tools` and `tools_used` parameters.
+The `ToolCorrectnessMetric`, unlike all other `deepeval` metrics, are not calculated using any models or LLMs, and instead via exact matching between the `expected_tools` and `tools_called` parameters.
 :::

 The **tool correctness metric** score is calculated according to the following equation:

 <Equation
-  formula="\text{Tool Correctness} = \frac{\text{Number of Correctly Used Tools}}{\text{Total Number of Tools Used}}
+  formula="\text{Tool Correctness} = \frac{\text{Number of Correctly Used Tools}}{\text{Total Number of Tools Called}}
 "
 />

-This metric assesses the accuracy of your agent's tool usage by comparing the `tools_used` by your LLM agent to the list of `expected_tools`. A score of 1 indicates that every tool utilized by your LLM agent were called correctly according to the list of `expected_tools`, `should_consider_ordering`, and `should_exact_match`, while a score of 0 signifies that none of the `tools_used` were called correctly.
+This metric assesses the accuracy of your agent's tool usage by comparing the `tools_called` by your LLM agent to the list of `expected_tools`. A score of 1 indicates that every tool utilized by your LLM agent were called correctly according to the list of `expected_tools`, `should_consider_ordering`, and `should_exact_match`, while a score of 0 signifies that none of the `tools_called` were called correctly.
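Putting the renamed pieces together, a usage sketch under the default (order-insensitive) mode — not part of this commit; the imports follow the docs' convention and the printed score assumes a plain `ToolCorrectnessMetric()`:

```python
from deepeval.metrics import ToolCorrectnessMetric
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra cost.",
    tools_called=["WebSearch"],                 # renamed from tools_used
    expected_tools=["WebSearch", "ToolQuery"],
)

metric = ToolCorrectnessMetric()
metric.measure(test_case)

# Default mode: |{WebSearch}| / |{WebSearch, ToolQuery}| = 1 / 2
print(metric.score)   # 0.5
print(metric.reason)  # notes that ToolQuery was never called
```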
15 changes: 8 additions & 7 deletions tests/test_everything.py
@@ -152,7 +152,8 @@ def test_everything():
         expected_output="this is a mocha",
         retrieval_context=["I love coffee"],
         context=["I love coffee"],
-        name="okwhatever",
+        tools_called=["ok"],
+        expected_tools=["ok", "ok"],
     )
     c_test_case = ConversationalTestCase(
         messages=[
@@ -163,19 +163,19 @@ def test_everything():
     assert_test(
         test_case,
         [
-            metric1,
-            metric2,
+            # metric1,
+            # metric2,
             # metric3,
             # metric4,
             # metric5,
             # metric6,
             # metric7,
             # metric8,
             # metric9,
-            metric10,
+            # metric10,
             # metric11,
-            metric12,
-            # metric13,
+            # metric12,
+            metric13,
         ],
         run_async=True,
     )
@@ -215,7 +215,7 @@ def test_everything_2():
         # retrieval_context=["I love coffee"],
         context=["I love coffee"],
         expected_tools=["mixer", "creamer", "dripper"],
-        tools_used=["mixer", "creamer", "mixer"],
+        tools_called=["mixer", "creamer", "mixer"],
     )
     assert_test(
         test_case,
