Commit 3f64aa6

rename

penguine-ip committed Aug 30, 2024
1 parent be310d5 commit 3f64aa6
Showing 7 changed files with 44 additions and 43 deletions.
2 changes: 1 addition & 1 deletion deepeval/evaluate.py
@@ -146,7 +146,7 @@ def create_api_test_case(
             expectedOutput=test_case.expected_output,
             context=test_case.context,
             retrievalContext=test_case.retrieval_context,
-            toolsUsed=test_case.tools_used,
+            toolsCalled=test_case.tools_called,
             expectedTools=test_case.expected_tools,
             success=success,
             metricsData=None,
24 changes: 12 additions & 12 deletions deepeval/metrics/tool_correctness/tool_correctness.py
@@ -15,7 +15,7 @@
     required_params: List[LLMTestCaseParams] = [
         LLMTestCaseParams.INPUT,
         LLMTestCaseParams.ACTUAL_OUTPUT,
-        LLMTestCaseParams.TOOLS_USED,
+        LLMTestCaseParams.TOOLS_CALLED,
         LLMTestCaseParams.EXPECTED_TOOLS,
     ]

@@ -42,7 +42,7 @@ def measure(
     ) -> float:
         with metric_progress_indicator(self, _show_indicator=_show_indicator):
             check_llm_test_case_params(test_case, required_params, self)
-            self.tools_used: List[str] = test_case.tools_used
+            self.tools_called: List[str] = test_case.tools_called
             self.expected_tools: List[str] = test_case.expected_tools
             self.score = self._calculate_score()
             self.reason = self._generate_reason()
@@ -51,7 +51,7 @@ def measure(
                 self,
                 steps=[
                     f"Expected Tools:\n{prettify_list(self.expected_tools)}",
-                    f"Tools Used:\n{prettify_list(self.tools_used)}",
+                    f"Tools Called:\n{prettify_list(self.tools_called)}",
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
@@ -64,11 +64,11 @@ async def a_measure(

     def _generate_reason(self):
         if self.should_exact_match:
-            return f"{'Exact match' if self.tools_used == self.expected_tools else 'Not an exact match'}: expected {self.expected_tools}, called {self.tools_used}."
+            return f"{'Exact match' if self.tools_called == self.expected_tools else 'Not an exact match'}: expected {self.expected_tools}, called {self.tools_called}."

         elif self.should_consider_ordering:
-            lcs = get_lcs(self.expected_tools, self.tools_used)
-            missing = set(self.expected_tools) - set(self.tools_used)
+            lcs = get_lcs(self.expected_tools, self.tools_called)
+            missing = set(self.expected_tools) - set(self.tools_called)
             out_of_order = set(self.expected_tools) - set(lcs)

             if len(lcs) == len(self.expected_tools):
@@ -79,31 +79,31 @@ def _generate_reason(self):
                     issues.append(f"missing tools {list(missing)}")
                 if out_of_order:
                     issues.append(f"out-of-order tools {list(out_of_order)}")
-                return f"Incorrect tool usage: {' and '.join(issues)}; expected {self.expected_tools}, called {self.tools_used}."
+                return f"Incorrect tool usage: {' and '.join(issues)}; expected {self.expected_tools}, called {self.tools_called}."

         else:
-            used_expected = set(self.tools_used).intersection(
+            used_expected = set(self.tools_called).intersection(
                 set(self.expected_tools)
             )
             missing = set(self.expected_tools) - used_expected

             if len(used_expected) == len(self.expected_tools):
                 return f"All expected tools {self.expected_tools} were called (order not considered)."
             else:
-                return f"Incomplete tool usage: missing tools {list(missing)}; expected {self.expected_tools}, called {self.tools_used}."
+                return f"Incomplete tool usage: missing tools {list(missing)}; expected {self.expected_tools}, called {self.tools_called}."

     def _calculate_score(self):
         if self.should_exact_match:
-            return 1.0 if self.tools_used == self.expected_tools else 0.0
+            return 1.0 if self.tools_called == self.expected_tools else 0.0

         elif self.should_consider_ordering:
             longest_common_subsequence = get_lcs(
-                self.expected_tools, self.tools_used
+                self.expected_tools, self.tools_called
             )
             score = len(longest_common_subsequence) / len(self.expected_tools)

         else:
-            used_expected_tools = set(self.tools_used).intersection(
+            used_expected_tools = set(self.tools_called).intersection(
                 set(self.expected_tools)
             )
             score = len(used_expected_tools) / len(self.expected_tools)
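For context only (this snippet is not part of the commit): a minimal, self-contained sketch of the three scoring modes that the renamed `tools_called` field feeds into, mirroring the `_calculate_score` logic above. The helper names `lcs_length` and `tool_correctness` are hypothetical; the library itself uses `get_lcs` inside `ToolCorrectnessMetric`.

```python
from typing import List


def lcs_length(expected: List[str], called: List[str]) -> int:
    # Dynamic-programming longest-common-subsequence length, the same idea
    # the metric relies on for its ordering-aware mode.
    m, n = len(expected), len(called)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if expected[i - 1] == called[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
    return dp[m][n]


def tool_correctness(
    expected_tools: List[str],
    tools_called: List[str],
    should_exact_match: bool = False,
    should_consider_ordering: bool = False,
) -> float:
    if should_exact_match:
        # Strictest mode: the call sequence must match exactly.
        return 1.0 if tools_called == expected_tools else 0.0
    if should_consider_ordering:
        # Ordering-aware mode: longest common subsequence over expected tools.
        return lcs_length(expected_tools, tools_called) / len(expected_tools)
    # Default mode: order-insensitive overlap over expected tools.
    return len(set(tools_called) & set(expected_tools)) / len(expected_tools)


print(tool_correctness(["WebSearch", "ToolQuery"], ["WebSearch"]))  # 0.5
print(tool_correctness(["WebSearch", "ToolQuery"], ["ToolQuery", "WebSearch"],
                       should_consider_ordering=True))              # 0.5
print(tool_correctness(["WebSearch", "ToolQuery"], ["WebSearch", "ToolQuery"],
                       should_exact_match=True))                    # 1.0
```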
14 changes: 7 additions & 7 deletions deepeval/test_case/llm_test_case.py
@@ -10,7 +10,7 @@ class LLMTestCaseParams(Enum):
     EXPECTED_OUTPUT = "expected_output"
     CONTEXT = "context"
     RETRIEVAL_CONTEXT = "retrieval_context"
-    TOOLS_USED = "tools_used"
+    TOOLS_CALLED = "tools_called"
     EXPECTED_TOOLS = "expected_tools"
     REASONING = "reasoning"

@@ -24,7 +24,7 @@ class LLMTestCase:
     retrieval_context: Optional[List[str]] = None
     additional_metadata: Optional[Dict] = None
     comments: Optional[str] = None
-    tools_used: Optional[List[str]] = None
+    tools_called: Optional[List[str]] = None
     expected_tools: Optional[List[str]] = None
     reasoning: Optional[str] = None
     name: Optional[str] = field(default=None)
@@ -49,13 +49,13 @@ def __post_init__(self):
                     "'retrieval_context' must be None or a list of strings"
                 )

-        # Ensure `tools_used` is None or a list of strings
-        if self.tools_used is not None:
-            if not isinstance(self.tools_used, list) or not all(
-                isinstance(item, str) for item in self.tools_used
+        # Ensure `tools_called` is None or a list of strings
+        if self.tools_called is not None:
+            if not isinstance(self.tools_called, list) or not all(
+                isinstance(item, str) for item in self.tools_called
             ):
                 raise TypeError(
-                    "'tools_used' must be None or a list of strings"
+                    "'tools_called' must be None or a list of strings"
                 )

         # Ensure `expected_tools` is None or a list of strings
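As a quick illustration (not part of this diff) of how the renamed validation behaves, assuming the documented `from deepeval.test_case import LLMTestCase` import:

```python
from deepeval.test_case import LLMTestCase

# Valid: tools_called is a list of strings, the shape __post_init__ expects.
test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra cost.",
    tools_called=["WebSearch"],
    expected_tools=["WebSearch", "QueryDatabase"],
)

# Invalid: a bare string is rejected by the renamed check above.
try:
    LLMTestCase(
        input="Same question",
        actual_output="Same answer",
        tools_called="WebSearch",  # should be ["WebSearch"]
    )
except TypeError as err:
    print(err)  # 'tools_called' must be None or a list of strings
```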
2 changes: 1 addition & 1 deletion deepeval/test_run/api.py
@@ -22,7 +22,7 @@ class LLMApiTestCase(BaseModel):
     expected_output: Optional[str] = Field(None, alias="expectedOutput")
     context: Optional[list] = Field(None)
     retrieval_context: Optional[list] = Field(None, alias="retrievalContext")
-    tools_used: Optional[list] = Field(None, alias="toolsUsed")
+    tools_called: Optional[list] = Field(None, alias="toolsCalled")
     expected_tools: Optional[list] = Field(None, alias="expectedTools")
     # make optional, not all test cases in a conversation will be evaluated
     success: Union[bool, None] = Field(None)
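To show why the snake_case field and its camelCase alias change together here, a hedged sketch with a hypothetical stand-in model (Pydantic v2 syntax; v1 would use `.dict(by_alias=True)` instead of `.model_dump(by_alias=True)`):

```python
from typing import List, Optional

from pydantic import BaseModel, Field


class ApiPayload(BaseModel):
    # Hypothetical stand-in for LLMApiTestCase: Python code reads and writes
    # tools_called, while the wire format uses the camelCase alias.
    tools_called: Optional[List[str]] = Field(None, alias="toolsCalled")


payload = ApiPayload(toolsCalled=["WebSearch"])  # populated via the alias
print(payload.tools_called)               # ['WebSearch']
print(payload.model_dump(by_alias=True))  # {'toolsCalled': ['WebSearch']}
```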
16 changes: 8 additions & 8 deletions docs/docs/evaluation-test-cases.mdx
@@ -19,7 +19,7 @@ While a `ConversationalTestCase` is a list of messages represented by `LLMTestCa
 - [Optional] `expected_output`
 - [Optional] `context`
 - [Optional] `retrieval_context`
-- [Optional] `tools_used`
+- [Optional] `tools_called`
 - [Optional] `expected_tools`

 Here's an example implementation of a test case:
@@ -33,15 +33,15 @@ test_case = LLMTestCase(
     actual_output="We offer a 30-day full refund at no extra cost.",
     context=["All customers are eligible for a 30 day full refund at no extra cost."],
     retrieval_context=["Only shoes can be refunded."],
-    tools_used=["WebSearch"],
+    tools_called=["WebSearch"],
     expected_tools=["WebSearch", "QueryDatabase"]
 )
 ```

 :::info
 Since `deepeval` is an LLM evaluation framework, the ** `input` and `actual_output` are always mandatory.** However, this does not mean they are necessarily used for evaluation.

-Additionally, depending on the specific metric you're evaluating your test cases on, you may or may not require a `retrieval_context`, `expected_output`, `context`, `tools_used`, and/or `expected_tools` as additional parameters. For example, you won't need `expected_output`, `context`, `tools_used`, and `expected_tools` if you're just measuring answer relevancy, but if you're evaluating hallucination you'll have to provide `context` in order for `deepeval` to know what the **ground truth** is.
+Additionally, depending on the specific metric you're evaluating your test cases on, you may or may not require a `retrieval_context`, `expected_output`, `context`, `tools_called`, and/or `expected_tools` as additional parameters. For example, you won't need `expected_output`, `context`, `tools_called`, and `expected_tools` if you're just measuring answer relevancy, but if you're evaluating hallucination you'll have to provide `context` in order for `deepeval` to know what the **ground truth** is.
 :::

 ## LLM Test Case
@@ -193,9 +193,9 @@ test_case = LLMTestCase(
 Remember, `context` is the ideal retrieval results for a given input and typically come from your evaluation dataset, whereas `retrieval_context` is your LLM application's actual retrieval results. So, while they might look similar at times, they are not the same.
 :::

-### Tools Used
+### Tools Called

-The `tools_used` parameter is an **optional** parameter that represents the tools your LLM agent actually invoked during execution. By providing `tools_used`, you can evaluate how effectively your LLM agent utilized the tools available to it.
+The `tools_called` parameter is an **optional** parameter that represents the tools your LLM agent actually invoked during execution. By providing `tools_called`, you can evaluate how effectively your LLM agent utilized the tools available to it.

 ```python
 # A hypothetical LLM application example
@@ -206,12 +206,12 @@ test_case = LLMTestCase(
     input="Why did the chicken cross the road?",
     actual_output=chatbot.run(input),
     # Replace this with the tools that were actually used
-    tools_used=["WebSearch", "DatabaseQuery"]
+    tools_called=["WebSearch", "DatabaseQuery"]
 )
 ```

 :::note
-`tools_used` and `expected_tools` are LLM test case parameters that are utilized only in **agentic evaluation metrics**. These parameters allow you to assess the [tool usage correctness](metrics-tool-correctness) of your LLM application and ensure that it meets the expected tool usage standards.
+`tools_called` and `expected_tools` are LLM test case parameters that are utilized only in **agentic evaluation metrics**. These parameters allow you to assess the [tool usage correctness](metrics-tool-correctness) of your LLM application and ensure that it meets the expected tool usage standards.
 :::

 ### Expected Tools
@@ -228,7 +228,7 @@ test_case = LLMTestCase(
     input=input,
     actual_output=chatbot.run(input),
     # Replace this with the tools that were actually used
-    tools_used=["WebSearch", "DatabaseQuery"],
+    tools_called=["WebSearch", "DatabaseQuery"],
     expected_tools=["DatabaseQuery"]
 )
 ```
14 changes: 7 additions & 7 deletions docs/docs/metrics-tool-correctness.mdx
@@ -18,7 +18,7 @@ To use the `ToolCorrectnessMetric`, you'll have to provide the following argumen

 - `input`
 - `actual_output`
-- `tools_used`
+- `tools_called`
 - `expected_tools`

 ## Example
@@ -33,7 +33,7 @@ test_case = LLMTestCase(
     input="What if these shoes don't fit?",
     actual_output="We offer a 30-day full refund at no extra cost.",
     # Replace this with the tools that was actually used by your LLM agent
-    tools_used=["WebSearch"],
+    tools_called=["WebSearch"],
     expected_tools=["WebSearch", "ToolQuery"]
 )

@@ -48,8 +48,8 @@ There are four optional parameters when creating a `ToolCorrectnessMetric`:
 - [Optional] `include_reason`: a boolean which when set to `True`, will include a reason for its evaluation score. Defaulted to `True`.
 - [Optional] `strict_mode`: a boolean which when set to `True`, enforces a binary metric score: 1 for perfection, 0 otherwise. It also overrides the current threshold and sets it to 1. Defaulted to `False`.
 - [Optional] `verbose_mode`: a boolean which when set to `True`, prints the intermediate steps used to calculate said metric to the console, as outlined in the [How Is It Calculated](#how-is-it-calculated) section. Defaulted to `False`.
-- [Optional] `should_consider_ordering`: a boolean which when set to `True`, will consider the ordering in which the tools were called in. For example, if `expected_tools=["WebSearch", "ToolQuery", "WebSearch"]` and `tools_used=["WebSearch", "WebSearch"]`, the metric will consider the tool calling to be correct. Defaulted to `False`.
-- [Optional] `should_exact_match`: a boolean which when set to `True`, will required the `tools_used` and `expected_tools` to be exactly the same. Defaulted to `False`.
+- [Optional] `should_consider_ordering`: a boolean which when set to `True`, will consider the ordering in which the tools were called in. For example, if `expected_tools=["WebSearch", "ToolQuery", "WebSearch"]` and `tools_called=["WebSearch", "WebSearch"]`, the metric will consider the tool calling to be correct. Defaulted to `False`.
+- [Optional] `should_exact_match`: a boolean which when set to `True`, will required the `tools_called` and `expected_tools` to be exactly the same. Defaulted to `False`.

 :::note
 Since `should_exact_match` is a stricter criteria than `should_consider_ordering`, setting `should_consider_ordering` will have no effect when `should_exact_match` is set to `True`.
@@ -58,14 +58,14 @@ Since `should_exact_match` is a stricter criteria than `should_consider_ordering
 ## How Is It Calculated?

 :::note
-The `ToolCorrectnessMetric`, unlike all other `deepeval` metrics, are not calculated using any models or LLMs, and instead via exact matching between the `expected_tools` and `tools_used` parameters.
+The `ToolCorrectnessMetric`, unlike all other `deepeval` metrics, are not calculated using any models or LLMs, and instead via exact matching between the `expected_tools` and `tools_called` parameters.
 :::

 The **tool correctness metric** score is calculated according to the following equation:

 <Equation
-  formula="\text{Tool Correctness} = \frac{\text{Number of Correctly Used Tools}}{\text{Total Number of Tools Used}}
+  formula="\text{Tool Correctness} = \frac{\text{Number of Correctly Used Tools}}{\text{Total Number of Tools Called}}
 "
 />

-This metric assesses the accuracy of your agent's tool usage by comparing the `tools_used` by your LLM agent to the list of `expected_tools`. A score of 1 indicates that every tool utilized by your LLM agent were called correctly according to the list of `expected_tools`, `should_consider_ordering`, and `should_exact_match`, while a score of 0 signifies that none of the `tools_used` were called correctly.
+This metric assesses the accuracy of your agent's tool usage by comparing the `tools_called` by your LLM agent to the list of `expected_tools`. A score of 1 indicates that every tool utilized by your LLM agent were called correctly according to the list of `expected_tools`, `should_consider_ordering`, and `should_exact_match`, while a score of 0 signifies that none of the `tools_called` were called correctly.
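Putting the renamed pieces together, a usage sketch under the default (order-insensitive) mode — not part of this commit; the imports follow the docs' convention and the printed score assumes a plain `ToolCorrectnessMetric()`:

```python
from deepeval.metrics import ToolCorrectnessMetric
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra cost.",
    tools_called=["WebSearch"],                 # renamed from tools_used
    expected_tools=["WebSearch", "ToolQuery"],
)

metric = ToolCorrectnessMetric()
metric.measure(test_case)

# Default mode: |{WebSearch}| / |{WebSearch, ToolQuery}| = 1 / 2
print(metric.score)   # 0.5
print(metric.reason)  # notes that ToolQuery was never called
```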
15 changes: 8 additions & 7 deletions tests/test_everything.py
@@ -152,7 +152,8 @@ def test_everything():
         expected_output="this is a mocha",
         retrieval_context=["I love coffee"],
         context=["I love coffee"],
-        name="okwhatever",
+        tools_called=["ok"],
+        expected_tools=["ok", "ok"],
     )
     c_test_case = ConversationalTestCase(
         messages=[
@@ -163,19 +163,19 @@ def test_everything():
     assert_test(
         test_case,
         [
-            metric1,
-            metric2,
+            # metric1,
+            # metric2,
             # metric3,
             # metric4,
             # metric5,
             # metric6,
             # metric7,
             # metric8,
             # metric9,
-            metric10,
+            # metric10,
             # metric11,
-            metric12,
-            # metric13,
+            # metric12,
+            metric13,
         ],
         run_async=True,
     )
@@ -215,7 +215,7 @@ def test_everything_2():
         # retrieval_context=["I love coffee"],
         context=["I love coffee"],
         expected_tools=["mixer", "creamer", "dripper"],
-        tools_used=["mixer", "creamer", "mixer"],
+        tools_called=["mixer", "creamer", "mixer"],
     )
     assert_test(
         test_case,
