Update the evaluation method to pass the dataset item to the scoring method (#698)

* Updated evaluate function
jverre authored Nov 22, 2024
1 parent 66f0985 commit 399b5cc
Showing 83 changed files with 2,496 additions and 16,720 deletions.
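The change standardizes one pattern across the cookbook examples: the evaluation task returns descriptive keys, and a new `scoring_key_mapping` argument on `evaluate` tells each scoring metric which dataset-item and task-output fields feed its `score()` arguments. A minimal sketch of the pattern, assuming the `opik` SDK as of this commit (the dataset name, task body, and the choice of the `Equals` metric are illustrative assumptions, not taken from the diff):

```python
from typing import Dict

from opik import Opik
from opik.evaluation import evaluate
from opik.evaluation.metrics import Equals

client = Opik()
dataset = client.get_dataset(name="my-eval-dataset")  # hypothetical dataset name


def evaluation_task(x: Dict):
    # The task receives the dataset item and returns descriptive keys
    # instead of the generic "output"/"reference".
    return {"hallucination_score": "PASS"}


res = evaluate(
    dataset=dataset,
    task=evaluation_task,
    scoring_metrics=[Equals()],
    # Map each score() argument name to a key from the merged
    # dataset item + task output.
    scoring_key_mapping={
        "output": "hallucination_score",              # produced by the task
        "reference": "expected_hallucination_label",  # stored on the dataset item
    },
)
```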

[File 1: hallucination metric evaluation notebook (.ipynb)]

@@ -26,7 +26,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install opik pyarrow fsspec huggingface_hub --upgrade"
+    "%pip install opik pyarrow fsspec huggingface_hub --upgrade --quiet"
    ]
   },
   {
@@ -51,7 +51,7 @@
   },
   {
    "cell_type": "code",
-    "execution_count": null,
+    "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -98,8 +98,8 @@
     "    {\n",
     "        \"input\": x[\"question\"],\n",
     "        \"context\": [x[\"passage\"]],\n",
-    "        \"output\": x[\"answer\"],\n",
-    "        \"expected_output\": x[\"label\"],\n",
+    "        \"llm_output\": x[\"answer\"],\n",
+    "        \"expected_hallucination_label\": x[\"label\"],\n",
     "    }\n",
     "    for x in df.to_dict(orient=\"records\")\n",
     "]\n",
@@ -139,7 +139,7 @@
     "    metric = Hallucination()\n",
     "    try:\n",
     "        metric_score = metric.score(\n",
-    "            input=x[\"input\"], context=x[\"context\"], output=x[\"output\"]\n",
+    "            input=x[\"input\"], context=x[\"context\"], output=x[\"llm_output\"]\n",
     "        )\n",
     "        hallucination_score = metric_score.value\n",
     "        hallucination_reason = metric_score.reason\n",
@@ -149,9 +149,8 @@
     "        hallucination_reason = str(e)\n",
     "\n",
     "    return {\n",
-    "        \"output\": \"FAIL\" if hallucination_score == 1 else \"PASS\",\n",
+    "        \"hallucination_score\": \"FAIL\" if hallucination_score == 1 else \"PASS\",\n",
     "        \"hallucination_reason\": hallucination_reason,\n",
-    "        \"reference\": x[\"expected_output\"],\n",
     "    }\n",
     "\n",
     "\n",
@@ -174,6 +173,10 @@
     "    task=evaluation_task,\n",
     "    scoring_metrics=[check_hallucinated_metric],\n",
     "    experiment_config=experiment_config,\n",
+    "    scoring_key_mapping={\n",
+    "        \"reference\": \"expected_hallucination_label\",\n",
+    "        \"output\": \"hallucination_score\",\n",
+    "    },\n",
     ")"
    ]
   },

[File 2: hallucination metric evaluation guide (.md)]

@@ -10,7 +10,7 @@ For this guide we will be evaluating the Hallucination metric included in the LL

 ```python
-%pip install opik pyarrow fsspec huggingface_hub --upgrade
+%pip install opik pyarrow fsspec huggingface_hub --upgrade --quiet
 ```


@@ -60,8 +60,8 @@ dataset_records = [
     {
         "input": x["question"],
         "context": [x["passage"]],
-        "output": x["answer"],
-        "expected_output": x["label"],
+        "llm_output": x["answer"],
+        "expected_hallucination_label": x["label"],
     }
     for x in df.to_dict(orient="records")
 ]
@@ -92,7 +92,7 @@ def evaluation_task(x: Dict):
     metric = Hallucination()
     try:
         metric_score = metric.score(
-            input=x["input"], context=x["context"], output=x["output"]
+            input=x["input"], context=x["context"], output=x["llm_output"]
         )
         hallucination_score = metric_score.value
         hallucination_reason = metric_score.reason
@@ -102,9 +102,8 @@ def evaluation_task(x: Dict):
         hallucination_reason = str(e)

     return {
-        "output": "FAIL" if hallucination_score == 1 else "PASS",
-        "hallucination_reason": hallucination_reason,
-        "reference": x["expected_output"],
+        "hallucination_score": "FAIL" if hallucination_score == 1 else "PASS",
+        "hallucination_reason": hallucination_reason
     }


@@ -127,6 +126,7 @@ res = evaluate(
     task=evaluation_task,
     scoring_metrics=[check_hallucinated_metric],
     experiment_config=experiment_config,
+    scoring_key_mapping={"reference": "expected_hallucination_label", "output": "hallucination_score"},
 )
 ```

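Under the new scheme, `evaluate` merges each dataset item with the task output and renames keys via `scoring_key_mapping` so they line up with each metric's `score()` parameters. A hypothetical helper sketching that resolution (the function and its exact behavior are inferred from the diff, not copied from the SDK internals):

```python
from typing import Any, Dict


def resolve_score_kwargs(
    dataset_item: Dict[str, Any],
    task_output: Dict[str, Any],
    scoring_key_mapping: Dict[str, str],
) -> Dict[str, Any]:
    # Hypothetical: merge the dataset item with the task output, then
    # rename keys to match the score() argument names.
    merged = {**dataset_item, **task_output}
    return {arg: merged[key] for arg, key in scoring_key_mapping.items()}


kwargs = resolve_score_kwargs(
    dataset_item={"expected_hallucination_label": "FAIL"},
    task_output={"hallucination_score": "FAIL"},
    scoring_key_mapping={
        "reference": "expected_hallucination_label",
        "output": "hallucination_score",
    },
)
assert kwargs == {"reference": "FAIL", "output": "FAIL"}
# The metric is then called as metric.score(**kwargs).
```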

[File 3: moderation metric evaluation notebook (.ipynb)]

@@ -51,7 +51,7 @@
   },
   {
    "cell_type": "code",
-    "execution_count": 1,
+    "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -73,7 +73,7 @@
   },
   {
    "cell_type": "code",
-    "execution_count": 4,
+    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -142,6 +142,7 @@
     "from opik.evaluation.metrics.llm_judges.moderation.template import generate_query\n",
     "from typing import Dict\n",
     "\n",
+    "\n",
     "# Define the evaluation task\n",
     "def evaluation_task(x: Dict):\n",
     "    metric = Moderation()\n",
@@ -157,10 +158,8 @@
     "    moderation_score = \"moderated\" if metric_score.value > 0.5 else \"not_moderated\"\n",
     "\n",
     "    return {\n",
-    "        \"output\": moderation_score,\n",
+    "        \"moderation_score\": moderation_score,\n",
     "        \"moderation_reason\": moderation_reason,\n",
-    "        \"reference\": x[\"expected_output\"],\n",
     "    }\n",
     "\n",
     "\n",
@@ -181,6 +180,7 @@
     "    task=evaluation_task,\n",
     "    scoring_metrics=[moderation_metric],\n",
     "    experiment_config=experiment_config,\n",
+    "    scoring_key_mapping={\"reference\": \"expected_output\", \"output\": \"moderation_score\"},\n",
     ")"
    ]
   },

[File 4: moderation metric evaluation guide (.md)]

@@ -110,10 +110,8 @@ def evaluation_task(x: Dict):
     moderation_score = "moderated" if metric_score.value > 0.5 else "not_moderated"

     return {
-        "output": moderation_score,
+        "moderation_score": moderation_score,
         "moderation_reason": moderation_reason,
-        "reference": x["expected_output"],
     }


@@ -134,6 +132,7 @@ res = evaluate(
     task=evaluation_task,
     scoring_metrics=[moderation_metric],
     experiment_config=experiment_config,
+    scoring_key_mapping={"reference": "expected_output", "output": "moderation_score"},
 )
 ```

[The remaining 79 of the 83 changed files are not shown.]
