
Commit

Update to docs (#230)
jverre authored Sep 12, 2024
1 parent b93b8b8 commit 2b90c0c
Showing 19 changed files with 457 additions and 196 deletions.
@@ -134,7 +134,12 @@
"source": [
"## Evaluating the hallucination metric\n",
"\n",
"We can use the Opik SDK to compute a hallucination score for each item in the dataset:"
"In order to evaluate the performance of the Opik hallucination metric, we will define:\n",
"\n",
"- Evaluation task: Our evaluation task will use the data in the Dataset to return a hallucination score computed using the Opik hallucination metric.\n",
"- Scoring metric: We will use the `Equals` metric to check if the hallucination score computed matches the expected output.\n",
"\n",
"By defining the evaluation task in this way, we will be able to understand how well Opik's hallucination metric is able to detect hallucinations in the dataset."
]
},
{
@@ -143,26 +148,11 @@
"metadata": {},
"outputs": [],
"source": [
"from opik.evaluation.metrics import Hallucination\n",
"from opik.evaluation.metrics import Hallucination, Equals\n",
"from opik.evaluation import evaluate\n",
"from opik.evaluation.metrics import base_metric, score_result\n",
"from opik import Opik, DatasetItem\n",
"import pandas as pd\n",
"\n",
"client = Opik()\n",
"\n",
"class CheckHallucinated(base_metric.BaseMetric):\n",
" def __init__(self, name: str):\n",
" self.name = name\n",
"\n",
" def score(self, hallucination_score, expected_hallucination_score, **kwargs):\n",
" return score_result.ScoreResult(\n",
" value= None if hallucination_score is None else hallucination_score == expected_hallucination_score,\n",
" name=self.name,\n",
" reason=f\"Got the hallucination score of {hallucination_score} and expected {expected_hallucination_score}\",\n",
" scoring_failed=hallucination_score is None\n",
" )\n",
"\n",
"# Define the evaluation task\n",
"def evaluation_task(x: DatasetItem):\n",
" metric = Hallucination()\n",
" try:\n",
@@ -179,18 +169,23 @@
" hallucination_reason = str(e)\n",
" \n",
" return {\n",
" \"hallucination_score\": \"FAIL\" if hallucination_score == 1 else \"PASS\",\n",
" \"output\": \"FAIL\" if hallucination_score == 1 else \"PASS\",\n",
" \"hallucination_reason\": hallucination_reason,\n",
" \"expected_hallucination_score\": x.expected_output[\"expected_output\"]\n",
" \"reference\": x.expected_output[\"expected_output\"]\n",
" }\n",
"\n",
"# Get the dataset\n",
"client = Opik()\n",
"dataset = client.get_dataset(name=\"HaluBench\")\n",
"\n",
"# Define the scoring metric\n",
"check_hallucinated_metric = Equals(name=\"Correct hallucination score\")\n",
"\n",
"res = evaluate(\n",
" experiment_name=\"Check Comet Metric\",\n",
" experiment_name=\"Evaluate Opik hallucination metric\",\n",
" dataset=dataset,\n",
" task=evaluation_task,\n",
" scoring_metrics=[CheckHallucinated(name=\"Detected hallucination\")]\n",
" scoring_metrics=[check_hallucinated_metric]\n",
")"
]
},
@@ -13,8 +13,10 @@ For this guide we will be evaluating the Hallucination metric included in the LL
import os
import getpass

os.environ["OPIK_API_KEY"] = getpass.getpass("Opik API Key: ")
os.environ["OPIK_WORKSPACE"] = input("Comet workspace (often the same as your username): ")
if "OPIK_API_KEY" not in os.environ:
os.environ["OPIK_API_KEY"] = getpass.getpass("Opik API Key: ")
if "OPIK_WORKSPACE" not in os.environ:
os.environ["OPIK_WORKSPACE"] = input("Comet workspace (often the same as your username): ")
```

If you are running the Opik platform locally, simply set:
@@ -31,16 +33,16 @@ First, we will install the necessary libraries, configure the OpenAI API key and


```python
%pip install pyarrow fsspec huggingface_hub --quiet
%pip install opik pyarrow fsspec huggingface_hub --upgrade --quiet
```


```python
# Configure OpenAI
import os
import getpass

os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")
if "OPENAI_API_KEY" not in os.environ:
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")
```

We will be using the [HaluBench dataset](https://huggingface.co/datasets/PatronusAI/HaluBench?library=pandas), on which, according to this [paper](https://arxiv.org/pdf/2407.08488), GPT-4o detects 87.9% of hallucinations. The first step is to create a dataset in the platform so we can keep track of the evaluation results.
@@ -59,7 +61,7 @@
try:

# Insert items into dataset
df = pd.read_parquet("hf://datasets/PatronusAI/HaluBench/data/test-00000-of-00001.parquet")
df = df.sample(n=500, random_state=42)
df = df.sample(n=50, random_state=42)

dataset_records = [
DatasetItem(
@@ -81,30 +83,20 @@
except Exception as e:

## Evaluating the hallucination metric

We can use the Opik SDK to compute a hallucination score for each item in the dataset:
To evaluate the performance of the Opik hallucination metric, we will define:

- Evaluation task: Our evaluation task will use the data in the Dataset to return a hallucination score computed using the Opik hallucination metric.
- Scoring metric: We will use the `Equals` metric to check whether the computed hallucination score matches the expected output.

Defining the evaluation task this way lets us measure how well Opik's hallucination metric detects hallucinations in the dataset.


```python
from opik.evaluation.metrics import Hallucination
from opik.evaluation.metrics import Hallucination, Equals
from opik.evaluation import evaluate
from opik.evaluation.metrics import base_metric, score_result
from opik import Opik, DatasetItem
import pandas as pd

client = Opik()

class CheckHallucinated(base_metric.BaseMetric):
def __init__(self, name: str):
self.name = name

def score(self, hallucination_score, expected_hallucination_score, **kwargs):
return score_result.ScoreResult(
value= None if hallucination_score is None else hallucination_score == expected_hallucination_score,
name=self.name,
reason=f"Got the hallucination score of {hallucination_score} and expected {expected_hallucination_score}",
scoring_failed=hallucination_score is None
)

# Define the evaluation task
def evaluation_task(x: DatasetItem):
metric = Hallucination()
try:
@@ -121,23 +113,28 @@
hallucination_reason = str(e)

return {
"hallucination_score": "FAIL" if hallucination_score == 1 else "PASS",
"output": "FAIL" if hallucination_score == 1 else "PASS",
"hallucination_reason": hallucination_reason,
"expected_hallucination_score": x.expected_output["expected_output"]
"reference": x.expected_output["expected_output"]
}

# Get the dataset
client = Opik()
dataset = client.get_dataset(name="HaluBench")

# Define the scoring metric
check_hallucinated_metric = Equals(name="Correct hallucination score")

res = evaluate(
experiment_name="Check Comet Metric",
experiment_name="Evaluate Opik hallucination metric",
dataset=dataset,
task=evaluation_task,
scoring_metrics=[CheckHallucinated(name="Detected hallucination")]
scoring_metrics=[check_hallucinated_metric]
)
```
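
For reference, here is a minimal sketch of what the `Equals` scoring metric does for a single item: it compares the task's `output` value against the `reference` value and returns 1.0 when they match. The `score(output=..., reference=...)` call below is an assumption based on the keys returned by `evaluation_task`; it is an illustration, not part of the evaluation above.

```python
from opik.evaluation.metrics import Equals

# Hypothetical single-item check: compare a computed hallucination label
# against the expected one, mirroring what `evaluate` does per dataset item.
metric = Equals(name="Correct hallucination score")
result = metric.score(output="FAIL", reference="FAIL")

print(result.value)  # 1.0 when the labels match, 0.0 otherwise
```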

We can see that the hallucination metric detects roughly 80% of the hallucinations in the dataset, and we can drill down into the specific items where hallucinations were not detected.

![Hallucination Evaluation](/img/cookbook/hallucination_metric_cookbook.png)
![Hallucination Evaluation](https://raw.githubusercontent.com/comet-ml/opik/main/apps/opik-documentation/documentation/static/img/cookbook/hallucination_metric_cookbook.png)


Expand Up @@ -24,7 +24,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -46,7 +46,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -65,16 +65,24 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install opik --upgrade --quiet"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -94,9 +102,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"status_code: 409, body: {'errors': ['Dataset already exists']}\n"
]
}
],
"source": [
"# Create dataset\n",
"from opik import Opik, DatasetItem\n",
@@ -145,36 +161,75 @@
"source": [
"## Evaluating the moderation metric\n",
"\n",
"In order to evaluate the performance of the Opik moderation metric, we will define:\n",
"\n",
"- Evaluation task: Our evaluation task will use the data in the Dataset to return a moderation score computed using the Opik moderation metric.\n",
"- Scoring metric: We will use the `Equals` metric to check if the moderation score computed matches the expected output.\n",
"\n",
"By defining the evaluation task in this way, we will be able to understand how well Opik's moderation metric is able to detect moderation violations in the dataset.\n",
"\n",
"We can use the Opik SDK to compute a moderation score for each item in the dataset:"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluation: 100%|██████████| 50/50 [00:06<00:00, 8.09it/s]\n"
]
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╭─ OpenAIModerationDataset (50 samples) ─╮\n",
"│ │\n",
"│ <span style=\"font-weight: bold\">Total time: </span> 00:00:06 │\n",
"│ <span style=\"font-weight: bold\">Number of samples:</span> 50 │\n",
"│ │\n",
"│ <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">Correct moderation score: 0.8400 (avg)</span> │\n",
"│ │\n",
"╰────────────────────────────────────────╯\n",
"</pre>\n"
],
"text/plain": [
"╭─ OpenAIModerationDataset (50 samples) ─╮\n",
"│ │\n",
"\u001b[1mTotal time: \u001b[0m 00:00:06 │\n",
"\u001b[1mNumber of samples:\u001b[0m 50 │\n",
"│ │\n",
"\u001b[1;32mCorrect moderation score: 0.8400 (avg)\u001b[0m │\n",
"│ │\n",
"╰────────────────────────────────────────╯\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Uploading results to Opik <span style=\"color: #808000; text-decoration-color: #808000\">...</span> \n",
"</pre>\n"
],
"text/plain": [
"Uploading results to Opik \u001b[33m...\u001b[0m \n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from opik.evaluation.metrics import Moderation\n",
"from opik.evaluation.metrics import Moderation, Equals\n",
"from opik.evaluation import evaluate\n",
"from opik.evaluation.metrics import base_metric, score_result\n",
"from opik import Opik, DatasetItem\n",
"\n",
"client = Opik()\n",
"\n",
"class CheckModerated(base_metric.BaseMetric):\n",
" def __init__(self, name: str):\n",
" self.name = name\n",
"\n",
" def score(self, moderation_score, moderation_reason, expected_moderation_score, **kwargs):\n",
" moderation_score = \"moderated\" if moderation_score > 0.5 else \"not_moderated\"\n",
"\n",
" return score_result.ScoreResult(\n",
" value= None if moderation_score is None else moderation_score == expected_moderation_score,\n",
" name=self.name,\n",
" reason=f\"Got the moderation score of {moderation_score} and expected {expected_moderation_score}\",\n",
" scoring_failed=moderation_score is None\n",
" )\n",
"\n",
"# Define the evaluation task\n",
"def evaluation_task(x: DatasetItem):\n",
" metric = Moderation()\n",
" try:\n",
@@ -188,19 +243,27 @@
" moderation_score = None\n",
" moderation_reason = str(e)\n",
" \n",
" moderation_score = \"moderated\" if metric_score.value > 0.5 else \"not_moderated\"\n",
"\n",
" return {\n",
" \"moderation_score\": moderation_score,\n",
" \"moderation_reason\": moderation_reason,\n",
" \"expected_moderation_score\": x.expected_output[\"expected_output\"]\n",
" \"output\": moderation_score,\n",
" \"moderation_score\": metric_score.value,\n",
" \"moderation_reason\": metric_score.reason,\n",
" \"reference\": x.expected_output[\"expected_output\"]\n",
" }\n",
"\n",
"# Get the dataset\n",
"client = Opik()\n",
"dataset = client.get_dataset(name=\"OpenAIModerationDataset\")\n",
"\n",
"# Define the scoring metric\n",
"moderation_metric = Equals(name=\"Correct moderation score\")\n",
"\n",
"res = evaluate(\n",
" experiment_name=\"Check Comet Metric\",\n",
" experiment_name=\"Evaluate Opik moderation metric\",\n",
" dataset=dataset,\n",
" task=evaluation_task,\n",
" scoring_metrics=[CheckModerated(name=\"Detected Moderation\")]\n",
" scoring_metrics=[moderation_metric]\n",
")"
]
},
