
Commit e5b1af0

Update to docs (#232)
jverre authored Sep 12, 2024
1 parent 31c057b commit e5b1af0
Showing 35 changed files with 290 additions and 202 deletions.
3 changes: 1 addition & 2 deletions CONTRIBUTING.md
@@ -136,8 +136,7 @@ minikube stop
```
The next time you start minikube, it will run everything with the same configuration and data you had before.


### Contributing to the documentation
### Contributing to the documentation

The documentation is made up of three main parts:

@@ -6,8 +6,6 @@
"source": [
"# Evaluating Opik's Moderation Metric\n",
"\n",
"*This cookbook was created from a Jypyter notebook which can be found [here](TBD).*\n",
"\n",
"For this guide we will be evaluating the Moderation metric included in the LLM Evaluation SDK which will showcase both how to use the `evaluation` functionality in the platform as well as the quality of the Moderation metric included in the SDK."
]
},
@@ -24,7 +22,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -46,7 +44,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -65,24 +63,16 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"outputs": [],
"source": [
"%pip install opik --upgrade --quiet"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -102,17 +92,9 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"status_code: 409, body: {'errors': ['Dataset already exists']}\n"
]
}
],
"outputs": [],
"source": [
"# Create dataset\n",
"from opik import Opik, DatasetItem\n",
@@ -173,57 +155,9 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluation: 100%|██████████| 50/50 [00:06<00:00, 8.09it/s]\n"
]
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╭─ OpenAIModerationDataset (50 samples) ─╮\n",
"│ │\n",
"│ <span style=\"font-weight: bold\">Total time: </span> 00:00:06 │\n",
"│ <span style=\"font-weight: bold\">Number of samples:</span> 50 │\n",
"│ │\n",
"│ <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">Correct moderation score: 0.8400 (avg)</span> │\n",
"│ │\n",
"╰────────────────────────────────────────╯\n",
"</pre>\n"
],
"text/plain": [
"╭─ OpenAIModerationDataset (50 samples) ─╮\n",
"│ │\n",
"\u001b[1mTotal time: \u001b[0m 00:00:06 │\n",
"\u001b[1mNumber of samples:\u001b[0m 50 │\n",
"│ │\n",
"\u001b[1;32mCorrect moderation score: 0.8400 (avg)\u001b[0m │\n",
"│ │\n",
"╰────────────────────────────────────────╯\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Uploading results to Opik <span style=\"color: #808000; text-decoration-color: #808000\">...</span> \n",
"</pre>\n"
],
"text/plain": [
"Uploading results to Opik \u001b[33m...\u001b[0m \n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"from opik.evaluation.metrics import Moderation, Equals\n",
"from opik.evaluation import evaluate\n",
@@ -1,7 +1,5 @@
# Evaluating Opik's Moderation Metric

*This cookbook was created from a Jypyter notebook which can be found [here](TBD).*

For this guide we will be evaluating the Moderation metric included in the LLM Evaluation SDK which will showcase both how to use the `evaluation` functionality in the platform as well as the quality of the Moderation metric included in the SDK.

## Creating an account on Comet.com
@@ -38,9 +36,6 @@ First, we will install the necessary libraries and configure the OpenAI API key
%pip install opik --upgrade --quiet
```

Note: you may need to restart the kernel to use updated packages.



```python
import os
@@ -95,9 +90,6 @@ except Exception as e:
print(e)
```

status_code: 409, body: {'errors': ['Dataset already exists']}


## Evaluating the moderation metric

In order to evaluate the performance of the Opik moderation metric, we will define:
@@ -153,28 +145,6 @@ res = evaluate(
)
```

Evaluation: 100%|██████████| 50/50 [00:06<00:00, 8.09it/s]



<pre style="white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace">╭─ OpenAIModerationDataset (50 samples) ─╮
│ │
│ <span style="font-weight: bold">Total time: </span> 00:00:06 │
│ <span style="font-weight: bold">Number of samples:</span> 50 │
│ │
│ <span style="color: #008000; text-decoration-color: #008000; font-weight: bold">Correct moderation score: 0.8400 (avg)</span> │
│ │
╰────────────────────────────────────────╯
</pre>




<pre style="white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace">Uploading results to Opik <span style="color: #808000; text-decoration-color: #808000">...</span>
</pre>



We are able to detect ~85% of moderation violations; this can be improved further by providing additional examples to the model. We can view a breakdown of the results in the Opik UI:

![Moderation Evaluation](https://raw.githubusercontent.com/comet-ml/opik/main/apps/opik-documentation/documentation/static/img/cookbook/moderation_metric_cookbook.png)
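
Since the hunks above elide most of this cookbook's evaluation code, a minimal sketch of the flow it follows (create a dataset, define an evaluation task, run `evaluate` with the `Moderation` metric) is included below. The helper names, item keys, and keyword arguments marked as assumed are not taken from this diff and may differ between Opik SDK versions.

```python
# Minimal sketch, assuming (not shown in this diff) that the Opik client
# exposes create_dataset() and that evaluate() accepts the keyword
# arguments experiment_name / dataset / task / scoring_metrics.
from opik import Opik
from opik.evaluation import evaluate
from opik.evaluation.metrics import Moderation, Equals

client = Opik()
dataset = client.create_dataset(name="OpenAIModerationDataset")  # assumed helper

moderation_metric = Moderation()


def evaluation_task(item: dict) -> dict:
    # The item keys here ("input", "expected_output") are assumptions about
    # how the dataset was populated earlier in the notebook.
    result = moderation_metric.score(output=item["input"])  # assumed signature
    return {
        "output": "moderated" if result.value > 0.5 else "not_moderated",
        "reference": item["expected_output"],
    }


res = evaluate(
    experiment_name="OpenAIModerationDataset",
    dataset=dataset,
    task=evaluation_task,
    scoring_metrics=[Equals()],
)
```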
56 changes: 39 additions & 17 deletions apps/opik-documentation/documentation/docs/cookbook/ragas.ipynb
@@ -149,28 +149,34 @@
"source": [
"import asyncio\n",
"from ragas.integrations.opik import OpikTracer\n",
"from ragas.dataset_schema import SingleTurnSample\n",
"\n",
"\n",
"# Define the scoring function\n",
"def compute_metric(opik_tracer, metric, row):\n",
"def compute_metric(metric, row):\n",
" row = SingleTurnSample(**row)\n",
"\n",
" opik_tracer = OpikTracer()\n",
"\n",
" async def get_score(opik_tracer, metric, row):\n",
" score = await metric.ascore(row, callbacks=[opik_tracer])\n",
" score = await metric.single_turn_ascore(row, callbacks=[OpikTracer()])\n",
" return score\n",
"\n",
" # Run the async function using the current event loop\n",
" loop = asyncio.get_event_loop()\n",
" \n",
"\n",
" result = loop.run_until_complete(get_score(opik_tracer, metric, row))\n",
" return result\n",
"\n",
"\n",
"# Score a simple example\n",
"row = {\n",
" \"question\": \"What is the capital of France?\",\n",
" \"answer\": \"Paris\",\n",
" \"contexts\": [\"Paris is the capital of France.\", \"Paris is in France.\"]\n",
" \"user_input\": \"What is the capital of France?\",\n",
" \"response\": \"Paris\",\n",
" \"retrieved_contexts\": [\"Paris is the capital of France.\", \"Paris is in France.\"],\n",
"}\n",
"\n",
"opik_tracer = OpikTracer()\n",
"score = compute_metric(opik_tracer, answer_relevancy_metric, row)\n",
"score = compute_metric(answer_relevancy_metric, row)\n",
"print(\"Answer Relevancy score:\", score)"
]
},
@@ -182,7 +188,7 @@
"\n",
"#### Score traces\n",
"\n",
"You can score traces by using the `get_current_trace` function to get the current trace and then calling the `log_feedback_score` function.\n",
"You can score traces by using the `update_current_trace` function.\n",
"\n",
"The advantage of this approach is that the scoring span is added to the trace allowing for a more fine-grained analysis of the RAG pipeline. It will however run the Ragas metric calculation synchronously and so might not be suitable for production use-cases."
]
@@ -193,38 +199,43 @@
"metadata": {},
"outputs": [],
"source": [
"from opik import track\n",
"from opik.opik_context import get_current_trace\n",
"from opik import track, opik_context\n",
"\n",
"\n",
"@track\n",
"def retrieve_contexts(question):\n",
" # Define the retrieval function, in this case we will hard code the contexts\n",
" return [\"Paris is the capital of France.\", \"Paris is in France.\"]\n",
"\n",
"\n",
"@track\n",
"def answer_question(question, contexts):\n",
" # Define the answer function, in this case we will hard code the answer\n",
" return \"Paris\"\n",
"\n",
"\n",
"@track(name=\"Compute Ragas metric score\", capture_input=False)\n",
"def compute_rag_score(answer_relevancy_metric, question, answer, contexts):\n",
" # Define the score function\n",
" row = {\"question\": question, \"answer\": answer, \"contexts\": contexts}\n",
" row = {\"user_input\": question, \"response\": answer, \"retrieved_contexts\": contexts}\n",
" score = compute_metric(answer_relevancy_metric, row)\n",
" return score\n",
"\n",
"\n",
"@track\n",
"def rag_pipeline(question):\n",
" # Define the pipeline\n",
" contexts = retrieve_contexts(question)\n",
" answer = answer_question(question, contexts)\n",
"\n",
" trace = get_current_trace()\n",
" score = compute_rag_score(answer_relevancy_metric, question, answer, contexts)\n",
" trace.log_feedback_score(\"answer_relevancy\", round(score, 4), category_name=\"ragas\")\n",
" \n",
" opik_context.update_current_trace(\n",
" feedback_scores=[{\"name\": \"answer_relevancy\", \"value\": round(score, 4)}]\n",
" )\n",
"\n",
" return answer\n",
"\n",
"\n",
"rag_pipeline(\"What is the capital of France?\")"
]
},
@@ -252,12 +263,23 @@
"\n",
"fiqa_eval = load_dataset(\"explodinggradients/fiqa\", \"ragas_eval\")\n",
"\n",
"# Reformat the dataset to match the schema expected by the Ragas evaluate function\n",
"dataset = fiqa_eval[\"baseline\"].select(range(3))\n",
"\n",
"dataset = dataset.map(\n",
" lambda x: {\n",
" \"user_input\": x[\"question\"],\n",
" \"reference\": x[\"ground_truths\"][0],\n",
" \"retrieved_contexts\": x[\"contexts\"],\n",
" }\n",
")\n",
"\n",
"opik_tracer_eval = OpikTracer(tags=[\"ragas_eval\"], metadata={\"evaluation_run\": True})\n",
"\n",
"result = evaluate(\n",
" fiqa_eval[\"baseline\"].select(range(3)),\n",
" dataset,\n",
" metrics=[context_precision, faithfulness, answer_relevancy],\n",
" callbacks=[opik_tracer_eval]\n",
" callbacks=[opik_tracer_eval],\n",
")\n",
"\n",
"print(result)"