Skip to content

Commit

Permalink
Update to docs
Browse files Browse the repository at this point in the history
  • Loading branch information
jverre committed Sep 12, 2024
1 parent 2042d0b commit 98ad705
Show file tree
Hide file tree
Showing 35 changed files with 290 additions and 202 deletions.
3 changes: 1 addition & 2 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,7 @@ minikube stop
```
Next time you will start the minikube, it will run everything with the same configuration and data you had before.


### Contributing to the documentation
### Contributing to the documentation

The documentation is made up of three main parts:

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
"source": [
"# Evaluating Opik's Moderation Metric\n",
"\n",
"*This cookbook was created from a Jypyter notebook which can be found [here](TBD).*\n",
"\n",
"For this guide we will be evaluating the Moderation metric included in the LLM Evaluation SDK which will showcase both how to use the `evaluation` functionality in the platform as well as the quality of the Moderation metric included in the SDK."
]
},
Expand All @@ -24,7 +22,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -46,7 +44,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -65,24 +63,16 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"outputs": [],
"source": [
"%pip install opik --upgrade --quiet"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -102,17 +92,9 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"status_code: 409, body: {'errors': ['Dataset already exists']}\n"
]
}
],
"outputs": [],
"source": [
"# Create dataset\n",
"from opik import Opik, DatasetItem\n",
Expand Down Expand Up @@ -173,57 +155,9 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluation: 100%|██████████| 50/50 [00:06<00:00, 8.09it/s]\n"
]
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╭─ OpenAIModerationDataset (50 samples) ─╮\n",
"│ │\n",
"│ <span style=\"font-weight: bold\">Total time: </span> 00:00:06 │\n",
"│ <span style=\"font-weight: bold\">Number of samples:</span> 50 │\n",
"│ │\n",
"│ <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">Correct moderation score: 0.8400 (avg)</span> │\n",
"│ │\n",
"╰────────────────────────────────────────╯\n",
"</pre>\n"
],
"text/plain": [
"╭─ OpenAIModerationDataset (50 samples) ─╮\n",
"│ │\n",
"\u001b[1mTotal time: \u001b[0m 00:00:06 │\n",
"\u001b[1mNumber of samples:\u001b[0m 50 │\n",
"│ │\n",
"\u001b[1;32mCorrect moderation score: 0.8400 (avg)\u001b[0m │\n",
"│ │\n",
"╰────────────────────────────────────────╯\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Uploading results to Opik <span style=\"color: #808000; text-decoration-color: #808000\">...</span> \n",
"</pre>\n"
],
"text/plain": [
"Uploading results to Opik \u001b[33m...\u001b[0m \n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"from opik.evaluation.metrics import Moderation, Equals\n",
"from opik.evaluation import evaluate\n",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
# Evaluating Opik's Moderation Metric

*This cookbook was created from a Jypyter notebook which can be found [here](TBD).*

For this guide we will be evaluating the Moderation metric included in the LLM Evaluation SDK which will showcase both how to use the `evaluation` functionality in the platform as well as the quality of the Moderation metric included in the SDK.

## Creating an account on Comet.com
Expand Down Expand Up @@ -38,9 +36,6 @@ First, we will install the necessary libraries and configure the OpenAI API key
%pip install opik --upgrade --quiet
```

Note: you may need to restart the kernel to use updated packages.



```python
import os
Expand Down Expand Up @@ -95,9 +90,6 @@ except Exception as e:
print(e)
```

status_code: 409, body: {'errors': ['Dataset already exists']}


## Evaluating the moderation metric

In order to evaluate the performance of the Opik moderation metric, we will define:
Expand Down Expand Up @@ -153,28 +145,6 @@ res = evaluate(
)
```

Evaluation: 100%|██████████| 50/50 [00:06<00:00, 8.09it/s]



<pre style="white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace">╭─ OpenAIModerationDataset (50 samples) ─╮
│ │
│ <span style="font-weight: bold">Total time: </span> 00:00:06 │
│ <span style="font-weight: bold">Number of samples:</span> 50 │
│ │
│ <span style="color: #008000; text-decoration-color: #008000; font-weight: bold">Correct moderation score: 0.8400 (avg)</span> │
│ │
╰────────────────────────────────────────╯
</pre>




<pre style="white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace">Uploading results to Opik <span style="color: #808000; text-decoration-color: #808000">...</span>
</pre>



We are able to detect ~85% of moderation violations, this can be improved further by providing some additional examples to the model. We can view a breakdown of the results in the Opik UI:

![Moderation Evaluation](https://raw.githubusercontent.com/comet-ml/opik/main/apps/opik-documentation/documentation/static/img/cookbook/moderation_metric_cookbook.png)
Expand Down
56 changes: 39 additions & 17 deletions apps/opik-documentation/documentation/docs/cookbook/ragas.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -149,28 +149,34 @@
"source": [
"import asyncio\n",
"from ragas.integrations.opik import OpikTracer\n",
"from ragas.dataset_schema import SingleTurnSample\n",
"\n",
"\n",
"# Define the scoring function\n",
"def compute_metric(opik_tracer, metric, row):\n",
"def compute_metric(metric, row):\n",
" row = SingleTurnSample(**row)\n",
"\n",
" opik_tracer = OpikTracer()\n",
"\n",
" async def get_score(opik_tracer, metric, row):\n",
" score = await metric.ascore(row, callbacks=[opik_tracer])\n",
" score = await metric.single_turn_ascore(row, callbacks=[OpikTracer()])\n",
" return score\n",
"\n",
" # Run the async function using the current event loop\n",
" loop = asyncio.get_event_loop()\n",
" \n",
"\n",
" result = loop.run_until_complete(get_score(opik_tracer, metric, row))\n",
" return result\n",
"\n",
"\n",
"# Score a simple example\n",
"row = {\n",
" \"question\": \"What is the capital of France?\",\n",
" \"answer\": \"Paris\",\n",
" \"contexts\": [\"Paris is the capital of France.\", \"Paris is in France.\"]\n",
" \"user_input\": \"What is the capital of France?\",\n",
" \"response\": \"Paris\",\n",
" \"retrieved_contexts\": [\"Paris is the capital of France.\", \"Paris is in France.\"],\n",
"}\n",
"\n",
"opik_tracer = OpikTracer()\n",
"score = compute_metric(opik_tracer, answer_relevancy_metric, row)\n",
"score = compute_metric(answer_relevancy_metric, row)\n",
"print(\"Answer Relevancy score:\", score)"
]
},
Expand All @@ -182,7 +188,7 @@
"\n",
"#### Score traces\n",
"\n",
"You can score traces by using the `get_current_trace` function to get the current trace and then calling the `log_feedback_score` function.\n",
"You can score traces by using the `update_current_trace` function.\n",
"\n",
"The advantage of this approach is that the scoring span is added to the trace allowing for a more fine-grained analysis of the RAG pipeline. It will however run the Ragas metric calculation synchronously and so might not be suitable for production use-cases."
]
Expand All @@ -193,38 +199,43 @@
"metadata": {},
"outputs": [],
"source": [
"from opik import track\n",
"from opik.opik_context import get_current_trace\n",
"from opik import track, opik_context\n",
"\n",
"\n",
"@track\n",
"def retrieve_contexts(question):\n",
" # Define the retrieval function, in this case we will hard code the contexts\n",
" return [\"Paris is the capital of France.\", \"Paris is in France.\"]\n",
"\n",
"\n",
"@track\n",
"def answer_question(question, contexts):\n",
" # Define the answer function, in this case we will hard code the answer\n",
" return \"Paris\"\n",
"\n",
"\n",
"@track(name=\"Compute Ragas metric score\", capture_input=False)\n",
"def compute_rag_score(answer_relevancy_metric, question, answer, contexts):\n",
" # Define the score function\n",
" row = {\"question\": question, \"answer\": answer, \"contexts\": contexts}\n",
" row = {\"user_input\": question, \"response\": answer, \"retrieved_contexts\": contexts}\n",
" score = compute_metric(answer_relevancy_metric, row)\n",
" return score\n",
"\n",
"\n",
"@track\n",
"def rag_pipeline(question):\n",
" # Define the pipeline\n",
" contexts = retrieve_contexts(question)\n",
" answer = answer_question(question, contexts)\n",
"\n",
" trace = get_current_trace()\n",
" score = compute_rag_score(answer_relevancy_metric, question, answer, contexts)\n",
" trace.log_feedback_score(\"answer_relevancy\", round(score, 4), category_name=\"ragas\")\n",
" \n",
" opik_context.update_current_trace(\n",
" feedback_scores=[{\"name\": \"answer_relevancy\", \"value\": round(score, 4)}]\n",
" )\n",
"\n",
" return answer\n",
"\n",
"\n",
"rag_pipeline(\"What is the capital of France?\")"
]
},
Expand Down Expand Up @@ -252,12 +263,23 @@
"\n",
"fiqa_eval = load_dataset(\"explodinggradients/fiqa\", \"ragas_eval\")\n",
"\n",
"# Reformat the dataset to match the schema expected by the Ragas evaluate function\n",
"dataset = fiqa_eval[\"baseline\"].select(range(3))\n",
"\n",
"dataset = dataset.map(\n",
" lambda x: {\n",
" \"user_input\": x[\"question\"],\n",
" \"reference\": x[\"ground_truths\"][0],\n",
" \"retrieved_contexts\": x[\"contexts\"],\n",
" }\n",
")\n",
"\n",
"opik_tracer_eval = OpikTracer(tags=[\"ragas_eval\"], metadata={\"evaluation_run\": True})\n",
"\n",
"result = evaluate(\n",
" fiqa_eval[\"baseline\"].select(range(3)),\n",
" dataset,\n",
" metrics=[context_precision, faithfulness, answer_relevancy],\n",
" callbacks=[opik_tracer_eval]\n",
" callbacks=[opik_tracer_eval],\n",
")\n",
"\n",
"print(result)"
Expand Down
Loading

0 comments on commit 98ad705

Please sign in to comment.