
Commit

Update to docs (#230)
jverre authored Sep 12, 2024
1 parent b93b8b8 commit 2b90c0c
Showing 19 changed files with 457 additions and 196 deletions.
@@ -134,7 +134,12 @@
"source": [
"## Evaluating the hallucination metric\n",
"\n",
"We can use the Opik SDK to compute a hallucination score for each item in the dataset:"
"In order to evaluate the performance of the Opik hallucination metric, we will define:\n",
"\n",
"- Evaluation task: Our evaluation task will use the data in the Dataset to return a hallucination score computed using the Opik hallucination metric.\n",
"- Scoring metric: We will use the `Equals` metric to check if the hallucination score computed matches the expected output.\n",
"\n",
"By defining the evaluation task in this way, we will be able to understand how well Opik's hallucination metric is able to detect hallucinations in the dataset."
]
},
{
@@ -143,26 +148,11 @@
"metadata": {},
"outputs": [],
"source": [
"from opik.evaluation.metrics import Hallucination\n",
"from opik.evaluation.metrics import Hallucination, Equals\n",
"from opik.evaluation import evaluate\n",
"from opik.evaluation.metrics import base_metric, score_result\n",
"from opik import Opik, DatasetItem\n",
"import pandas as pd\n",
"\n",
"client = Opik()\n",
"\n",
"class CheckHallucinated(base_metric.BaseMetric):\n",
" def __init__(self, name: str):\n",
" self.name = name\n",
"\n",
" def score(self, hallucination_score, expected_hallucination_score, **kwargs):\n",
" return score_result.ScoreResult(\n",
" value= None if hallucination_score is None else hallucination_score == expected_hallucination_score,\n",
" name=self.name,\n",
" reason=f\"Got the hallucination score of {hallucination_score} and expected {expected_hallucination_score}\",\n",
" scoring_failed=hallucination_score is None\n",
" )\n",
"\n",
"# Define the evaluation task\n",
"def evaluation_task(x: DatasetItem):\n",
" metric = Hallucination()\n",
" try:\n",
@@ -179,18 +169,23 @@
" hallucination_reason = str(e)\n",
" \n",
" return {\n",
" \"hallucination_score\": \"FAIL\" if hallucination_score == 1 else \"PASS\",\n",
" \"output\": \"FAIL\" if hallucination_score == 1 else \"PASS\",\n",
" \"hallucination_reason\": hallucination_reason,\n",
" \"expected_hallucination_score\": x.expected_output[\"expected_output\"]\n",
" \"reference\": x.expected_output[\"expected_output\"]\n",
" }\n",
"\n",
"# Get the dataset\n",
"client = Opik()\n",
"dataset = client.get_dataset(name=\"HaluBench\")\n",
"\n",
"# Define the scoring metric\n",
"check_hallucinated_metric = Equals(name=\"Correct hallucination score\")\n",
"\n",
"res = evaluate(\n",
" experiment_name=\"Check Comet Metric\",\n",
" experiment_name=\"Evaluate Opik hallucination metric\",\n",
" dataset=dataset,\n",
" task=evaluation_task,\n",
" scoring_metrics=[CheckHallucinated(name=\"Detected hallucination\")]\n",
" scoring_metrics=[check_hallucinated_metric]\n",
")"
]
},
@@ -13,8 +13,10 @@ For this guide we will be evaluating the Hallucination metric included in the LL
import os
import getpass

os.environ["OPIK_API_KEY"] = getpass.getpass("Opik API Key: ")
os.environ["OPIK_WORKSPACE"] = input("Comet workspace (often the same as your username): ")
if "OPIK_API_KEY" not in os.environ:
os.environ["OPIK_API_KEY"] = getpass.getpass("Opik API Key: ")
if "OPIK_WORKSPACE" not in os.environ:
os.environ["OPIK_WORKSPACE"] = input("Comet workspace (often the same as your username): ")
```

If you are running the Opik platform locally, simply set:
@@ -31,16 +33,16 @@ First, we will install the necessary libraries, configure the OpenAI API key and


```python
%pip install pyarrow fsspec huggingface_hub --quiet
%pip install opik pyarrow fsspec huggingface_hub --upgrade --quiet
```


```python
# Configure OpenAI
import os
import getpass

os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")
if "OPENAI_API_KEY" not in os.environ:
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")
```

We will be using the [HaluBench dataset](https://huggingface.co/datasets/PatronusAI/HaluBench?library=pandas), on which, according to this [paper](https://arxiv.org/pdf/2407.08488), GPT-4o detects 87.9% of hallucinations. The first step is to create a dataset in the platform so we can keep track of the evaluation results.
@@ -59,7 +61,7 @@
try:

# Insert items into dataset
df = pd.read_parquet("hf://datasets/PatronusAI/HaluBench/data/test-00000-of-00001.parquet")
df = df.sample(n=500, random_state=42)
df = df.sample(n=50, random_state=42)

dataset_records = [
DatasetItem(
@@ -81,30 +83,20 @@
except Exception as e:

## Evaluating the hallucination metric

We can use the Opik SDK to compute a hallucination score for each item in the dataset:
To evaluate the performance of the Opik hallucination metric, we will define:

- Evaluation task: Our evaluation task will use the data in the Dataset to return a hallucination score computed using the Opik hallucination metric.
- Scoring metric: We will use the `Equals` metric to check whether the computed hallucination score matches the expected output.

Defining the evaluation task this way lets us measure how well Opik's hallucination metric detects hallucinations in the dataset.


```python
from opik.evaluation.metrics import Hallucination
from opik.evaluation.metrics import Hallucination, Equals
from opik.evaluation import evaluate
from opik.evaluation.metrics import base_metric, score_result
from opik import Opik, DatasetItem
import pandas as pd

client = Opik()

class CheckHallucinated(base_metric.BaseMetric):
def __init__(self, name: str):
self.name = name

def score(self, hallucination_score, expected_hallucination_score, **kwargs):
return score_result.ScoreResult(
value= None if hallucination_score is None else hallucination_score == expected_hallucination_score,
name=self.name,
reason=f"Got the hallucination score of {hallucination_score} and expected {expected_hallucination_score}",
scoring_failed=hallucination_score is None
)

# Define the evaluation task
def evaluation_task(x: DatasetItem):
metric = Hallucination()
try:
@@ -121,23 +113,28 @@
hallucination_reason = str(e)

return {
"hallucination_score": "FAIL" if hallucination_score == 1 else "PASS",
"output": "FAIL" if hallucination_score == 1 else "PASS",
"hallucination_reason": hallucination_reason,
"expected_hallucination_score": x.expected_output["expected_output"]
"reference": x.expected_output["expected_output"]
}

# Get the dataset
client = Opik()
dataset = client.get_dataset(name="HaluBench")

# Define the scoring metric
check_hallucinated_metric = Equals(name="Correct hallucination score")

res = evaluate(
experiment_name="Check Comet Metric",
experiment_name="Evaluate Opik hallucination metric",
dataset=dataset,
task=evaluation_task,
scoring_metrics=[CheckHallucinated(name="Detected hallucination")]
scoring_metrics=[check_hallucinated_metric]
)
```
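
For reference, here is a minimal sketch of what the `Equals` scoring metric does for a single item: it compares the task's `output` value against the `reference` value and returns 1.0 when they match. The `score(output=..., reference=...)` call below is an assumption based on the keys returned by `evaluation_task`; it is an illustration, not part of the evaluation above.

```python
from opik.evaluation.metrics import Equals

# Hypothetical single-item check: compare a computed hallucination label
# against the expected one, mirroring what `evaluate` does per dataset item.
metric = Equals(name="Correct hallucination score")
result = metric.score(output="FAIL", reference="FAIL")

print(result.value)  # 1.0 when the labels match, 0.0 otherwise
```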

We can see that the hallucination metric detects roughly 80% of the hallucinations in the dataset, and we can drill down into the specific items where hallucinations were not detected.

![Hallucination Evaluation](/img/cookbook/hallucination_metric_cookbook.png)
![Hallucination Evaluation](https://raw.githubusercontent.com/comet-ml/opik/main/apps/opik-documentation/documentation/static/img/cookbook/hallucination_metric_cookbook.png)


Expand Up @@ -24,7 +24,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -46,7 +46,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -65,16 +65,24 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install opik --upgrade --quiet"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -94,9 +102,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"status_code: 409, body: {'errors': ['Dataset already exists']}\n"
]
}
],
"source": [
"# Create dataset\n",
"from opik import Opik, DatasetItem\n",
@@ -145,36 +161,75 @@
"source": [
"## Evaluating the moderation metric\n",
"\n",
"In order to evaluate the performance of the Opik moderation metric, we will define:\n",
"\n",
"- Evaluation task: Our evaluation task will use the data in the Dataset to return a moderation score computed using the Opik moderation metric.\n",
"- Scoring metric: We will use the `Equals` metric to check if the moderation score computed matches the expected output.\n",
"\n",
"By defining the evaluation task in this way, we will be able to understand how well Opik's moderation metric is able to detect moderation violations in the dataset.\n",
"\n",
"We can use the Opik SDK to compute a moderation score for each item in the dataset:"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluation: 100%|██████████| 50/50 [00:06<00:00, 8.09it/s]\n"
]
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╭─ OpenAIModerationDataset (50 samples) ─╮\n",
"│ │\n",
"│ <span style=\"font-weight: bold\">Total time: </span> 00:00:06 │\n",
"│ <span style=\"font-weight: bold\">Number of samples:</span> 50 │\n",
"│ │\n",
"│ <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">Correct moderation score: 0.8400 (avg)</span> │\n",
"│ │\n",
"╰────────────────────────────────────────╯\n",
"</pre>\n"
],
"text/plain": [
"╭─ OpenAIModerationDataset (50 samples) ─╮\n",
"│ │\n",
"\u001b[1mTotal time: \u001b[0m 00:00:06 │\n",
"\u001b[1mNumber of samples:\u001b[0m 50 │\n",
"│ │\n",
"\u001b[1;32mCorrect moderation score: 0.8400 (avg)\u001b[0m │\n",
"│ │\n",
"╰────────────────────────────────────────╯\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Uploading results to Opik <span style=\"color: #808000; text-decoration-color: #808000\">...</span> \n",
"</pre>\n"
],
"text/plain": [
"Uploading results to Opik \u001b[33m...\u001b[0m \n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from opik.evaluation.metrics import Moderation\n",
"from opik.evaluation.metrics import Moderation, Equals\n",
"from opik.evaluation import evaluate\n",
"from opik.evaluation.metrics import base_metric, score_result\n",
"from opik import Opik, DatasetItem\n",
"\n",
"client = Opik()\n",
"\n",
"class CheckModerated(base_metric.BaseMetric):\n",
" def __init__(self, name: str):\n",
" self.name = name\n",
"\n",
" def score(self, moderation_score, moderation_reason, expected_moderation_score, **kwargs):\n",
" moderation_score = \"moderated\" if moderation_score > 0.5 else \"not_moderated\"\n",
"\n",
" return score_result.ScoreResult(\n",
" value= None if moderation_score is None else moderation_score == expected_moderation_score,\n",
" name=self.name,\n",
" reason=f\"Got the moderation score of {moderation_score} and expected {expected_moderation_score}\",\n",
" scoring_failed=moderation_score is None\n",
" )\n",
"\n",
"# Define the evaluation task\n",
"def evaluation_task(x: DatasetItem):\n",
" metric = Moderation()\n",
" try:\n",
@@ -188,19 +243,27 @@
" moderation_score = None\n",
" moderation_reason = str(e)\n",
" \n",
" moderation_score = \"moderated\" if metric_score.value > 0.5 else \"not_moderated\"\n",
"\n",
" return {\n",
" \"moderation_score\": moderation_score,\n",
" \"moderation_reason\": moderation_reason,\n",
" \"expected_moderation_score\": x.expected_output[\"expected_output\"]\n",
" \"output\": moderation_score,\n",
" \"moderation_score\": metric_score.value,\n",
" \"moderation_reason\": metric_score.reason,\n",
" \"reference\": x.expected_output[\"expected_output\"]\n",
" }\n",
"\n",
"# Get the dataset\n",
"client = Opik()\n",
"dataset = client.get_dataset(name=\"OpenAIModerationDataset\")\n",
"\n",
"# Define the scoring metric\n",
"moderation_metric = Equals(name=\"Correct moderation score\")\n",
"\n",
"res = evaluate(\n",
" experiment_name=\"Check Comet Metric\",\n",
" experiment_name=\"Evaluate Opik moderation metric\",\n",
" dataset=dataset,\n",
" task=evaluation_task,\n",
" scoring_metrics=[CheckModerated(name=\"Detected Moderation\")]\n",
" scoring_metrics=[moderation_metric]\n",
")"
]
},
