Jacques/experiment config docs (#258)
* WIP

* Update to docs

* Update evaluate_hallucination_metric.ipynb

* Updated documentation
jverre authored Sep 17, 2024
1 parent 6557b8a commit ad2fe2e
Showing 20 changed files with 221 additions and 82 deletions.
@@ -97,10 +97,11 @@
"outputs": [],
"source": [
"# Create dataset\n",
"from opik import Opik, DatasetItem\n",
"import opik\n",
"from opik import DatasetItem\n",
"import pandas as pd\n",
"\n",
"client = Opik()\n",
"client = opik.Opik()\n",
"\n",
"try:\n",
" # Create dataset\n",
@@ -124,8 +125,8 @@
" \n",
" dataset.insert(dataset_records)\n",
"\n",
"except Exception as e:\n",
" print(e)"
"except opik.rest_api.core.ApiError as e:\n",
" print(\"Dataset already exists\")"
]
},
{
@@ -151,6 +152,7 @@
"from opik.evaluation.metrics import Hallucination, Equals\n",
"from opik.evaluation import evaluate\n",
"from opik import Opik, DatasetItem\n",
"from opik.evaluation.metrics.llm_judges.hallucination.template import generate_query\n",
"\n",
"# Define the evaluation task\n",
"def evaluation_task(x: DatasetItem):\n",
@@ -181,11 +183,17 @@
"# Define the scoring metric\n",
"check_hallucinated_metric = Equals(name=\"Correct hallucination score\")\n",
"\n",
"# Add the prompt template as an experiment configuration\n",
"experiment_config = {\n",
" \"prompt_template\": generate_query(input=\"{input}\",context=\"{context}\",output=\"{output}\",few_shot_examples=[])\n",
"}\n",
"\n",
"res = evaluate(\n",
" experiment_name=\"Evaluate Opik hallucination metric\",\n",
" dataset=dataset,\n",
" task=evaluation_task,\n",
" scoring_metrics=[check_hallucinated_metric]\n",
" scoring_metrics=[check_hallucinated_metric],\n",
" experiment_config=experiment_config\n",
")"
]
},
@@ -197,11 +205,6 @@
"\n",
"![Hallucination Evaluation](https://raw.githubusercontent.com/comet-ml/opik/main/apps/opik-documentation/documentation/static/img/cookbook/hallucination_metric_cookbook.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
}
],
"metadata": {
@@ -50,10 +50,11 @@ We will be using the [HaluBench dataset](https://huggingface.co/datasets/Patronu

```python
# Create dataset
from opik import Opik, DatasetItem
import opik
from opik import DatasetItem
import pandas as pd

client = Opik()
client = opik.Opik()

try:
# Create dataset
@@ -77,8 +78,8 @@

dataset.insert(dataset_records)

except Exception as e:
print(e)
except opik.rest_api.core.ApiError as e:
print("Dataset already exists")
```

## Evaluating the hallucination metric
@@ -95,6 +96,7 @@ By defining the evaluation task in this way, we will be able to understand how w
from opik.evaluation.metrics import Hallucination, Equals
from opik.evaluation import evaluate
from opik import Opik, DatasetItem
from opik.evaluation.metrics.llm_judges.hallucination.template import generate_query

# Define the evaluation task
def evaluation_task(x: DatasetItem):
@@ -125,16 +127,20 @@ dataset = client.get_dataset(name="HaluBench")
# Define the scoring metric
check_hallucinated_metric = Equals(name="Correct hallucination score")

# Add the prompt template as an experiment configuration
experiment_config = {
"prompt_template": generate_query(input="{input}",context="{context}",output="{output}",few_shot_examples=[])
}

res = evaluate(
experiment_name="Evaluate Opik hallucination metric",
dataset=dataset,
task=evaluation_task,
scoring_metrics=[check_hallucinated_metric]
scoring_metrics=[check_hallucinated_metric],
experiment_config=experiment_config
)
```

We can see that the hallucination metric is able to detect ~80% of the hallucinations contained in the dataset and we can see the specific items where hallucinations were not detected.

![Hallucination Evaluation](https://raw.githubusercontent.com/comet-ml/opik/main/apps/opik-documentation/documentation/static/img/cookbook/hallucination_metric_cookbook.png)


@@ -97,12 +97,13 @@
"outputs": [],
"source": [
"# Create dataset\n",
"from opik import Opik, DatasetItem\n",
"import opik\n",
"from opik import DatasetItem\n",
"import pandas as pd\n",
"import requests\n",
"from io import BytesIO\n",
"\n",
"client = Opik()\n",
"client = opik.Opik()\n",
"try:\n",
" # Create dataset\n",
" dataset = client.create_dataset(name=\"OpenAIModerationDataset\", description=\"OpenAI Moderation Dataset\")\n",
@@ -133,8 +134,8 @@
" \n",
" dataset.insert(dataset_records)\n",
"\n",
"except Exception as e:\n",
" print(e)"
"except opik.rest_api.core.ApiError as e:\n",
" print(\"Dataset already exists\")"
]
},
{
@@ -162,7 +163,7 @@
"from opik.evaluation.metrics import Moderation, Equals\n",
"from opik.evaluation import evaluate\n",
"from opik import Opik, DatasetItem\n",
"\n",
"from opik.evaluation.metrics.llm_judges.moderation.template import generate_query\n",
"# Define the evaluation task\n",
"def evaluation_task(x: DatasetItem):\n",
" metric = Moderation()\n",
@@ -193,11 +194,17 @@
"# Define the scoring metric\n",
"moderation_metric = Equals(name=\"Correct moderation score\")\n",
"\n",
"# Add the prompt template as an experiment configuration\n",
"experiment_config = {\n",
" \"prompt_template\": generate_query(input=\"{input}\",context=\"{context}\",output=\"{output}\",few_shot_examples=[])\n",
"}\n",
"\n",
"res = evaluate(\n",
" experiment_name=\"Evaluate Opik moderation metric\",\n",
" dataset=dataset,\n",
" task=evaluation_task,\n",
" scoring_metrics=[moderation_metric]\n",
" scoring_metrics=[moderation_metric],\n",
" experiment_config=experiment_config\n",
")"
]
},
@@ -50,12 +50,13 @@ We will be using the [OpenAI Moderation API Release dataset](https://github.com/

```python
# Create dataset
from opik import Opik, DatasetItem
import opik
from opik import DatasetItem
import pandas as pd
import requests
from io import BytesIO

client = Opik()
client = opik.Opik()
try:
# Create dataset
dataset = client.create_dataset(name="OpenAIModerationDataset", description="OpenAI Moderation Dataset")
@@ -86,8 +87,8 @@

dataset.insert(dataset_records)

except Exception as e:
print(e)
except opik.rest_api.core.ApiError as e:
print("Dataset already exists")
```

## Evaluating the moderation metric
@@ -106,7 +107,7 @@ We can use the Opik SDK to compute a moderation score for each item in the datas
from opik.evaluation.metrics import Moderation, Equals
from opik.evaluation import evaluate
from opik import Opik, DatasetItem

from opik.evaluation.metrics.llm_judges.moderation.template import generate_query
# Define the evaluation task
def evaluation_task(x: DatasetItem):
metric = Moderation()
@@ -137,11 +138,17 @@ dataset = client.get_dataset(name="OpenAIModerationDataset")
# Define the scoring metric
moderation_metric = Equals(name="Correct moderation score")

# Add the prompt template as an experiment configuration
experiment_config = {
"prompt_template": generate_query(input="{input}",context="{context}",output="{output}",few_shot_examples=[])
}

res = evaluate(
experiment_name="Evaluate Opik moderation metric",
dataset=dataset,
task=evaluation_task,
scoring_metrics=[moderation_metric]
scoring_metrics=[moderation_metric],
experiment_config=experiment_config
)
```

@@ -177,19 +177,19 @@
"outputs": [],
"source": [
"# Create the synthetic dataset\n",
"from opik import Opik\n",
"import opik\n",
"from opik import DatasetItem\n",
"\n",
"synthetic_questions = json.loads(completion.choices[0].message.content)[\"result\"]\n",
"\n",
"client = Opik()\n",
"client = opik.Opik()\n",
"try:\n",
" dataset = client.create_dataset(name=\"synthetic_questions\")\n",
" dataset.insert([\n",
" DatasetItem(input={\"question\": question}) for question in synthetic_questions\n",
" ])\n",
"except Exception as e:\n",
" pass"
"except opik.rest_api.core.ApiError as e:\n",
" print(\"Dataset already exists\")"
]
},
{
@@ -117,19 +117,19 @@ Now that we have our synthetic dataset, we can create a dataset in Comet and ins

```python
# Create the synthetic dataset
from opik import Opik
import opik
from opik import DatasetItem

synthetic_questions = json.loads(completion.choices[0].message.content)["result"]

client = Opik()
client = opik.Opik()
try:
dataset = client.create_dataset(name="synthetic_questions")
dataset.insert([
DatasetItem(input={"question": question}) for question in synthetic_questions
])
except Exception as e:
pass
except opik.rest_api.core.ApiError as e:
print("Dataset already exists")
```

## Creating a LangChain chain
@@ -126,7 +126,7 @@
"source": [
"The prompt and response messages are automatically logged to Opik and can be viewed in the UI.\n",
"\n",
"![OpenAI Integration](/img/cookbook/openai_trace_cookbook.png)"
"![OpenAI Integration](https://raw.githubusercontent.com/comet-ml/opik/main/apps/opik-documentation/documentation/static/img/cookbook/openai_trace_cookbook.png)"
]
},
{
@@ -76,7 +76,7 @@ print(completion.choices[0].message.content)

The prompt and response messages are automatically logged to Opik and can be viewed in the UI.

![OpenAI Integration](/img/cookbook/openai_trace_cookbook.png)
![OpenAI Integration](https://raw.githubusercontent.com/comet-ml/opik/main/apps/opik-documentation/documentation/static/img/cookbook/openai_trace_cookbook.png)

## Using it with the `track` decorator

90 changes: 90 additions & 0 deletions apps/opik-documentation/documentation/docs/evaluation/concepts.md
@@ -0,0 +1,90 @@
---
sidebar_position: 1
sidebar_label: Concepts
---

# Evaluation Concepts

:::tip
If you want to jump straight to running evaluations, you can head to the [Evaluate your LLM application](/docs/evaluation/evaluate_your_llm.md) section.
:::

When working with LLM applications, the bottleneck to iterating quickly is often the evaluation process. While it is possible to review your LLM application's output manually, this is slow and does not scale. Opik instead allows you to automate the evaluation of your LLM application.

To understand how to run evaluations in Opik, it is important to first become familiar with two concepts:

1. **Dataset**: A dataset is a collection of samples that your LLM application will be evaluated on. Datasets only store the input and expected output for each sample; the output from your LLM application is computed and scored during the evaluation process.
2. **Experiment**: An experiment is a single evaluation of your LLM application. During an experiment, we process each dataset item, compute the output using your LLM application and then score that output.

![Evaluation Concepts](/img/evaluation/evaluation_concepts.png)

In this section, we will walk through all the concepts associated with Opik's evaluation framework.

## Datasets

The first step in automating the evaluation of your LLM application is to create a dataset, which is a collection of samples that your LLM application will be evaluated on. Each dataset is made up of Dataset Items, which store the input, expected output and other metadata for a single sample.
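
As a minimal sketch (the dataset name, sample values and the `expected_output` field shown here are illustrative placeholders), creating a dataset and inserting items with the Python SDK might look like this:

```python
import opik
from opik import DatasetItem

client = opik.Opik()

try:
    # Create the dataset; this raises an ApiError if it already exists
    dataset = client.create_dataset(name="my_eval_dataset", description="Samples for evaluation")

    # Each DatasetItem stores the input and expected output for one sample
    dataset.insert([
        DatasetItem(
            input={"question": "What is the capital of France?"},
            expected_output={"answer": "Paris"},  # illustrative field
        )
    ])
except opik.rest_api.core.ApiError:
    print("Dataset already exists")
```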

Given the importance of datasets in the evaluation process, teams often spend a significant amount of time curating and preparing their datasets. There are three main ways to create a dataset:

1. **Manually curating examples**: As a first step, you can manually curate a set of examples based on your knowledge of the application you are building. You can also leverage subject matter experts to help in the creation of the dataset.

2. **Using synthetic data**: If you don't have enough data to create a diverse set of examples, you can turn to synthetic data generation tools to help you create a dataset. The [LangChain cookbook](/docs/cookbook/langchain.md) has a great example of how to use synthetic data generation tools to create a dataset.

3. **Leveraging production data**: If your application is in production, you can leverage the data that is being generated to augment your dataset. While this is often not the first step in creating a dataset, it can be a great way to enrich your dataset with real-world data.

If you are using Opik for production monitoring, you can easily add traces to your dataset by selecting them in the UI and choosing `Add to dataset` in the `Actions` dropdown.


:::tip
You can learn more about how to manage your datasets in Opik in the [Manage Datasets](/docs/evaluation/manage_datasets.md) section.
:::

## Experiments

Experiments are the core building block of the Opik evaluation framework. Each time you run a new evaluation, a new experiment is created. Each experiment is made up of two main components:

1. **Experiment Configuration**: The configuration object associated with each experiment allows you to track metadata; for example, it is often used to store the prompt template used for a given experiment.
2. **Experiment Items**: Experiment items store the input, expected output, actual output and feedback scores for each dataset sample that was processed during an experiment.

In addition, for each experiment you will be able to see the average scores for each metric.

### Experiment Configuration

One of the main advantages of having an automated evaluation framework is the ability to iterate quickly. The main drawback is that it can become difficult to track what has changed between two different iterations of an experiment.

The experiment configuration object allows you to store metadata associated with a given experiment. This is useful for tracking things like the prompt template, the model used, the temperature, and so on.
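
For example, a configuration is simply a dictionary of free-form metadata (the keys and values below are illustrative) that you pass to the `evaluate` function via the `experiment_config` argument:

```python
# Free-form metadata describing this experiment run; keys and values are illustrative
experiment_config = {
    "prompt_template": "Answer the following question: {question}",
    "model": "gpt-4o",
    "temperature": 0.2,
}
```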

You can then compare the configuration of two different experiments from the Opik UI to see what has changed.

![Experiment Configuration](/img/evaluation/compare_experiment_config.png)

### Experiment Items

Experiment items store the input, expected output, actual output and feedback scores for each dataset sample that was processed during an experiment. In addition, a trace is associated with each item to allow you to easily understand why a given item scored the way it did.

![Experiment Items](/img/evaluation/experiment_items.png)

## Running an evaluation

When you run an evaluation, you will need the following:

1. Dataset: The dataset you want to run the evaluation on.
2. Evaluation task: This maps the inputs stored in the dataset to the output you would like to score. The evaluation task is typically the LLM application you are building (see the sketch after this list).
3. Metrics: The metrics you would like to use when scoring the outputs of your LLM application.
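
As a rough sketch (the `my_llm_application` helper is hypothetical and the returned keys depend on the metrics you use), an evaluation task receives a dataset item and returns the fields to score:

```python
from opik import DatasetItem

def evaluation_task(x: DatasetItem):
    # Call your LLM application on the dataset item's input
    # (`my_llm_application` is a hypothetical placeholder)
    answer = my_llm_application(x.input["question"])

    # Return the fields that the scoring metrics expect
    return {
        "input": x.input["question"],
        "output": answer,
    }
```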

You can then run the evaluation using the `evaluate` function:

```python
from opik.evaluation import evaluate

evaluate(
    dataset=dataset,
    task=evaluation_task,
    scoring_metrics=metrics,
experiment_config={"prompt_template": "..."},
)
```

:::tip
You can find a full tutorial on defining evaluations in the [Evaluate your LLM application](/docs/evaluation/evaluate_your_llm.md) section.
:::