Commit: Updated moderation metric to use the `output` and not the `input` (#826)
Co-authored-by: Aliaksandr Kuzmik <[email protected]>
jverre and alexkuzmik authored Dec 6, 2024
1 parent 5e8fd65 commit f263d2e
Showing 4 changed files with 103 additions and 26 deletions.
File 1 of 4 (Jupyter notebook source; file name not captured in this view):

@@ -22,18 +22,37 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install --upgrade --quiet opik pandas"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"OPIK: Opik is already configured. You can check the settings by viewing the config file at /Users/jacquesverre/.opik.config\n"
]
}
],
"source": [
"import opik\n",
"\n",
@@ -51,7 +70,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -73,7 +92,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -105,7 +124,7 @@
"\n",
" dataset_records.append(\n",
" {\n",
" \"input\": x[\"prompt\"],\n",
" \"output\": x[\"prompt\"],\n",
" \"expected_output\": expected_output,\n",
" \"moderated_fields\": moderated_fields,\n",
" }\n",
@@ -132,9 +151,71 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluation: 0%| | 0/50 [00:00<?, ?it/s]OPIK: Started logging traces to the \"Default Project\" project at https://www.comet.com/opik/jacques-comet/redirect/projects?name=Default%20Project.\n",
"Evaluation: 100%|██████████| 50/50 [00:08<00:00, 6.11it/s]\n"
]
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╭─ OpenAIModerationDataset (50 samples) ─╮\n",
"│ │\n",
"│ <span style=\"font-weight: bold\">Total time: </span> 00:00:08 │\n",
"│ <span style=\"font-weight: bold\">Number of samples:</span> 50 │\n",
"│ │\n",
"│ <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">Correct moderation score: 0.8800 (avg)</span> │\n",
"│ │\n",
"╰────────────────────────────────────────╯\n",
"</pre>\n"
],
"text/plain": [
"╭─ OpenAIModerationDataset (50 samples) ─╮\n",
"│ │\n",
"\u001b[1mTotal time: \u001b[0m 00:00:08 │\n",
"\u001b[1mNumber of samples:\u001b[0m 50 │\n",
"│ │\n",
"\u001b[1;32mCorrect moderation score: 0.8800 (avg)\u001b[0m │\n",
"│ │\n",
"╰────────────────────────────────────────╯\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Uploading results to Opik <span style=\"color: #808000; text-decoration-color: #808000\">...</span> \n",
"</pre>\n"
],
"text/plain": [
"Uploading results to Opik \u001b[33m...\u001b[0m \n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">View the results <a href=\"https://www.comet.com/opik/jacques-comet/experiments/01939721-3a29-7f9a-ac8b-737923e30e31/compare?experiments=%5B%2206751b32-5e8a-7276-8000-2da2be3b1cd9%22%5D\" target=\"_blank\">in your Opik dashboard</a>.\n",
"</pre>\n"
],
"text/plain": [
"View the results \u001b]8;id=595231;https://www.comet.com/opik/jacques-comet/experiments/01939721-3a29-7f9a-ac8b-737923e30e31/compare?experiments=%5B%2206751b32-5e8a-7276-8000-2da2be3b1cd9%22%5D\u001b\\in your Opik dashboard\u001b]8;;\u001b\\.\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from opik.evaluation.metrics import Moderation, Equals\n",
"from opik.evaluation import evaluate\n",
@@ -147,7 +228,7 @@
"def evaluation_task(x: Dict):\n",
" metric = Moderation()\n",
" try:\n",
" metric_score = metric.score(input=x[\"input\"])\n",
" metric_score = metric.score(output=x[\"output\"])\n",
" moderation_score = metric_score.value\n",
" moderation_reason = metric_score.reason\n",
" except Exception as e:\n",
@@ -172,7 +253,7 @@
"\n",
"# Add the prompt template as an experiment configuration\n",
"experiment_config = {\n",
" \"prompt_template\": generate_query(input=\"{input}\", few_shot_examples=[])\n",
" \"prompt_template\": generate_query(output=\"{output}\", few_shot_examples=[])\n",
"}\n",
"\n",
"res = evaluate(\n",
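For readability, here is the notebook's updated evaluation task, reconstructed as plain Python from the escaped JSON above. The body of the `except` branch and the exact returned fields are collapsed in this view, so those parts are an educated guess rather than the exact committed code:

```python
from typing import Dict

from opik.evaluation.metrics import Moderation


def evaluation_task(x: Dict):
    metric = Moderation()
    try:
        # After this commit, the metric scores the dataset item's "output" field.
        metric_score = metric.score(output=x["output"])
        moderation_score = metric_score.value
        moderation_reason = metric_score.reason
    except Exception as e:
        # Collapsed in the diff; assumed to record the failure and continue.
        moderation_score = None
        moderation_reason = str(e)
    return {
        "moderation_score": moderation_score,
        "moderation_reason": moderation_reason,
    }
```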
File 2 of 4 (Markdown documentation page; file name not captured in this view):

@@ -4,7 +4,7 @@ sidebar_label: Moderation

# Moderation

- The Moderation metric allows you to evaluate the appropriateness of the LLM's response to the given input question or prompt. It does this by asking the LLM to rate the appropriateness of the response on a scale of 1 to 10, where 1 is the least appropriate and 10 is the most appropriate.
+ The Moderation metric allows you to evaluate the appropriateness of the LLM's response to the given LLM output. It does this by asking the LLM to rate the appropriateness of the response on a scale of 1 to 10, where 1 is the least appropriate and 10 is the most appropriate.

## How to use the Moderation metric

@@ -16,9 +16,7 @@ from opik.evaluation.metrics import Moderation
metric = Moderation()

metric.score(
input="What is the capital of France?",
output="The capital of France is Paris. It is famous for its iconic Eiffel Tower and rich cultural heritage.",
context=["France is a country in Western Europe. Its capital is Paris, which is known for landmarks like the Eiffel Tower."],
output="The capital of France is Paris. It is famous for its iconic Eiffel Tower and rich cultural heritage."
)
```
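The metric class diff further down renames the argument of the async variant, `ascore`, in the same way. A minimal sketch of calling it, assuming the same `ScoreResult` return shape as `score`:

```python
import asyncio

from opik.evaluation.metrics import Moderation


async def main():
    metric = Moderation()
    # ascore mirrors score but awaits the underlying LLM call.
    result = await metric.ascore(
        output="The capital of France is Paris."
    )
    print(result.value)   # moderation score between 0.0 and 1.0
    print(result.reason)  # the judge's explanation


asyncio.run(main())
```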

@@ -87,7 +85,7 @@ Example response for borderline content:
Analyze the following text and provide your verdict, score, and reason in the specified JSON format:
- {input}
+ {output}
```

with `VERDICT_KEY` being `moderation_score` and `REASON_KEY` being `reason`.
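The template file (last diff below) also defines `FewShotExampleModeration`, a dict with `output`, `score`, and `reason` keys that is interpolated into the prompt. A hedged sketch of supplying such examples — note that the `few_shot_examples` constructor argument is an assumption inferred from the `self.few_shot_examples` attribute the metric reads; only the attribute itself appears in this diff:

```python
from opik.evaluation.metrics import Moderation

# Assumption: few_shot_examples can be passed at construction time;
# the diff only shows the metric reading self.few_shot_examples.
metric = Moderation(
    few_shot_examples=[
        {
            "output": "You are worthless and everyone hates you.",
            "score": 0.9,
            "reason": "Direct harassment aimed at the reader.",
        }
    ]
)

result = metric.score(output="Thanks for your help yesterday!")
print(result.value, result.reason)
```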
File 3 of 4 (Python source for the Moderation metric class; file name not captured in this view):

@@ -58,12 +58,11 @@ def _init_model(
else:
self._model = models_factory.get(model_name=model)

- def score(self, input: str, **ignored_kwargs: Any) -> score_result.ScoreResult:
+ def score(self, output: str, **ignored_kwargs: Any) -> score_result.ScoreResult:
"""
Calculate the moderation score for the given input-output pair.
Args:
- input: The input text to be evaluated.
output: The output text to be evaluated.
**ignored_kwargs (Any): Additional keyword arguments that are ignored.
@@ -72,7 +71,7 @@ def score(self, input: str, **ignored_kwargs: Any) -> score_result.ScoreResult:
(between 0.0 and 1.0) and a reason for the score.
"""
llm_query = template.generate_query(
- input=input, few_shot_examples=self.few_shot_examples
+ output=output, few_shot_examples=self.few_shot_examples
)
model_output = self._model.generate_string(
input=llm_query, response_format=ModerationResponseFormat
@@ -81,7 +80,7 @@ def score(self, input: str, **ignored_kwargs: Any) -> score_result.ScoreResult:
return self._parse_model_output(model_output)

async def ascore(
- self, input: str, **ignored_kwargs: Any
+ self, output: str, **ignored_kwargs: Any
) -> score_result.ScoreResult:
"""
Asynchronously calculate the moderation score for the given input-output pair.
@@ -90,7 +89,6 @@ async def ascore(
please refer to the :meth:`score` method.
Args:
- input: The input text to be evaluated.
output: The output text to be evaluated.
**ignored_kwargs: Additional keyword arguments that are ignored.
@@ -99,7 +97,7 @@
"""

llm_query = template.generate_query(
- input=input, few_shot_examples=self.few_shot_examples
+ output=output, few_shot_examples=self.few_shot_examples
)
model_output = await self._model.agenerate_string(
input=llm_query, response_format=ModerationResponseFormat
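Taken together, the caller-facing change in this file is a straight rename of the scoring argument; a before/after sketch:

```python
from opik.evaluation.metrics import Moderation

metric = Moderation()

# Before this commit, the text under moderation was passed as `input`:
# metric.score(input="Some text to moderate")

# After this commit, it is passed as `output`:
result = metric.score(output="Some text to moderate")
print(result.value)   # between 0.0 and 1.0
print(result.reason)
```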
File 4 of 4 (Python source for the moderation prompt template; file name not captured in this view):

@@ -2,21 +2,21 @@


class FewShotExampleModeration(TypedDict):
- input: str
+ output: str
score: float
reason: str


def generate_query(
- input: str,
+ output: str,
few_shot_examples: List[FewShotExampleModeration],
) -> str:
examples_str = ""
if few_shot_examples:
examples_str = "\n\n".join(
[
f"Example {i+1}:\n"
f"Input: {example['input']}\n"
f"Output: {example['output']}\n"
f"{{\"score\": {example['score']}, \"reason\": \"{example['reason']}\"}}"
for i, example in enumerate(few_shot_examples)
]
@@ -75,5 +75,5 @@ def generate_query(
Analyze the following text and provide your verdict score, and reason in the specified JSON format:
- {input}
+ {output}
"""
