diff --git a/apps/opik-documentation/documentation/docs/cookbook/evaluate_moderation_metric.ipynb b/apps/opik-documentation/documentation/docs/cookbook/evaluate_moderation_metric.ipynb
index 62c4fe1bac..1e2c8fd885 100644
--- a/apps/opik-documentation/documentation/docs/cookbook/evaluate_moderation_metric.ipynb
+++ b/apps/opik-documentation/documentation/docs/cookbook/evaluate_moderation_metric.ipynb
@@ -22,18 +22,37 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
    "source": [
     "%pip install --upgrade --quiet opik pandas"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "OPIK: Opik is already configured. You can check the settings by viewing the config file at /Users/jacquesverre/.opik.config\n"
+     ]
+    }
+   ],
    "source": [
     "import opik\n",
     "\n",
@@ -51,7 +70,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -73,7 +92,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -105,7 +124,7 @@
     "\n",
     "    dataset_records.append(\n",
     "        {\n",
-     "            \"input\": x[\"prompt\"],\n",
+     "            \"output\": x[\"prompt\"],\n",
     "            \"expected_output\": expected_output,\n",
     "            \"moderated_fields\": moderated_fields,\n",
     "        }\n",
@@ -132,9 +151,71 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Evaluation: 0%| | 0/50 [00:00╭─ OpenAIModerationDataset (50 samples) ─╮\n",
+      "│ │\n",
+      "│ Total time: 00:00:08 │\n",
+      "│ Number of samples: 50 │\n",
+      "│ │\n",
+      "│ Correct moderation score: 0.8800 (avg) │\n",
+      "│ │\n",
+      "╰────────────────────────────────────────╯\n",
+      "\n"
+     ],
+     "text/plain": [
+      "╭─ OpenAIModerationDataset (50 samples) ─╮\n",
+      "│ │\n",
+      "│ \u001b[1mTotal time: \u001b[0m 00:00:08 │\n",
+      "│ \u001b[1mNumber of samples:\u001b[0m 50 │\n",
+      "│ │\n",
+      "│ \u001b[1;32mCorrect moderation score: 0.8800 (avg)\u001b[0m │\n",
+      "│ │\n",
+      "╰────────────────────────────────────────╯\n"
+     ]
+    },
+    "metadata": {},
+    "output_type": "display_data"
+   },
+   {
+    "data": {
+     "text/html": [
+      "Uploading results to Opik ... \n",
+      "\n"
+     ],
+     "text/plain": [
+      "Uploading results to Opik \u001b[33m...\u001b[0m \n"
+     ]
+    },
+    "metadata": {},
+    "output_type": "display_data"
+   },
+   {
+    "data": {
+     "text/html": [
+      "View the results in your Opik dashboard.\n",
+      "\n"
+     ],
+     "text/plain": [
+      "View the results \u001b]8;id=595231;https://www.comet.com/opik/jacques-comet/experiments/01939721-3a29-7f9a-ac8b-737923e30e31/compare?experiments=%5B%2206751b32-5e8a-7276-8000-2da2be3b1cd9%22%5D\u001b\\in your Opik dashboard\u001b]8;;\u001b\\.\n"
+     ]
+    },
+    "metadata": {},
+    "output_type": "display_data"
+   }
+   ],
    "source": [
     "from opik.evaluation.metrics import Moderation, Equals\n",
     "from opik.evaluation import evaluate\n",
@@ -147,7 +228,7 @@
     "def evaluation_task(x: Dict):\n",
     "    metric = Moderation()\n",
     "    try:\n",
-     "        metric_score = metric.score(input=x[\"input\"])\n",
+     "        metric_score = metric.score(output=x[\"output\"])\n",
     "        moderation_score = metric_score.value\n",
     "        moderation_reason = metric_score.reason\n",
     "    except Exception as e:\n",
@@ -172,7 +253,7 @@
     "\n",
     "# Add the prompt template as an experiment configuration\n",
     "experiment_config = {\n",
-     "    \"prompt_template\": generate_query(input=\"{input}\", few_shot_examples=[])\n",
+     "    \"prompt_template\": generate_query(output=\"{output}\", few_shot_examples=[])\n",
     "}\n",
     "\n",
     "res = evaluate(\n",
diff --git a/apps/opik-documentation/documentation/docs/evaluation/metrics/moderation.md b/apps/opik-documentation/documentation/docs/evaluation/metrics/moderation.md
index 8ea6ec9eb9..0ea20a31de 100644
--- a/apps/opik-documentation/documentation/docs/evaluation/metrics/moderation.md
+++ b/apps/opik-documentation/documentation/docs/evaluation/metrics/moderation.md
@@ -4,7 +4,7 @@ sidebar_label: Moderation
 
 # Moderation
 
-The Moderation metric allows you to evaluate the appropriateness of the LLM's response to the given input question or prompt. It does this by asking the LLM to rate the appropriateness of the response on a scale of 1 to 10, where 1 is the least appropriate and 10 is the most appropriate.
+The Moderation metric allows you to evaluate the appropriateness of the LLM's output. It does this by asking the LLM to rate the appropriateness of the output on a scale of 1 to 10, where 1 is the least appropriate and 10 is the most appropriate.
 
 ## How to use the Moderation metric
 
@@ -16,9 +16,7 @@ from opik.evaluation.metrics import Moderation
 metric = Moderation()
 
 metric.score(
-    input="What is the capital of France?",
-    output="The capital of France is Paris. It is famous for its iconic Eiffel Tower and rich cultural heritage.",
-    context=["France is a country in Western Europe. Its capital is Paris, which is known for landmarks like the Eiffel Tower."],
+    output="The capital of France is Paris. It is famous for its iconic Eiffel Tower and rich cultural heritage."
 )
 ```
 
@@ -87,7 +85,7 @@ Example response for borderline content:
 
 Analyze the following text and provide your verdict, score, and reason in the specified JSON format:
 
-{input}
+{output}
 ```
 
 with `VERDICT_KEY` being `moderation_score` and `REASON_KEY` being `reason`.
diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py
index 543118b95f..64678d04e2 100644
--- a/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py
+++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py
@@ -58,12 +58,11 @@ def _init_model(
         else:
             self._model = models_factory.get(model_name=model)
 
-    def score(self, input: str, **ignored_kwargs: Any) -> score_result.ScoreResult:
+    def score(self, output: str, **ignored_kwargs: Any) -> score_result.ScoreResult:
         """
         Calculate the moderation score for the given input-output pair.
 
         Args:
-            input: The input text to be evaluated.
             output: The output text to be evaluated.
             **ignored_kwargs (Any): Additional keyword arguments that are ignored.
 
@@ -72,7 +71,7 @@ def score(self, input: str, **ignored_kwargs: Any) -> score_result.ScoreResult:
             (between 0.0 and 1.0) and a reason for the score.
         """
         llm_query = template.generate_query(
-            input=input, few_shot_examples=self.few_shot_examples
+            output=output, few_shot_examples=self.few_shot_examples
         )
         model_output = self._model.generate_string(
             input=llm_query, response_format=ModerationResponseFormat
@@ -81,7 +80,7 @@ def score(self, input: str, **ignored_kwargs: Any) -> score_result.ScoreResult:
         return self._parse_model_output(model_output)
 
     async def ascore(
-        self, input: str, **ignored_kwargs: Any
+        self, output: str, **ignored_kwargs: Any
     ) -> score_result.ScoreResult:
         """
         Asynchronously calculate the moderation score for the given input-output pair.
@@ -90,7 +89,6 @@ async def ascore(
         please refer to the :meth:`score` method.
 
         Args:
-            input: The input text to be evaluated.
             output: The output text to be evaluated.
             **ignored_kwargs: Additional keyword arguments that are ignored.
 
@@ -99,7 +97,7 @@
 
         """
         llm_query = template.generate_query(
-            input=input, few_shot_examples=self.few_shot_examples
+            output=output, few_shot_examples=self.few_shot_examples
         )
         model_output = await self._model.agenerate_string(
             input=llm_query, response_format=ModerationResponseFormat
diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/template.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/template.py
index e7c9c242e7..158bae98ee 100644
--- a/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/template.py
+++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/template.py
@@ -2,13 +2,13 @@
 
 
 class FewShotExampleModeration(TypedDict):
-    input: str
+    output: str
     score: float
     reason: str
 
 
 def generate_query(
-    input: str,
+    output: str,
     few_shot_examples: List[FewShotExampleModeration],
 ) -> str:
     examples_str = ""
@@ -16,7 +16,7 @@ def generate_query(
         examples_str = "\n\n".join(
             [
                 f"Example {i+1}:\n"
-                f"Input: {example['input']}\n"
+                f"Output: {example['output']}\n"
                 f"{{\"score\": {example['score']}, \"reason\": \"{example['reason']}\"}}"
                 for i, example in enumerate(few_shot_examples)
             ]
@@ -75,5 +75,5 @@
 
 Analyze the following text and provide your verdict score, and reason in the specified JSON format:
 
-{input}
+{output}
 """
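Taken together, this patch renames the field that the Moderation metric scores from `input` to `output` across the cookbook, the documentation page, and the Python SDK. Below is a minimal usage sketch of the renamed API, assuming the post-merge SDK shown in the diff; the example strings are illustrative only, and the `template` import path is the internal module touched above rather than a documented public entry point.

```python
# Minimal sketch of the renamed API (post-merge Opik SDK assumed).
# The example texts below are illustrative, not part of this patch.
from opik.evaluation.metrics import Moderation
from opik.evaluation.metrics.llm_judges.moderation import template

# Moderation.score() now takes `output` instead of `input`.
metric = Moderation()
result = metric.score(
    output="The capital of France is Paris. It is famous for its iconic Eiffel Tower."
)
print(result.value, result.reason)  # moderation score in [0.0, 1.0] and the judge's reason

# The judge prompt can also be rendered directly; few-shot examples now use
# the `output` key defined by FewShotExampleModeration.
prompt = template.generate_query(
    output="Some text to moderate",
    few_shot_examples=[
        {"output": "Have a nice day!", "score": 0.0, "reason": "Benign, non-harmful content."}
    ],
)
print(prompt)
```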