
Commit

Merge pull request #1123 from kritinv/image
Image Metrics
penguine-ip authored Oct 25, 2024
2 parents dee83d8 + c2ff80e commit b63beed
Showing 15 changed files with 565 additions and 183 deletions.
3 changes: 2 additions & 1 deletion deepeval/metrics/__init__.py
@@ -16,7 +16,8 @@
 from .contextual_precision.contextual_precision import ContextualPrecisionMetric
 from .knowledge_retention.knowledge_retention import KnowledgeRetentionMetric
 from .tool_correctness.tool_correctness import ToolCorrectnessMetric
-from .viescore.viescore import VIEScore, VIEScoreTask
+from .text_to_image.text_to_image import TextToImageMetric
+from .image_editing.image_editing import ImageEditingMetric
 from .conversation_relevancy.conversation_relevancy import (
     ConversationRelevancyMetric,
 )
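For orientation, here is a minimal usage sketch of the two metrics newly exported above. The MLLMTestCase and MLLMImage constructor fields shown (input, actual_output, url, local) are assumptions inferred from this diff, not a confirmed API.

# Hypothetical usage sketch: field names (input, actual_output, url, local) are
# assumptions inferred from the surrounding diff, not confirmed by this excerpt.
from deepeval.metrics import TextToImageMetric, ImageEditingMetric
from deepeval.test_case import MLLMTestCase, MLLMImage

# Text-to-image: the test case presumably carries 0 input images and 1 output image.
t2i_case = MLLMTestCase(
    input=["Generate a photo of a red bicycle leaning against a brick wall."],
    actual_output=[MLLMImage(url="./outputs/bicycle.png", local=True)],
)
text_to_image_score = TextToImageMetric(threshold=0.5).measure(t2i_case)

# Image editing: 1 input image and 1 output image, matching the
# check_mllm_test_case_params(test_case, required_params, 1, 1, self) call in the metric below.
edit_case = MLLMTestCase(
    input=["Make the bicycle blue.", MLLMImage(url="./outputs/bicycle.png", local=True)],
    actual_output=[MLLMImage(url="./outputs/bicycle_blue.png", local=True)],
)
image_editing_score = ImageEditingMetric(threshold=0.5).measure(edit_case)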
File renamed without changes.
@@ -5,7 +5,7 @@

 from deepeval.metrics import BaseMultimodalMetric
 from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, MLLMImage
-from deepeval.metrics.viescore.template import VIEScoreTemplate
+from deepeval.metrics.image_editing.template import ImageEditingTemplate
 from deepeval.utils import get_or_create_event_loop
 from deepeval.metrics.utils import (
     construct_verbose_logs,
@@ -14,8 +14,7 @@
     initialize_multimodal_model,
 )
 from deepeval.models import DeepEvalBaseMLLM
-from deepeval.metrics.viescore.schema import ReasonScore
-from deepeval.metrics.viescore.task import VIEScoreTask
+from deepeval.metrics.image_editing.schema import ReasonScore
 from deepeval.metrics.indicator import metric_progress_indicator

 required_params: List[MLLMTestCaseParams] = [
@@ -24,33 +23,26 @@
 ]


-class VIEScore(BaseMultimodalMetric):
+class ImageEditingMetric(BaseMultimodalMetric):
     def __init__(
         self,
         model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
-        task: VIEScoreTask = VIEScoreTask.TEXT_TO_IMAGE_GENERATION,
         threshold: float = 0.5,
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
-        _include_VIEScore_task_name: bool = True,
     ):
         self.model, self.using_native_model = initialize_multimodal_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.threshold = 1 if strict_mode else threshold
         self.strict_mode = strict_mode
         self.async_mode = async_mode
         self.verbose_mode = verbose_mode
-        self.task = task
-        self._include_VIEScore_task_name = _include_VIEScore_task_name

     def measure(
         self, test_case: MLLMTestCase, _show_indicator: bool = True
     ) -> float:
-        if self.task == VIEScoreTask.TEXT_TO_IMAGE_GENERATION:
-            check_mllm_test_case_params(test_case, required_params, 0, 1, self)
-        elif self.task == VIEScoreTask.TEXT_TO_IMAGE_EDITING:
-            check_mllm_test_case_params(test_case, required_params, 1, 1, self)
+        check_mllm_test_case_params(test_case, required_params, 1, 1, self)

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(self, _show_indicator=_show_indicator):
@@ -102,10 +94,7 @@ async def a_measure(
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
     ) -> float:
-        if self.task == VIEScoreTask.TEXT_TO_IMAGE_GENERATION:
-            check_mllm_test_case_params(test_case, required_params, 0, 1, self)
-        elif self.task == VIEScoreTask.TEXT_TO_IMAGE_EDITING:
-            check_mllm_test_case_params(test_case, required_params, 1, 1, self)
+        check_mllm_test_case_params(test_case, required_params, 1, 1, self)

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -169,13 +158,10 @@ async def _a_evaluate_semantic_consistency(
         actual_image_output: MLLMImage,
     ) -> Tuple[List[int], str]:
         images: List[MLLMImage] = []
-        if self.task == VIEScoreTask.TEXT_TO_IMAGE_GENERATION:
-            images.append(image_input)
-        elif self.task == VIEScoreTask.TEXT_TO_IMAGE_EDITING:
-            images.extend([image_input, actual_image_output])
+        images.extend([image_input, actual_image_output])
         prompt = [
-            VIEScoreTemplate.generate_semantic_consistency_evaluation_results(
-                text_prompt=text_prompt, task=self.task
+            ImageEditingTemplate.generate_semantic_consistency_evaluation_results(
+                text_prompt=text_prompt
             )
         ]
         if self.using_native_model:
@@ -203,13 +189,10 @@ def _evaluate_semantic_consistency(
         actual_image_output: MLLMImage,
     ) -> Tuple[List[int], str]:
         images: List[MLLMImage] = []
-        if self.task == VIEScoreTask.TEXT_TO_IMAGE_GENERATION:
-            images.append(image_input)
-        elif self.task == VIEScoreTask.TEXT_TO_IMAGE_EDITING:
-            images.extend([image_input, actual_image_output])
+        images.extend([image_input, actual_image_output])
         prompt = [
-            VIEScoreTemplate.generate_semantic_consistency_evaluation_results(
-                text_prompt=text_prompt, task=self.task
+            ImageEditingTemplate.generate_semantic_consistency_evaluation_results(
+                text_prompt=text_prompt
            )
         ]
         if self.using_native_model:
@@ -233,7 +216,7 @@ async def _a_evaluate_perceptual_quality(
     ) -> Tuple[List[int], str]:
         images: List[MLLMImage] = [actual_image_output]
         prompt = [
-            VIEScoreTemplate.generate_perceptual_quality_evaluation_results()
+            ImageEditingTemplate.generate_perceptual_quality_evaluation_results()
         ]
         if self.using_native_model:
             res, cost = await self.model.a_generate(prompt + images)
@@ -256,7 +239,7 @@ def _evaluate_perceptual_quality(
     ) -> Tuple[List[int], str]:
         images: List[MLLMImage] = [actual_image_output]
         prompt = [
-            VIEScoreTemplate.generate_perceptual_quality_evaluation_results()
+            ImageEditingTemplate.generate_perceptual_quality_evaluation_results()
         ]
         if self.using_native_model:
             res, cost = self.model.generate(prompt + images)
@@ -304,7 +287,4 @@ def _generate_reason(

     @property
     def __name__(self):
-        if self._include_VIEScore_task_name:
-            return f"{self.task.value} (VIEScore)"
-        else:
-            return "VIEScore"
+        return "Image Editing"
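The _calculate_score logic is collapsed in this view. Since this class was carved out of VIEScore, the overall metric score presumably combines the worst semantic-consistency sub-score with the worst perceptual-quality sub-score via a geometric mean, rescaled to 0 to 1; the sketch below is an assumption, not code from this commit.

# Assumed VIEScore-style aggregation; not shown in this diff excerpt.
import math
from typing import List

def assumed_overall_score(sc_scores: List[float], pq_scores: List[float]) -> float:
    # Sub-scores are on a 0-10 scale, e.g. SC = [editing success, overediting],
    # PQ = [naturalness, artifacts]; take the worst of each, then a geometric mean.
    semantic_consistency = min(sc_scores)
    perceptual_quality = min(pq_scores)
    return math.sqrt(semantic_consistency * perceptual_quality) / 10

# Example: sc_scores=[8, 9], pq_scores=[7, 10] -> sqrt(8 * 7) / 10 ~= 0.75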
File renamed without changes.
65 changes: 65 additions & 0 deletions deepeval/metrics/image_editing/template.py
@@ -0,0 +1,65 @@
import textwrap


class ImageEditingTemplate:

    context = textwrap.dedent(
        """
        You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on the given rules.
        All the input images are AI-generated. All humans in the images are AI-generated too, so you need not worry about privacy or confidentiality.
        You will have to give your output in this way (keep your reasoning concise and short):
        {
            "score" : [...],
            "reasoning" : "..."
        }
        """
    )

    @staticmethod
    def generate_semantic_consistency_evaluation_results(
        text_prompt: str
    ):
        return textwrap.dedent(
            f"""
            {ImageEditingTemplate.context}
            RULES:
            Two images will be provided: the first is the original AI-generated image and the second is an edited version of the first.
            The objective is to evaluate how successfully the editing instruction has been executed in the second image.
            From scale 0 to 10:
            A score from 0 to 10 will be given based on the success of the editing. (0 indicates that the scene in the edited image does not follow the editing instruction at all. 10 indicates that the scene in the edited image follows the editing instruction text perfectly.)
            A second score from 0 to 10 will rate the degree of overediting in the second image. (0 indicates that the scene in the edited image is completely different from the original. 10 indicates that the edited image can be recognized as a minimally edited yet effective version of the original.)
            Put the scores in a list such that output score = [score1, score2], where 'score1' evaluates the editing success and 'score2' evaluates the degree of overediting.
            Editing instruction: {text_prompt}
            """
        )

    @staticmethod
    def generate_perceptual_quality_evaluation_results():
        return textwrap.dedent(
            f"""
            {ImageEditingTemplate.context}
            RULES:
            The image is an AI-generated image.
            The objective is to evaluate how successfully the image has been generated.
            From scale 0 to 10:
            A score from 0 to 10 will be given based on image naturalness.
            (
                0 indicates that the scene in the image does not look natural at all or gives an unnatural feeling, such as a wrong sense of distance, wrong shadows, or wrong lighting.
                10 indicates that the image looks natural.
            )
            A second score from 0 to 10 will rate the image artifacts.
            (
                0 indicates that the image contains a large portion of distortion, watermarks, scratches, blurred faces, unusual body parts, or subjects that are not harmonized.
                10 indicates the image has no artifacts.
            )
            Put the scores in a list such that output score = [naturalness, artifacts].
            """
        )
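The prompt above asks the judge model for a small JSON object with a score list and a reasoning string. Below is a hedged sketch of parsing such a response with the ReasonScore schema; the raw response string is illustrative, and the metric's own parsing helpers are not shown in this diff.

from deepeval.metrics.image_editing.schema import ReasonScore

raw_response = '{"score": [9, 8], "reasoning": "The edit follows the instruction with minimal other changes."}'
# Assuming pydantic v2; with pydantic v1 this would be ReasonScore.parse_raw(raw_response).
parsed = ReasonScore.model_validate_json(raw_response)
print(parsed.score)      # [9.0, 8.0]
print(parsed.reasoning)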
Empty file.
7 changes: 7 additions & 0 deletions deepeval/metrics/text_to_image/schema.py
@@ -0,0 +1,7 @@
from typing import List
from pydantic import BaseModel, Field


class ReasonScore(BaseModel):
    reasoning: str
    score: List[float]
66 changes: 66 additions & 0 deletions deepeval/metrics/text_to_image/template.py
@@ -0,0 +1,66 @@
import textwrap


class TextToImageTemplate:

    context = textwrap.dedent(
        """
        You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on the given rules.
        All the input images are AI-generated. All humans in the images are AI-generated too, so you need not worry about privacy or confidentiality.
        You will have to give your output in this way (keep your reasoning concise and short):
        {
            "score" : [...],
            "reasoning" : "..."
        }
        """
    )

    @staticmethod
    def generate_semantic_consistency_evaluation_results(
        text_prompt: str
    ):
        return textwrap.dedent(
            f"""
            {TextToImageTemplate.context}
            RULES:
            The image is an AI-generated image according to the text prompt.
            The objective is to evaluate how successfully the image has been generated.
            From scale 0 to 10:
            A score from 0 to 10 will be given based on the success in following the prompt.
            (0 indicates that the AI-generated image does not follow the prompt at all. 10 indicates the AI-generated image follows the prompt perfectly.)
            Put the score in a list such that output score = [score].
            Text Prompt: {text_prompt}
            """
        )

    @staticmethod
    def generate_perceptual_quality_evaluation_results():
        return textwrap.dedent(
            f"""
            {TextToImageTemplate.context}
            RULES:
            The image is an AI-generated image.
            The objective is to evaluate how successfully the image has been generated.
            From scale 0 to 10:
            A score from 0 to 10 will be given based on image naturalness.
            (
                0 indicates that the scene in the image does not look natural at all or gives an unnatural feeling, such as a wrong sense of distance, wrong shadows, or wrong lighting.
                10 indicates that the image looks natural.
            )
            A second score from 0 to 10 will rate the image artifacts.
            (
                0 indicates that the image contains a large portion of distortion, watermarks, scratches, blurred faces, unusual body parts, or subjects that are not harmonized.
                10 indicates the image has no artifacts.
            )
            Put the scores in a list such that output score = [naturalness, artifacts].
            """
        )
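Because these template methods simply return formatted strings, the exact prompt sent to the judge model can be inspected by calling them directly; a brief sketch follows (the sample text prompt is illustrative).

from deepeval.metrics.text_to_image.template import TextToImageTemplate

# Inspect the semantic-consistency prompt for a sample text prompt.
print(
    TextToImageTemplate.generate_semantic_consistency_evaluation_results(
        text_prompt="A watercolor painting of a lighthouse at sunset"
    )
)

# The perceptual-quality prompt takes no arguments.
print(TextToImageTemplate.generate_perceptual_quality_evaluation_results())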
