
Commit

Merge pull request #1123 from kritinv/image
Image Metrics
penguine-ip authored Oct 25, 2024
2 parents dee83d8 + c2ff80e commit b63beed
Showing 15 changed files with 565 additions and 183 deletions.
3 changes: 2 additions & 1 deletion deepeval/metrics/__init__.py
@@ -16,7 +16,8 @@
 from .contextual_precision.contextual_precision import ContextualPrecisionMetric
 from .knowledge_retention.knowledge_retention import KnowledgeRetentionMetric
 from .tool_correctness.tool_correctness import ToolCorrectnessMetric
-from .viescore.viescore import VIEScore, VIEScoreTask
+from .text_to_image.text_to_image import TextToImageMetric
+from .image_editing.image_editing import ImageEditingMetric
 from .conversation_relevancy.conversation_relevancy import (
     ConversationRelevancyMetric,
 )
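For orientation, here is a minimal usage sketch of the two metrics newly exported above. The MLLMTestCase and MLLMImage constructor fields shown (input, actual_output, url, local) are assumptions inferred from this diff, not a confirmed API.

# Hypothetical usage sketch: field names (input, actual_output, url, local) are
# assumptions inferred from the surrounding diff, not confirmed by this excerpt.
from deepeval.metrics import TextToImageMetric, ImageEditingMetric
from deepeval.test_case import MLLMTestCase, MLLMImage

# Text-to-image: the test case presumably carries 0 input images and 1 output image.
t2i_case = MLLMTestCase(
    input=["Generate a photo of a red bicycle leaning against a brick wall."],
    actual_output=[MLLMImage(url="./outputs/bicycle.png", local=True)],
)
text_to_image_score = TextToImageMetric(threshold=0.5).measure(t2i_case)

# Image editing: 1 input image and 1 output image, matching the
# check_mllm_test_case_params(test_case, required_params, 1, 1, self) call in the metric below.
edit_case = MLLMTestCase(
    input=["Make the bicycle blue.", MLLMImage(url="./outputs/bicycle.png", local=True)],
    actual_output=[MLLMImage(url="./outputs/bicycle_blue.png", local=True)],
)
image_editing_score = ImageEditingMetric(threshold=0.5).measure(edit_case)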
File renamed without changes.
@@ -5,7 +5,7 @@

 from deepeval.metrics import BaseMultimodalMetric
 from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, MLLMImage
-from deepeval.metrics.viescore.template import VIEScoreTemplate
+from deepeval.metrics.image_editing.template import ImageEditingTemplate
 from deepeval.utils import get_or_create_event_loop
 from deepeval.metrics.utils import (
     construct_verbose_logs,
@@ -14,8 +14,7 @@
     initialize_multimodal_model,
 )
 from deepeval.models import DeepEvalBaseMLLM
-from deepeval.metrics.viescore.schema import ReasonScore
-from deepeval.metrics.viescore.task import VIEScoreTask
+from deepeval.metrics.image_editing.schema import ReasonScore
 from deepeval.metrics.indicator import metric_progress_indicator

 required_params: List[MLLMTestCaseParams] = [
@@ -24,33 +23,26 @@
 ]


-class VIEScore(BaseMultimodalMetric):
+class ImageEditingMetric(BaseMultimodalMetric):
     def __init__(
         self,
         model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
-        task: VIEScoreTask = VIEScoreTask.TEXT_TO_IMAGE_GENERATION,
         threshold: float = 0.5,
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
-        _include_VIEScore_task_name: bool = True,
     ):
         self.model, self.using_native_model = initialize_multimodal_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.threshold = 1 if strict_mode else threshold
         self.strict_mode = strict_mode
         self.async_mode = async_mode
         self.verbose_mode = verbose_mode
-        self.task = task
-        self._include_VIEScore_task_name = _include_VIEScore_task_name

     def measure(
         self, test_case: MLLMTestCase, _show_indicator: bool = True
     ) -> float:
-        if self.task == VIEScoreTask.TEXT_TO_IMAGE_GENERATION:
-            check_mllm_test_case_params(test_case, required_params, 0, 1, self)
-        elif self.task == VIEScoreTask.TEXT_TO_IMAGE_EDITING:
-            check_mllm_test_case_params(test_case, required_params, 1, 1, self)
+        check_mllm_test_case_params(test_case, required_params, 1, 1, self)

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(self, _show_indicator=_show_indicator):
@@ -102,10 +94,7 @@ async def a_measure(
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
     ) -> float:
-        if self.task == VIEScoreTask.TEXT_TO_IMAGE_GENERATION:
-            check_mllm_test_case_params(test_case, required_params, 0, 1, self)
-        elif self.task == VIEScoreTask.TEXT_TO_IMAGE_EDITING:
-            check_mllm_test_case_params(test_case, required_params, 1, 1, self)
+        check_mllm_test_case_params(test_case, required_params, 1, 1, self)

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -169,13 +158,10 @@ async def _a_evaluate_semantic_consistency(
         actual_image_output: MLLMImage,
     ) -> Tuple[List[int], str]:
         images: List[MLLMImage] = []
-        if self.task == VIEScoreTask.TEXT_TO_IMAGE_GENERATION:
-            images.append(image_input)
-        elif self.task == VIEScoreTask.TEXT_TO_IMAGE_EDITING:
-            images.extend([image_input, actual_image_output])
+        images.extend([image_input, actual_image_output])
         prompt = [
-            VIEScoreTemplate.generate_semantic_consistency_evaluation_results(
-                text_prompt=text_prompt, task=self.task
+            ImageEditingTemplate.generate_semantic_consistency_evaluation_results(
+                text_prompt=text_prompt
             )
         ]
         if self.using_native_model:
@@ -203,13 +189,10 @@ def _evaluate_semantic_consistency(
         actual_image_output: MLLMImage,
     ) -> Tuple[List[int], str]:
         images: List[MLLMImage] = []
-        if self.task == VIEScoreTask.TEXT_TO_IMAGE_GENERATION:
-            images.append(image_input)
-        elif self.task == VIEScoreTask.TEXT_TO_IMAGE_EDITING:
-            images.extend([image_input, actual_image_output])
+        images.extend([image_input, actual_image_output])
         prompt = [
-            VIEScoreTemplate.generate_semantic_consistency_evaluation_results(
-                text_prompt=text_prompt, task=self.task
+            ImageEditingTemplate.generate_semantic_consistency_evaluation_results(
+                text_prompt=text_prompt
            )
         ]
         if self.using_native_model:
@@ -233,7 +216,7 @@ async def _a_evaluate_perceptual_quality(
     ) -> Tuple[List[int], str]:
         images: List[MLLMImage] = [actual_image_output]
         prompt = [
-            VIEScoreTemplate.generate_perceptual_quality_evaluation_results()
+            ImageEditingTemplate.generate_perceptual_quality_evaluation_results()
         ]
         if self.using_native_model:
             res, cost = await self.model.a_generate(prompt + images)
@@ -256,7 +239,7 @@ def _evaluate_perceptual_quality(
     ) -> Tuple[List[int], str]:
         images: List[MLLMImage] = [actual_image_output]
         prompt = [
-            VIEScoreTemplate.generate_perceptual_quality_evaluation_results()
+            ImageEditingTemplate.generate_perceptual_quality_evaluation_results()
         ]
         if self.using_native_model:
             res, cost = self.model.generate(prompt + images)
@@ -304,7 +287,4 @@ def _generate_reason(

     @property
     def __name__(self):
-        if self._include_VIEScore_task_name:
-            return f"{self.task.value} (VIEScore)"
-        else:
-            return "VIEScore"
+        return "Image Editing"
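The _calculate_score logic is collapsed in this view. Since this class was carved out of VIEScore, the overall metric score presumably combines the worst semantic-consistency sub-score with the worst perceptual-quality sub-score via a geometric mean, rescaled to 0 to 1; the sketch below is an assumption, not code from this commit.

# Assumed VIEScore-style aggregation; not shown in this diff excerpt.
import math
from typing import List

def assumed_overall_score(sc_scores: List[float], pq_scores: List[float]) -> float:
    # Sub-scores are on a 0-10 scale, e.g. SC = [editing success, overediting],
    # PQ = [naturalness, artifacts]; take the worst of each, then a geometric mean.
    semantic_consistency = min(sc_scores)
    perceptual_quality = min(pq_scores)
    return math.sqrt(semantic_consistency * perceptual_quality) / 10

# Example: sc_scores=[8, 9], pq_scores=[7, 10] -> sqrt(8 * 7) / 10 ~= 0.75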
File renamed without changes.
65 changes: 65 additions & 0 deletions deepeval/metrics/image_editing/template.py
@@ -0,0 +1,65 @@
import textwrap


class ImageEditingTemplate:

    context = textwrap.dedent(
        """
        You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on the given rules.
        All the input images are AI-generated. All humans in the images are AI-generated too, so you need not worry about privacy or confidentiality.
        You will have to give your output in this way (keep your reasoning concise and short):
        {
            "score" : [...],
            "reasoning" : "..."
        }
        """
    )

    @staticmethod
    def generate_semantic_consistency_evaluation_results(
        text_prompt: str
    ):
        return textwrap.dedent(
            f"""
            {ImageEditingTemplate.context}
            RULES:
            Two images will be provided: the first is the original AI-generated image and the second is an edited version of the first.
            The objective is to evaluate how successfully the editing instruction has been executed in the second image.
            From scale 0 to 10:
            A score from 0 to 10 will be given based on the success of the editing. (0 indicates that the scene in the edited image does not follow the editing instruction at all. 10 indicates that the scene in the edited image follows the editing instruction text perfectly.)
            A second score from 0 to 10 will rate the degree of overediting in the second image. (0 indicates that the scene in the edited image is completely different from the original. 10 indicates that the edited image can be recognized as a minimally edited yet effective version of the original.)
            Put the scores in a list such that output score = [score1, score2], where 'score1' evaluates the editing success and 'score2' evaluates the degree of overediting.
            Editing instruction: {text_prompt}
            """
        )

    @staticmethod
    def generate_perceptual_quality_evaluation_results():
        return textwrap.dedent(
            f"""
            {ImageEditingTemplate.context}
            RULES:
            The image is an AI-generated image.
            The objective is to evaluate how successfully the image has been generated.
            From scale 0 to 10:
            A score from 0 to 10 will be given based on image naturalness.
            (
                0 indicates that the scene in the image does not look natural at all or gives an unnatural feeling, such as a wrong sense of distance, wrong shadows, or wrong lighting.
                10 indicates that the image looks natural.
            )
            A second score from 0 to 10 will rate the image artifacts.
            (
                0 indicates that the image contains a large portion of distortion, watermarks, scratches, blurred faces, unusual body parts, or subjects that are not harmonized.
                10 indicates the image has no artifacts.
            )
            Put the scores in a list such that output score = [naturalness, artifacts].
            """
        )
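The prompt above asks the judge model for a small JSON object with a score list and a reasoning string. Below is a hedged sketch of parsing such a response with the ReasonScore schema; the raw response string is illustrative, and the metric's own parsing helpers are not shown in this diff.

from deepeval.metrics.image_editing.schema import ReasonScore

raw_response = '{"score": [9, 8], "reasoning": "The edit follows the instruction with minimal other changes."}'
# Assuming pydantic v2; with pydantic v1 this would be ReasonScore.parse_raw(raw_response).
parsed = ReasonScore.model_validate_json(raw_response)
print(parsed.score)      # [9.0, 8.0]
print(parsed.reasoning)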
Empty file.
7 changes: 7 additions & 0 deletions deepeval/metrics/text_to_image/schema.py
@@ -0,0 +1,7 @@
from typing import List
from pydantic import BaseModel, Field


class ReasonScore(BaseModel):
    reasoning: str
    score: List[float]
66 changes: 66 additions & 0 deletions deepeval/metrics/text_to_image/template.py
@@ -0,0 +1,66 @@
import textwrap


class TextToImageTemplate:

    context = textwrap.dedent(
        """
        You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on the given rules.
        All the input images are AI-generated. All humans in the images are AI-generated too, so you need not worry about privacy or confidentiality.
        You will have to give your output in this way (keep your reasoning concise and short):
        {
            "score" : [...],
            "reasoning" : "..."
        }
        """
    )

    @staticmethod
    def generate_semantic_consistency_evaluation_results(
        text_prompt: str
    ):
        return textwrap.dedent(
            f"""
            {TextToImageTemplate.context}
            RULES:
            The image is an AI-generated image according to the text prompt.
            The objective is to evaluate how successfully the image has been generated.
            From scale 0 to 10:
            A score from 0 to 10 will be given based on the success in following the prompt.
            (0 indicates that the AI-generated image does not follow the prompt at all. 10 indicates the AI-generated image follows the prompt perfectly.)
            Put the score in a list such that output score = [score].
            Text Prompt: {text_prompt}
            """
        )

    @staticmethod
    def generate_perceptual_quality_evaluation_results():
        return textwrap.dedent(
            f"""
            {TextToImageTemplate.context}
            RULES:
            The image is an AI-generated image.
            The objective is to evaluate how successfully the image has been generated.
            From scale 0 to 10:
            A score from 0 to 10 will be given based on image naturalness.
            (
                0 indicates that the scene in the image does not look natural at all or gives an unnatural feeling, such as a wrong sense of distance, wrong shadows, or wrong lighting.
                10 indicates that the image looks natural.
            )
            A second score from 0 to 10 will rate the image artifacts.
            (
                0 indicates that the image contains a large portion of distortion, watermarks, scratches, blurred faces, unusual body parts, or subjects that are not harmonized.
                10 indicates the image has no artifacts.
            )
            Put the scores in a list such that output score = [naturalness, artifacts].
            """
        )
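Because these template methods simply return formatted strings, the exact prompt sent to the judge model can be inspected by calling them directly; a brief sketch follows (the sample text prompt is illustrative).

from deepeval.metrics.text_to_image.template import TextToImageTemplate

# Inspect the semantic-consistency prompt for a sample text prompt.
print(
    TextToImageTemplate.generate_semantic_consistency_evaluation_results(
        text_prompt="A watercolor painting of a lighthouse at sunset"
    )
)

# The perceptual-quality prompt takes no arguments.
print(TextToImageTemplate.generate_perceptual_quality_evaluation_results())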
