diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/answer_relevance/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/answer_relevance/metric.py
index 9f0ecfdea1..e970cd9aec 100644
--- a/sdks/python/src/opik/evaluation/metrics/llm_judges/answer_relevance/metric.py
+++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/answer_relevance/metric.py
@@ -27,7 +27,8 @@ class AnswerRelevance(base_metric.BaseMetric):
     between 0.0 and 1.0, where higher values indicate better answer relevance.
 
     Args:
-        model: The language model to use for evaluation. Can be a string (model name) or a CometBaseModel instance.
+        model: The language model to use for evaluation. Can be a string (model name) or an `opik.evaluation.models.OpikBaseModel` subclass instance.
+            `opik.evaluation.models.LiteLLMChatModel` is used by default.
         name: The name of the metric. Defaults to "AnswerRelevanceMetric".
 
     Example:
diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/context_precision/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/context_precision/metric.py
index 56e15bee08..bd94911fa6 100644
--- a/sdks/python/src/opik/evaluation/metrics/llm_judges/context_precision/metric.py
+++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/context_precision/metric.py
@@ -26,7 +26,8 @@ class ContextPrecision(base_metric.BaseMetric):
     where higher values indicate better context precision.
 
     Args:
-        model: The language model to use for evaluation. Can be a string (model name) or a CometBaseModel instance.
+        model: The language model to use for evaluation. Can be a string (model name) or an `opik.evaluation.models.OpikBaseModel` subclass instance.
+            `opik.evaluation.models.LiteLLMChatModel` is used by default.
         name: The name of the metric. Defaults to "context_precision_metric".
         few_shot_examples: A list of few-shot examples to provide to the model. If None, uses the default few-shot examples.
diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/context_recall/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/context_recall/metric.py
index d6bbf94fb3..1d91124c2a 100644
--- a/sdks/python/src/opik/evaluation/metrics/llm_judges/context_recall/metric.py
+++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/context_recall/metric.py
@@ -27,7 +27,8 @@ class ContextRecall(base_metric.BaseMetric):
     where higher values indicate better context recall.
 
     Args:
-        model: The language model to use for evaluation. Can be a string (model name) or a CometBaseModel instance.
+        model: The language model to use for evaluation. Can be a string (model name) or an `opik.evaluation.models.OpikBaseModel` subclass instance.
+            `opik.evaluation.models.LiteLLMChatModel` is used by default.
         name: The name of the metric. Defaults to "ContextRecallMetric".
         few_shot_examples: A list of few-shot examples to provide to the model. If None, uses the default few-shot examples.
diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/factuality/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/factuality/metric.py
index b49404d013..e992275a1d 100644
--- a/sdks/python/src/opik/evaluation/metrics/llm_judges/factuality/metric.py
+++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/factuality/metric.py
@@ -30,7 +30,8 @@ class Factuality(base_metric.BaseMetric):
     where higher values indicate higher factual accuracy.
 
     Args:
-        model: The language model to use for factuality assessment. Can be a string (model name) or a CometBaseModel instance.
+        model: The language model to use for factuality assessment. Can be a string (model name) or an `opik.evaluation.models.OpikBaseModel` subclass instance.
+            `opik.evaluation.models.LiteLLMChatModel` is used by default.
         name: The name of the metric. Defaults to "FactualityMetric".
         few_shot_examples: A list of few-shot examples to be used in the query. If None, default examples will be used.
diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/metric.py
index b9ff0468aa..24ff0c6827 100644
--- a/sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/metric.py
+++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/metric.py
@@ -26,6 +26,19 @@ def __init__(
         model: Optional[Union[str, base_model.OpikBaseModel]] = None,
         name: str = "g_eval_metric",
     ):
+        """
+        A metric that evaluates an LLM output based on a chain of thought built with the evaluation criteria
+        provided by the user.
+
+        For more details see the original paper: https://arxiv.org/pdf/2303.16634
+
+        Args:
+            task_introduction: An instruction for the LLM, used to generate the evaluation chain of thought and in the evaluation call itself.
+            evaluation_criteria: The main task for the G-Eval metric, written in natural language.
+            model: The LLM to use for evaluation. Can be a string (model name) or an `opik.evaluation.models.OpikBaseModel` subclass instance.
+                `opik.evaluation.models.LiteLLMChatModel` is used by default.
+            name: The name of the metric.
+        """
         super().__init__(
             name=name,
         )
@@ -55,14 +68,25 @@ def _init_model(
 
     def score(
         self,
-        input: str,
+        output: str,
         **ignored_kwargs: Any,
     ) -> score_result.ScoreResult:
+        """
+        Calculate the G-Eval score for the given LLM's output.
+
+        Args:
+            output: The LLM's output to evaluate.
+            **ignored_kwargs: Additional keyword arguments that are ignored.
+
+        Returns:
+            score_result.ScoreResult: A ScoreResult object containing the G-Eval score
+            (between 0.0 and 1.0) and a reason for the score.
+        """
         llm_query = G_EVAL_QUERY_TEMPLATE.format(
             task_introduction=self.task_introduction,
             evaluation_criteria=self.evaluation_criteria,
             chain_of_thought=self.llm_chain_of_thought,
-            input=input,
+            input=output,
         )
 
         request = [
@@ -82,13 +106,24 @@ def score(
         return self._parse_model_output(model_output)
 
     async def ascore(
-        self, input: str, **ignored_kwargs: Any
+        self, output: str, **ignored_kwargs: Any
     ) -> score_result.ScoreResult:
+        """
+        Calculate the G-Eval score for the given LLM's output.
+
+        Args:
+            output: The LLM's output to evaluate.
+            **ignored_kwargs: Additional keyword arguments that are ignored.
+
+        Returns:
+            score_result.ScoreResult: A ScoreResult object containing the G-Eval score
+            (between 0.0 and 1.0) and a reason for the score.
+        """
         llm_query = G_EVAL_QUERY_TEMPLATE.format(
             task_introduction=self.task_introduction,
             evaluation_criteria=self.evaluation_criteria,
             chain_of_thought=self.llm_chain_of_thought,
-            input=input,
+            input=output,
         )
 
         request = [
diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/hallucination/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/hallucination/metric.py
index 1c61f0fea2..10f119836e 100644
--- a/sdks/python/src/opik/evaluation/metrics/llm_judges/hallucination/metric.py
+++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/hallucination/metric.py
@@ -26,7 +26,8 @@ class Hallucination(base_metric.BaseMetric):
     It returns a score of 1.0 if hallucination is detected, and 0.0 otherwise.
 
     Args:
-        model: The LLM to use for evaluation. Can be a string (model name) or a CometBaseModel instance.
+        model: The LLM to use for evaluation. Can be a string (model name) or an `opik.evaluation.models.OpikBaseModel` subclass instance.
+            `opik.evaluation.models.LiteLLMChatModel` is used by default.
         name: The name of the metric.
         few_shot_examples: A list of few-shot examples to use for hallucination detection. If None, default examples will be used.
diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py
index 5ff376b05d..543118b95f 100644
--- a/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py
+++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py
@@ -24,7 +24,8 @@ class Moderation(base_metric.BaseMetric):
     It returns a score between 0.0 and 1.0, where higher values indicate more appropriate content.
 
     Args:
-        model: The language model to use for moderation. Can be a string (model name) or a CometBaseModel instance.
+        model: The language model to use for moderation. Can be a string (model name) or an `opik.evaluation.models.OpikBaseModel` subclass instance.
+            `opik.evaluation.models.LiteLLMChatModel` is used by default.
         name: The name of the metric. Defaults to "moderation_metric".
         few_shot_examples: A list of few-shot examples to be used in the query. If None, default examples will be used.
diff --git a/sdks/python/src/opik/evaluation/models/__init__.py b/sdks/python/src/opik/evaluation/models/__init__.py
index e69de29bb2..45bf41d93d 100644
--- a/sdks/python/src/opik/evaluation/models/__init__.py
+++ b/sdks/python/src/opik/evaluation/models/__init__.py
@@ -0,0 +1,7 @@
+from .base_model import OpikBaseModel
+from .litellm_chat_model import LiteLLMChatModel
+
+__all__ = [
+    "OpikBaseModel",
+    "LiteLLMChatModel",
+]
diff --git a/sdks/python/src/opik/evaluation/models/base_model.py b/sdks/python/src/opik/evaluation/models/base_model.py
index 569c788172..c197ca4c3b 100644
--- a/sdks/python/src/opik/evaluation/models/base_model.py
+++ b/sdks/python/src/opik/evaluation/models/base_model.py
@@ -5,6 +5,9 @@ class OpikBaseModel(abc.ABC):
     """
     This class serves as an interface to LLMs.
+
+    If you want to implement a custom LLM provider for use in evaluation metrics,
+    you should inherit from this class.
     """
 
     def __init__(self, model_name: str):
diff --git a/sdks/python/src/opik/evaluation/models/litellm_chat_model.py b/sdks/python/src/opik/evaluation/models/litellm_chat_model.py
index aff83f0146..16c3eb0f63 100644
--- a/sdks/python/src/opik/evaluation/models/litellm_chat_model.py
+++ b/sdks/python/src/opik/evaluation/models/litellm_chat_model.py
@@ -20,10 +20,15 @@ def __init__(
     ) -> None:
         """
         Initializes the base model with a given model name.
-        You can find all possible completion_kwargs parameters here: https://docs.litellm.ai/docs/completion/input
+        Wraps the `litellm.completion` function.
+        You can find all possible completion_kwargs parameters here: https://docs.litellm.ai/docs/completion/input.
 
         Args:
             model_name: The name of the LLM model to be used.
+            must_support_arguments: A list of arguments that the model provider must support.
+                `litellm.get_supported_openai_params(model_name)` is called to get the
+                supported arguments; if any is missing, a ValueError is raised.
+            **completion_kwargs: Keyword arguments that are always passed to the `litellm.completion` function.
         """
         super().__init__(model_name=model_name)
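
Usage sketch (not part of the patch): the docstring changes above describe a shared contract in which every judge metric accepts either a model name string or an `opik.evaluation.models.OpikBaseModel` subclass instance, with `LiteLLMChatModel` used by default, and in which G-Eval's scoring methods now take `output` instead of `input`. The snippet below is a minimal illustration under those assumptions; the `GEval`/`Hallucination` class names and the `opik.evaluation.metrics` import path are not shown in this diff and are assumed here.

```python
# Illustrative only; assumes GEval and Hallucination are exported from
# opik.evaluation.metrics (the import path is not part of this diff).
from opik.evaluation.metrics import GEval, Hallucination
from opik.evaluation.models import LiteLLMChatModel

# Passing a model name string: the metric falls back to LiteLLMChatModel internally.
hallucination_metric = Hallucination(model="gpt-4o")

# Passing a pre-configured OpikBaseModel subclass instance instead of a string.
judge_model = LiteLLMChatModel(
    model_name="gpt-4o",
    must_support_arguments=["temperature"],  # ValueError if the provider lacks any of these
    temperature=0.0,                         # extra completion_kwargs forwarded to litellm.completion
)
g_eval_metric = GEval(
    task_introduction="You are an expert judge of summary quality.",
    evaluation_criteria="The summary must be faithful to the source document.",
    model=judge_model,
)

# Note the renamed parameter: score()/ascore() now take `output`, not `input`.
result = g_eval_metric.score(output="The summary being evaluated ...")
print(result.value, result.reason)
```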
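
The new note in `base_model.py` says custom LLM providers should subclass `OpikBaseModel`, but this diff only shows the `model_name` constructor argument, not which methods are abstract. The sketch below therefore assumes hypothetical `generate_string`/`agenerate_string` hooks purely for illustration; the actual abstract interface in `base_model.py` is authoritative.

```python
# Sketch of a custom provider, per the new OpikBaseModel docstring note.
# The method names generate_string / agenerate_string are assumptions, not taken from this diff.
import asyncio
from typing import Any

from opik.evaluation.models import OpikBaseModel


class CannedJudgeModel(OpikBaseModel):
    """Toy provider that always returns the same judgement (handy for offline tests)."""

    def __init__(self) -> None:
        super().__init__(model_name="canned-judge")  # the only argument shown in the diff

    def generate_string(self, input: str, **kwargs: Any) -> str:  # assumed hook
        return '{"score": 0.0, "reason": "canned judge always returns 0.0"}'

    async def agenerate_string(self, input: str, **kwargs: Any) -> str:  # assumed hook
        return self.generate_string(input, **kwargs)


if __name__ == "__main__":
    model = CannedJudgeModel()
    print(asyncio.run(model.agenerate_string("Is this output grounded in the context?")))
```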