[NA] improve models docstrings and expose them to public api (#688)
* Update the docstrings, add model classes to evaluation.metrics.__init__.__all__

* GEval metric - fix input arguments, provide missing docstrings

* Update docstrings

* Update docstring
alexkuzmik authored Nov 21, 2024
1 parent 59ad1d9 commit 6941b44
Showing 10 changed files with 67 additions and 11 deletions.
@@ -27,7 +27,8 @@ class AnswerRelevance(base_metric.BaseMetric):
         between 0.0 and 1.0, where higher values indicate better answer relevance.
     Args:
-        model: The language model to use for evaluation. Can be a string (model name) or a CometBaseModel instance.
+        model: The language model to use for evaluation. Can be a string (model name) or an `opik.evaluation.models.OpikBaseModel` subclass instance.
+            `opik.evaluation.models.LiteLLMChatModel` is used by default.
         name: The name of the metric. Defaults to "AnswerRelevanceMetric".
     Example:
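For readers of this diff, a minimal usage sketch of the documented `model` argument. It is not part of the commit and assumes `AnswerRelevance` is exported from `opik.evaluation.metrics` and that `score()` accepts `input`, `output`, and `context` keyword arguments.

from opik.evaluation.metrics import AnswerRelevance

# Omitting `model` falls back to the default LiteLLMChatModel;
# a model name string such as "gpt-4o" is also accepted.
metric = AnswerRelevance(model="gpt-4o")

result = metric.score(
    input="What is the capital of France?",
    output="The capital of France is Paris.",
    context=["France is a country in Europe. Its capital city is Paris."],
)
print(result.value, result.reason)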
@@ -26,7 +26,8 @@ class ContextPrecision(base_metric.BaseMetric):
         where higher values indicate better context precision.
     Args:
-        model: The language model to use for evaluation. Can be a string (model name) or a CometBaseModel instance.
+        model: The language model to use for evaluation. Can be a string (model name) or an `opik.evaluation.models.OpikBaseModel` subclass instance.
+            `opik.evaluation.models.LiteLLMChatModel` is used by default.
         name: The name of the metric. Defaults to "context_precision_metric".
         few_shot_examples: A list of few-shot examples to provide to the model. If None, uses the default few-shot examples.
@@ -27,7 +27,8 @@ class ContextRecall(base_metric.BaseMetric):
         where higher values indicate better context recall.
     Args:
-        model: The language model to use for evaluation. Can be a string (model name) or a CometBaseModel instance.
+        model: The language model to use for evaluation. Can be a string (model name) or an `opik.evaluation.models.OpikBaseModel` subclass instance.
+            `opik.evaluation.models.LiteLLMChatModel` is used by default.
         name: The name of the metric. Defaults to "ContextRecallMetric".
         few_shot_examples: A list of few-shot examples to provide to the model. If None, uses the default few-shot examples.
@@ -30,7 +30,8 @@ class Factuality(base_metric.BaseMetric):
         where higher values indicate higher factual accuracy.
     Args:
-        model: The language model to use for factuality assessment. Can be a string (model name) or a CometBaseModel instance.
+        model: The language model to use for factuality assessment. Can be a string (model name) or an `opik.evaluation.models.OpikBaseModel` subclass instance.
+            `opik.evaluation.models.LiteLLMChatModel` is used by default.
         name: The name of the metric. Defaults to "FactualityMetric".
         few_shot_examples: A list of few-shot examples to be used in the query. If None, default examples will be used.
@@ -26,6 +26,19 @@ def __init__(
         model: Optional[Union[str, base_model.OpikBaseModel]] = None,
         name: str = "g_eval_metric",
     ):
+        """
+        A metric that evaluates an LLM output based on a chain of thought built with the evaluation criteria
+        provided by the user.
+
+        For more details see the original paper: https://arxiv.org/pdf/2303.16634
+
+        Args:
+            task_introduction: An instruction for the LLM, used both to generate the evaluation chain of thought and in the evaluation call itself.
+            evaluation_criteria: The main task for the G-Eval metric, written in human language.
+            model: The LLM to use for evaluation. Can be a string (model name) or an `opik.evaluation.models.OpikBaseModel` subclass instance.
+                `opik.evaluation.models.LiteLLMChatModel` is used by default.
+            name: The name of the metric.
+        """
         super().__init__(
             name=name,
         )
@@ -55,14 +68,25 @@ def _init_model(

     def score(
         self,
-        input: str,
+        output: str,
         **ignored_kwargs: Any,
     ) -> score_result.ScoreResult:
+        """
+        Calculate the G-Eval score for the given LLM's output.
+
+        Args:
+            output: The LLM's output to evaluate.
+            **ignored_kwargs: Additional keyword arguments that are ignored.
+
+        Returns:
+            score_result.ScoreResult: A ScoreResult object containing the G-Eval score
+            (between 0.0 and 1.0) and a reason for the score.
+        """
         llm_query = G_EVAL_QUERY_TEMPLATE.format(
             task_introduction=self.task_introduction,
             evaluation_criteria=self.evaluation_criteria,
             chain_of_thought=self.llm_chain_of_thought,
-            input=input,
+            input=output,
         )

         request = [
@@ -82,13 +106,24 @@ def score(
         return self._parse_model_output(model_output)

     async def ascore(
-        self, input: str, **ignored_kwargs: Any
+        self, output: str, **ignored_kwargs: Any
     ) -> score_result.ScoreResult:
+        """
+        Calculate the G-Eval score for the given LLM's output.
+
+        Args:
+            output: The LLM's output to evaluate.
+            **ignored_kwargs: Additional keyword arguments that are ignored.
+
+        Returns:
+            score_result.ScoreResult: A ScoreResult object containing the G-Eval score
+            (between 0.0 and 1.0) and a reason for the score.
+        """
         llm_query = G_EVAL_QUERY_TEMPLATE.format(
             task_introduction=self.task_introduction,
             evaluation_criteria=self.evaluation_criteria,
             chain_of_thought=self.llm_chain_of_thought,
-            input=input,
+            input=output,
        )

         request = [
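A hedged usage sketch for the updated GEval signature, not part of the commit; it assumes `GEval` is exported from `opik.evaluation.metrics`. The constructor arguments and the `output`-only `score()` call mirror the docstrings added above.

from opik.evaluation.metrics import GEval

metric = GEval(
    task_introduction="You are an expert judge assessing whether a summary is faithful to its source text.",
    evaluation_criteria="The summary must not contain claims that are unsupported by the source text.",
)

# After this change, score()/ascore() receive the text to judge via `output` (not `input`).
result = metric.score(output="SOURCE: ...\nSUMMARY: ...")
print(result.value, result.reason)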
@@ -26,7 +26,8 @@ class Hallucination(base_metric.BaseMetric):
         It returns a score of 1.0 if hallucination is detected, and 0.0 otherwise.
     Args:
-        model: The LLM to use for evaluation. Can be a string (model name) or a CometBaseModel instance.
+        model: The LLM to use for evaluation. Can be a string (model name) or an `opik.evaluation.models.OpikBaseModel` subclass instance.
+            `opik.evaluation.models.LiteLLMChatModel` is used by default.
         name: The name of the metric.
         few_shot_examples: A list of few-shot examples to use for hallucination detection. If None, default examples will be used.
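For the `OpikBaseModel` path of the same argument, a hedged sketch that passes an explicit `LiteLLMChatModel` instance instead of a model name string. It is not part of the commit, and the `score()` keyword arguments `input`, `output`, and `context` are assumptions.

from opik.evaluation.metrics import Hallucination
from opik.evaluation.models import LiteLLMChatModel

# Any OpikBaseModel subclass instance can be used instead of a model name string.
model = LiteLLMChatModel(model_name="gpt-4o-mini")
metric = Hallucination(model=model)

result = metric.score(
    input="When was the Eiffel Tower completed?",
    output="The Eiffel Tower was completed in 1889.",
    context=["The Eiffel Tower was completed in March 1889."],
)
print(result.value, result.reason)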
@@ -24,7 +24,8 @@ class Moderation(base_metric.BaseMetric):
         It returns a score between 0.0 and 1.0, where higher values indicate more appropriate content.
     Args:
-        model: The language model to use for moderation. Can be a string (model name) or a CometBaseModel instance.
+        model: The language model to use for moderation. Can be a string (model name) or an `opik.evaluation.models.OpikBaseModel` subclass instance.
+            `opik.evaluation.models.LiteLLMChatModel` is used by default.
         name: The name of the metric. Defaults to "moderation_metric".
         few_shot_examples: A list of few-shot examples to be used in the query. If None, default examples will be used.
7 changes: 7 additions & 0 deletions sdks/python/src/opik/evaluation/models/__init__.py
@@ -0,0 +1,7 @@
+from .base_model import OpikBaseModel
+from .litellm_chat_model import LiteLLMChatModel
+
+__all__ = [
+    "OpikBaseModel",
+    "LiteLLMChatModel",
+]
3 changes: 3 additions & 0 deletions sdks/python/src/opik/evaluation/models/base_model.py
@@ -5,6 +5,9 @@
 class OpikBaseModel(abc.ABC):
     """
     This class serves as an interface to LLMs.
+
+    If you want to implement a custom LLM provider for evaluation metrics,
+    you should inherit from this class.
     """

     def __init__(self, model_name: str):
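To illustrate the "inherit from this class" note, a hedged sketch of a custom provider. The abstract method names below are assumptions for illustration only; this diff shows nothing beyond `__init__`, so check `base_model.py` for the methods that actually have to be overridden.

from typing import Any

from opik.evaluation.models import OpikBaseModel


class InHouseModel(OpikBaseModel):
    """Hypothetical provider backed by an internal HTTP endpoint."""

    def __init__(self, model_name: str, endpoint: str):
        super().__init__(model_name=model_name)
        self._endpoint = endpoint

    def generate_string(self, input: str, **kwargs: Any) -> str:
        # Placeholder: POST `input` to self._endpoint and return the completion text.
        raise NotImplementedError

    async def agenerate_string(self, input: str, **kwargs: Any) -> str:
        # Placeholder async variant.
        raise NotImplementedError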
7 changes: 6 additions & 1 deletion sdks/python/src/opik/evaluation/models/litellm_chat_model.py
@@ -20,10 +20,15 @@ def __init__(
     ) -> None:
         """
         Initializes the base model with a given model name.
-        You can find all possible completion_kwargs parameters here: https://docs.litellm.ai/docs/completion/input
+        Wraps the `litellm.completion` function.
+        You can find all possible completion_kwargs parameters here: https://docs.litellm.ai/docs/completion/input.
+
         Args:
             model_name: The name of the LLM model to be used.
             must_support_arguments: A list of arguments that the provider must support.
+                The `litellm.get_supported_openai_params(model_name)` call is used to get
+                the supported arguments. If any is missing, a ValueError is raised.
+            **completion_kwargs: Key-value arguments to always pass additionally into the `litellm.completion` function.
         """

         super().__init__(model_name=model_name)
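Finally, a hedged sketch of the constructor described above. The extra keyword argument (here `temperature`) stands in for the `completion_kwargs` forwarded to every `litellm.completion` call, and the model name is only an example value.

from opik.evaluation.models import LiteLLMChatModel

model = LiteLLMChatModel(
    model_name="gpt-4o-mini",
    # Raise ValueError at construction time if the provider lacks these parameters.
    must_support_arguments=["temperature", "response_format"],
    # Forwarded to every litellm.completion call.
    temperature=0.0,
)

The resulting instance can then be passed as the `model` argument of any of the metrics updated in this commit.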
