diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/answer_relevance/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/answer_relevance/metric.py
index 9f0ecfdea1..e970cd9aec 100644
--- a/sdks/python/src/opik/evaluation/metrics/llm_judges/answer_relevance/metric.py
+++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/answer_relevance/metric.py
@@ -27,7 +27,8 @@ class AnswerRelevance(base_metric.BaseMetric):
     between 0.0 and 1.0, where higher values indicate better answer relevance.
 
     Args:
-        model: The language model to use for evaluation. Can be a string (model name) or a CometBaseModel instance.
+        model: The language model to use for evaluation. Can be a string (model name) or an `opik.evaluation.models.OpikBaseModel` subclass instance.
+            `opik.evaluation.models.LiteLLMChatModel` is used by default.
         name: The name of the metric. Defaults to "AnswerRelevanceMetric".
 
     Example:
diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/context_precision/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/context_precision/metric.py
index 56e15bee08..bd94911fa6 100644
--- a/sdks/python/src/opik/evaluation/metrics/llm_judges/context_precision/metric.py
+++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/context_precision/metric.py
@@ -26,7 +26,8 @@ class ContextPrecision(base_metric.BaseMetric):
     where higher values indicate better context precision.
 
     Args:
-        model: The language model to use for evaluation. Can be a string (model name) or a CometBaseModel instance.
+        model: The language model to use for evaluation. Can be a string (model name) or an `opik.evaluation.models.OpikBaseModel` subclass instance.
+            `opik.evaluation.models.LiteLLMChatModel` is used by default.
         name: The name of the metric. Defaults to "context_precision_metric".
         few_shot_examples: A list of few-shot examples to provide to the model. If None, uses the default few-shot examples.
diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/context_recall/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/context_recall/metric.py
index d6bbf94fb3..1d91124c2a 100644
--- a/sdks/python/src/opik/evaluation/metrics/llm_judges/context_recall/metric.py
+++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/context_recall/metric.py
@@ -27,7 +27,8 @@ class ContextRecall(base_metric.BaseMetric):
     where higher values indicate better context recall.
 
     Args:
-        model: The language model to use for evaluation. Can be a string (model name) or a CometBaseModel instance.
+        model: The language model to use for evaluation. Can be a string (model name) or an `opik.evaluation.models.OpikBaseModel` subclass instance.
+            `opik.evaluation.models.LiteLLMChatModel` is used by default.
         name: The name of the metric. Defaults to "ContextRecallMetric".
         few_shot_examples: A list of few-shot examples to provide to the model. If None, uses the default few-shot examples.
diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/factuality/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/factuality/metric.py
index b49404d013..e992275a1d 100644
--- a/sdks/python/src/opik/evaluation/metrics/llm_judges/factuality/metric.py
+++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/factuality/metric.py
@@ -30,7 +30,8 @@ class Factuality(base_metric.BaseMetric):
     where higher values indicate higher factual accuracy.
 
     Args:
-        model: The language model to use for factuality assessment. Can be a string (model name) or a CometBaseModel instance.
+        model: The language model to use for factuality assessment. Can be a string (model name) or an `opik.evaluation.models.OpikBaseModel` subclass instance.
+            `opik.evaluation.models.LiteLLMChatModel` is used by default.
         name: The name of the metric. Defaults to "FactualityMetric".
         few_shot_examples: A list of few-shot examples to be used in the query. If None, default examples will be used.
diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/metric.py
index b9ff0468aa..24ff0c6827 100644
--- a/sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/metric.py
+++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/metric.py
@@ -26,6 +26,19 @@ def __init__(
         model: Optional[Union[str, base_model.OpikBaseModel]] = None,
         name: str = "g_eval_metric",
     ):
+        """
+        A metric that evaluates an LLM output based on a chain of thought built with the evaluation criteria
+        provided by the user.
+
+        For more details see the original paper: https://arxiv.org/pdf/2303.16634
+
+        Args:
+            task_introduction: An instruction for the LLM, used to generate the evaluation chain of thought and in the evaluation call itself.
+            evaluation_criteria: The main task for the G-Eval metric, written in natural language.
+            model: The LLM to use for evaluation. Can be a string (model name) or an `opik.evaluation.models.OpikBaseModel` subclass instance.
+                `opik.evaluation.models.LiteLLMChatModel` is used by default.
+            name: The name of the metric.
+        """
         super().__init__(
             name=name,
         )
@@ -55,14 +68,25 @@ def _init_model(
 
     def score(
         self,
-        input: str,
+        output: str,
         **ignored_kwargs: Any,
     ) -> score_result.ScoreResult:
+        """
+        Calculate the G-Eval score for the given LLM's output.
+
+        Args:
+            output: The LLM's output to evaluate.
+            **ignored_kwargs: Additional keyword arguments that are ignored.
+
+        Returns:
+            score_result.ScoreResult: A ScoreResult object containing the G-Eval score
+            (between 0.0 and 1.0) and a reason for the score.
+        """
         llm_query = G_EVAL_QUERY_TEMPLATE.format(
             task_introduction=self.task_introduction,
             evaluation_criteria=self.evaluation_criteria,
             chain_of_thought=self.llm_chain_of_thought,
-            input=input,
+            input=output,
         )
 
         request = [
@@ -82,13 +106,24 @@ def score(
         return self._parse_model_output(model_output)
 
     async def ascore(
-        self, input: str, **ignored_kwargs: Any
+        self, output: str, **ignored_kwargs: Any
     ) -> score_result.ScoreResult:
+        """
+        Calculate the G-Eval score for the given LLM's output.
+
+        Args:
+            output: The LLM's output to evaluate.
+            **ignored_kwargs: Additional keyword arguments that are ignored.
+
+        Returns:
+            score_result.ScoreResult: A ScoreResult object containing the G-Eval score
+            (between 0.0 and 1.0) and a reason for the score.
+        """
         llm_query = G_EVAL_QUERY_TEMPLATE.format(
             task_introduction=self.task_introduction,
             evaluation_criteria=self.evaluation_criteria,
             chain_of_thought=self.llm_chain_of_thought,
-            input=input,
+            input=output,
         )
 
         request = [
diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/hallucination/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/hallucination/metric.py
index 1c61f0fea2..10f119836e 100644
--- a/sdks/python/src/opik/evaluation/metrics/llm_judges/hallucination/metric.py
+++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/hallucination/metric.py
@@ -26,7 +26,8 @@ class Hallucination(base_metric.BaseMetric):
     It returns a score of 1.0 if hallucination is detected, and 0.0 otherwise.
 
     Args:
-        model: The LLM to use for evaluation. Can be a string (model name) or a CometBaseModel instance.
+        model: The LLM to use for evaluation. Can be a string (model name) or an `opik.evaluation.models.OpikBaseModel` subclass instance.
+            `opik.evaluation.models.LiteLLMChatModel` is used by default.
         name: The name of the metric.
         few_shot_examples: A list of few-shot examples to use for hallucination detection. If None, default examples will be used.
diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py
index 5ff376b05d..543118b95f 100644
--- a/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py
+++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py
@@ -24,7 +24,8 @@ class Moderation(base_metric.BaseMetric):
     It returns a score between 0.0 and 1.0, where higher values indicate more appropriate content.
 
     Args:
-        model: The language model to use for moderation. Can be a string (model name) or a CometBaseModel instance.
+        model: The language model to use for moderation. Can be a string (model name) or an `opik.evaluation.models.OpikBaseModel` subclass instance.
+            `opik.evaluation.models.LiteLLMChatModel` is used by default.
         name: The name of the metric. Defaults to "moderation_metric".
         few_shot_examples: A list of few-shot examples to be used in the query. If None, default examples will be used.
diff --git a/sdks/python/src/opik/evaluation/models/__init__.py b/sdks/python/src/opik/evaluation/models/__init__.py
index e69de29bb2..45bf41d93d 100644
--- a/sdks/python/src/opik/evaluation/models/__init__.py
+++ b/sdks/python/src/opik/evaluation/models/__init__.py
@@ -0,0 +1,7 @@
+from .base_model import OpikBaseModel
+from .litellm_chat_model import LiteLLMChatModel
+
+__all__ = [
+    "OpikBaseModel",
+    "LiteLLMChatModel",
+]
diff --git a/sdks/python/src/opik/evaluation/models/base_model.py b/sdks/python/src/opik/evaluation/models/base_model.py
index 569c788172..c197ca4c3b 100644
--- a/sdks/python/src/opik/evaluation/models/base_model.py
+++ b/sdks/python/src/opik/evaluation/models/base_model.py
@@ -5,6 +5,9 @@ class OpikBaseModel(abc.ABC):
     """
     This class serves as an interface to LLMs.
+
+    If you want to implement a custom LLM provider for use in evaluation metrics,
+    you should inherit from this class.
     """
 
     def __init__(self, model_name: str):
diff --git a/sdks/python/src/opik/evaluation/models/litellm_chat_model.py b/sdks/python/src/opik/evaluation/models/litellm_chat_model.py
index aff83f0146..16c3eb0f63 100644
--- a/sdks/python/src/opik/evaluation/models/litellm_chat_model.py
+++ b/sdks/python/src/opik/evaluation/models/litellm_chat_model.py
@@ -20,10 +20,15 @@ def __init__(
     ) -> None:
         """
         Initializes the base model with a given model name.
-        You can find all possible completion_kwargs parameters here: https://docs.litellm.ai/docs/completion/input
+        Wraps the `litellm.completion` function.
+        You can find all possible completion_kwargs parameters here: https://docs.litellm.ai/docs/completion/input.
 
         Args:
             model_name: The name of the LLM model to be used.
+            must_support_arguments: A list of arguments that the model provider must support.
+                `litellm.get_supported_openai_params(model_name)` is called to get the
+                supported arguments; if any is missing, a ValueError is raised.
+            **completion_kwargs: Keyword arguments that are always passed to the `litellm.completion` function.
         """
         super().__init__(model_name=model_name)
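
Usage sketch (not part of the patch): the docstring changes above describe a shared contract in which every judge metric accepts either a model name string or an `opik.evaluation.models.OpikBaseModel` subclass instance, with `LiteLLMChatModel` used by default, and in which G-Eval's scoring methods now take `output` instead of `input`. The snippet below is a minimal illustration under those assumptions; the `GEval`/`Hallucination` class names and the `opik.evaluation.metrics` import path are not shown in this diff and are assumed here.

```python
# Illustrative only; assumes GEval and Hallucination are exported from
# opik.evaluation.metrics (the import path is not part of this diff).
from opik.evaluation.metrics import GEval, Hallucination
from opik.evaluation.models import LiteLLMChatModel

# Passing a model name string: the metric falls back to LiteLLMChatModel internally.
hallucination_metric = Hallucination(model="gpt-4o")

# Passing a pre-configured OpikBaseModel subclass instance instead of a string.
judge_model = LiteLLMChatModel(
    model_name="gpt-4o",
    must_support_arguments=["temperature"],  # ValueError if the provider lacks any of these
    temperature=0.0,                         # extra completion_kwargs forwarded to litellm.completion
)
g_eval_metric = GEval(
    task_introduction="You are an expert judge of summary quality.",
    evaluation_criteria="The summary must be faithful to the source document.",
    model=judge_model,
)

# Note the renamed parameter: score()/ascore() now take `output`, not `input`.
result = g_eval_metric.score(output="The summary being evaluated ...")
print(result.value, result.reason)
```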
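
The new note in `base_model.py` says custom LLM providers should subclass `OpikBaseModel`, but this diff only shows the `model_name` constructor argument, not which methods are abstract. The sketch below therefore assumes hypothetical `generate_string`/`agenerate_string` hooks purely for illustration; the actual abstract interface in `base_model.py` is authoritative.

```python
# Sketch of a custom provider, per the new OpikBaseModel docstring note.
# The method names generate_string / agenerate_string are assumptions, not taken from this diff.
import asyncio
from typing import Any

from opik.evaluation.models import OpikBaseModel


class CannedJudgeModel(OpikBaseModel):
    """Toy provider that always returns the same judgement (handy for offline tests)."""

    def __init__(self) -> None:
        super().__init__(model_name="canned-judge")  # the only argument shown in the diff

    def generate_string(self, input: str, **kwargs: Any) -> str:  # assumed hook
        return '{"score": 0.0, "reason": "canned judge always returns 0.0"}'

    async def agenerate_string(self, input: str, **kwargs: Any) -> str:  # assumed hook
        return self.generate_string(input, **kwargs)


if __name__ == "__main__":
    model = CannedJudgeModel()
    print(asyncio.run(model.agenerate_string("Is this output grounded in the context?")))
```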