Fix metadata verifier, add e2e test for evaluate_experiment
alexkuzmik committed Nov 22, 2024
1 parent cb2545e commit 516b471
Showing 3 changed files with 91 additions and 10 deletions.
3 changes: 2 additions & 1 deletion sdks/python/src/opik/__init__.py
@@ -8,14 +8,15 @@
 from .configurator.configure import configure
 from . import package_version
 from .plugins.pytest.decorator import llm_unit
-from .evaluation import evaluate
+from .evaluation import evaluate, evaluate_experiment

 _logging.setup()

 __version__ = package_version.VERSION
 __all__ = [
     "__version__",
     "evaluate",
+    "evaluate_experiment",
     "track",
     "flush_tracker",
     "Opik",
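For reference, a minimal usage sketch of the newly re-exported function, showing that it can now be called from the package root. The experiment name and metric name are placeholders, and the `from opik.evaluation import metrics` import path is an assumption based on the test below, not something this diff confirms.

import opik
from opik.evaluation import metrics  # assumed import path for the built-in metrics

# Re-score the items of an experiment previously created with opik.evaluate().
opik.evaluate_experiment(
    experiment_name="my-existing-experiment",  # placeholder: any existing experiment name
    scoring_metrics=[metrics.Equals(name="exact_match")],  # placeholder metric name
)
opik.flush_tracker()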
85 changes: 83 additions & 2 deletions sdks/python/tests/e2e/test_experiment.py
@@ -78,7 +78,7 @@ def task(item: Dict[str, Any]):
         experiment_name=evaluation_result.experiment_name,
         experiment_metadata={"model_name": "gpt-3.5"},
         traces_amount=3,  # one trace per dataset item
-        feedback_scores_amount=1,  # an average value of all Equals metric scores
+        feedback_scores_amount=1,
         prompt=prompt,
     )
     # TODO: check more content of the experiment
@@ -140,7 +140,7 @@ def task(item: dataset_item.DatasetItem):
         experiment_name=evaluation_result.experiment_name,
         experiment_metadata=None,
         traces_amount=1,  # one trace per dataset item
-        feedback_scores_amount=1,  # an average value of all Equals metric scores
+        feedback_scores_amount=1,
     )


@@ -254,3 +254,84 @@ def task(item: dataset_item.DatasetItem):
         traces_amount=1,
         feedback_scores_amount=0,
     )
+
+
+def test_evaluate_experiment__an_experiment_created_with_evaluate__then_new_scores_are_added_to_existing_experiment_items__amount_of_feedback_scores_increased(
+    opik_client: opik.Opik, dataset_name: str, experiment_name: str
+):
+    dataset = opik_client.create_dataset(dataset_name)
+
+    dataset.insert(
+        [
+            {
+                "input": {"question": "What is the of capital of France?"},
+                "expected_model_output": {"output": "Paris"},
+            },
+        ]
+    )
+
+    def task(item: Dict[str, Any]):
+        if item["input"] == {"question": "What is the of capital of France?"}:
+            return {
+                "output": "Paris",
+                "reference": item["expected_model_output"]["output"],
+            }
+
+
+        raise AssertionError(
+            f"Task received dataset item with an unexpected input: {item['input']}"
+        )
+
+    prompt = Prompt(
+        name=f"test-experiment-prompt-{_random_chars()}",
+        prompt=f"test-experiment-prompt-template-{_random_chars()}",
+    )
+
+    # Create the experiment first
+    evaluation_result = opik.evaluate(
+        dataset=dataset,
+        task=task,
+        scoring_metrics=[],
+        experiment_name=experiment_name,
+        experiment_config={
+            "model_name": "gpt-3.5",
+        },
+        prompt=prompt,
+    )
+    opik.flush_tracker()
+
+    verifiers.verify_experiment(
+        opik_client=opik_client,
+        id=evaluation_result.experiment_id,
+        experiment_name=evaluation_result.experiment_name,
+        experiment_metadata={
+            "model_name": "gpt-3.5",
+        },
+        traces_amount=1,
+        feedback_scores_amount=0,
+        prompt=prompt,
+    )
+
+    # Populate the existing experiment with a new feedback score
+    evaluation_result = opik.evaluate_experiment(
+        experiment_name=experiment_name,
+        scoring_metrics=[
+            metrics.Equals(name="metric1"),
+            metrics.Equals(name="metric2"),
+            metrics.Equals(name="metric3"),
+        ],
+    )
+    opik.flush_tracker()
+
+    verifiers.verify_experiment(
+        opik_client=opik_client,
+        id=evaluation_result.experiment_id,
+        experiment_name=evaluation_result.experiment_name,
+        experiment_metadata={
+            "model_name": "gpt-3.5",
+        },
+        traces_amount=1,
+        feedback_scores_amount=3,
+        prompt=prompt,
+    )
+
13 changes: 6 additions & 7 deletions sdks/python/tests/e2e/verifiers.py
@@ -1,4 +1,3 @@
-from copy import deepcopy
 from typing import Optional, Dict, Any, List
 import opik
 import json
@@ -236,12 +235,12 @@ def verify_experiment(

 def verify_experiment_metadata(
     experiment_content: ExperimentPublic,
-    metadata: Dict,
-):
-    experiment_metadata = deepcopy(experiment_content.metadata)
-    if experiment_metadata is None:
-        return
-    experiment_metadata.pop("prompt", None)
+    metadata: Optional[Dict[str, Any]],
+):
+    experiment_metadata = experiment_content.metadata
+    if experiment_content.metadata is not None:
+        experiment_metadata = {**experiment_content.metadata}
+        experiment_metadata.pop("prompt", None)

     assert experiment_metadata == metadata, f"{experiment_metadata} != {metadata}"

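To see what the verifiers.py change fixes, here is a minimal, self-contained sketch (hypothetical helper names, not the repository's code) contrasting the two behaviors: the old verifier returned early when the experiment had no metadata, so the assert was skipped and a mismatch against an expected non-None dict went unnoticed; the new verifier always runs the assert and only strips the injected "prompt" key when metadata is present.

from typing import Any, Dict, Optional


def old_verify(actual: Optional[Dict[str, Any]], expected: Optional[Dict[str, Any]]) -> None:
    # Old logic: the early return hides a None vs. non-None mismatch.
    if actual is None:
        return
    stripped = {**actual}
    stripped.pop("prompt", None)
    assert stripped == expected, f"{stripped} != {expected}"


def new_verify(actual: Optional[Dict[str, Any]], expected: Optional[Dict[str, Any]]) -> None:
    # New logic: the assert always runs; "prompt" is only popped when metadata exists.
    stripped = actual
    if actual is not None:
        stripped = {**actual}
        stripped.pop("prompt", None)
    assert stripped == expected, f"{stripped} != {expected}"


old_verify(None, {"model_name": "gpt-3.5"})  # silently passes (the bug)
try:
    new_verify(None, {"model_name": "gpt-3.5"})  # now fails as it should
except AssertionError:
    pass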
