-
Notifications
You must be signed in to change notification settings - Fork 232
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Add e2e test for running tracked function * Replace low level api client usage with Opik class usage in e2e tests. Add test for dataset creation and population. * Add missing files, add e2e test for dataset * Add comparison for the amount of dataset items * Refactor testlib * Refactor e2e tests, implement new feedback tests and experiment test * Remove debug statement from opik.__init__, add evaluate to __all__ * Update experiment e2e test, add EvaluationResult object as return value of evaluate * Make e2e tests run against any configured backend * Fix import error
- Loading branch information
1 parent
7ab906b
commit 81dee5e
Showing
25 changed files
with
967 additions
and
209 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
from typing import List | ||
|
||
import dataclasses | ||
|
||
from . import test_result | ||
|
||
|
||
@dataclasses.dataclass
class EvaluationResult:
    """Container pairing an experiment's id and name with its test results."""

    # Identifier of the experiment.
    experiment_id: str
    # Name of the experiment.
    experiment_name: str
    # One TestResult entry per collected result.
    test_results: List[test_result.TestResult]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
import os | ||
import random | ||
import string | ||
|
||
import opik | ||
import opik.api_objects.opik_client | ||
|
||
import pytest | ||
|
||
|
||
def _random_chars(n: int = 6) -> str:
    """Return a string of ``n`` randomly chosen ASCII letters.

    Letters are drawn one at a time from ``string.ascii_letters`` (upper and
    lower case) using ``random.choice``.
    """
    alphabet = string.ascii_letters
    picked = [random.choice(alphabet) for _ in range(n)]
    return "".join(picked)
|
||
|
||
@pytest.fixture(scope="session")
def configure_e2e_tests_env():
    """Session-scoped setup: set the OPIK_PROJECT_NAME environment variable."""
    project_name = "e2e-tests"
    os.environ["OPIK_PROJECT_NAME"] = project_name
    # Left disabled on purpose; uncomment to override the backend URL:
    # os.environ["OPIK_URL_OVERRIDE"] = "http://localhost:5173/api"
|
||
|
||
@pytest.fixture()
def opik_client(configure_e2e_tests_env, shutdown_cached_client_after_test):
    """Yield a newly constructed ``Opik`` client and call ``end()`` on it afterwards.

    The two fixture arguments are requested only so that they are set up
    before the client is created; their values are not used here.
    """
    client = opik.api_objects.opik_client.Opik()

    yield client

    # Teardown: runs once the requesting test has finished.
    client.end()
|
||
|
||
@pytest.fixture
def dataset_name(opik_client: opik.Opik):
    """Yield a randomized dataset name and delete that dataset after the test."""
    suffix = _random_chars()
    generated_name = f"e2e-tests-dataset-{suffix}"

    yield generated_name

    # Teardown: remove the dataset under this name.
    opik_client.delete_dataset(generated_name)
|
||
|
||
@pytest.fixture
def experiment_name(opik_client: opik.Opik):
    """Yield a randomized experiment name.

    No cleanup is performed after the test yet (see the TODO below).
    """
    suffix = _random_chars()
    generated_name = f"e2e-tests-experiment-{suffix}"

    yield generated_name

    # TODO: delete the experiment
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import opik | ||
from . import verifiers | ||
from opik.api_objects.dataset import dataset_item | ||
|
||
|
||
def test_create_and_populate_dataset__happyflow(
    opik_client: opik.Opik, dataset_name: str
):
    """Create a dataset, insert three items and hand the expectation to the verifier.

    The same question/answer pairs drive both the inserted dictionaries and
    the expected ``DatasetItem`` objects passed to ``verifiers.verify_dataset``.
    """
    DESCRIPTION = "E2E test dataset"

    # (question, expected answer) pairs shared by the insert payload and the
    # expected items below.
    qa_pairs = [
        ("What is the of capital of France?", "Paris"),
        ("What is the of capital of Germany?", "Berlin"),
        ("What is the of capital of Poland?", "Warsaw"),
    ]

    dataset = opik_client.create_dataset(dataset_name, description=DESCRIPTION)

    payload = [
        {
            "input": {"question": question},
            "expected_output": {"output": answer},
        }
        for question, answer in qa_pairs
    ]
    dataset.insert(payload)

    EXPECTED_DATASET_ITEMS = [
        dataset_item.DatasetItem(
            input={"question": question},
            expected_output={"output": answer},
        )
        for question, answer in qa_pairs
    ]

    verifiers.verify_dataset(
        opik_client=opik_client,
        name=dataset_name,
        description=DESCRIPTION,
        dataset_items=EXPECTED_DATASET_ITEMS,
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
import opik | ||
|
||
from opik.api_objects.dataset import dataset_item | ||
from opik.evaluation import metrics | ||
from . import verifiers | ||
|
||
|
||
def test_experiment_creation_via_evaluate_function__happyflow(
    opik_client: opik.Opik, dataset_name: str, experiment_name: str
):
    """Run ``opik.evaluate`` over a three-item dataset and verify the experiment.

    The task answers two questions with the expected value and one ("Poland")
    with a different value, and the result is scored with ``metrics.Equals``.
    """
    # TODO: this test is not finished, it only checks that the script is not failing

    # (question, expected answer stored in the dataset, answer the task gives)
    cases = [
        ("What is the of capital of France?", "Paris", "Paris"),
        ("What is the of capital of Germany?", "Berlin", "Berlin"),
        ("What is the of capital of Poland?", "Warsaw", "Krakow"),
    ]

    dataset = opik_client.create_dataset(dataset_name)

    payload = [
        {
            "input": {"question": question},
            "expected_output": {"output": expected},
        }
        for question, expected, _ in cases
    ]
    dataset.insert(payload)

    def task(item: dataset_item.DatasetItem):
        # Match on the whole input dict, in the order the cases are listed.
        for question, _, task_answer in cases:
            if item.input == {"question": question}:
                return {
                    "output": task_answer,
                    "reference": item.expected_output["output"],
                }

        raise AssertionError(
            f"Task received dataset item with an unexpected input: {item.input}"
        )

    evaluation_result = opik.evaluate(
        dataset=dataset,
        task=task,
        scoring_metrics=[metrics.Equals()],
        experiment_name=experiment_name,
    )

    opik.flush_tracker()

    verifiers.verify_experiment(
        opik_client=opik_client,
        id=evaluation_result.experiment_id,
        experiment_name=evaluation_result.experiment_name,
        traces_amount=3,  # one trace per dataset item
        feedback_scores_amount=1,  # an average value of all Equals metric scores
    )

    # TODO: check more content of the experiment
    #
    # EXPECTED_DATASET_ITEMS = [
    #     dataset_item.DatasetItem(
    #         input={"question": "What is the of capital of France?"},
    #         expected_output={"output": "Paris"},
    #     ),
    #     dataset_item.DatasetItem(
    #         input={"question": "What is the of capital of Germany?"},
    #         expected_output={"output": "Berlin"},
    #     ),
    #     dataset_item.DatasetItem(
    #         input={"question": "What is the of capital of Poland?"},
    #         expected_output={"output": "Warsaw"},
    #     ),
    # ]
Oops, something went wrong.