diff --git a/sdks/python/examples/manual_chain_building.py b/sdks/python/examples/manual_chain_building.py index 515a6a3852..aa171cf9e9 100644 --- a/sdks/python/examples/manual_chain_building.py +++ b/sdks/python/examples/manual_chain_building.py @@ -1,8 +1,10 @@ import opik +import os +os.environ["OPIK_URL_OVERRIDE"] = "http://localhost:5173/api" client = opik.Opik() -trace = client.trace(name="trace-name") +trace = client.trace() span1 = trace.span(name="span-1") span2 = span1.span(name="span-2") span2.end() diff --git a/sdks/python/src/opik/__init__.py b/sdks/python/src/opik/__init__.py index 490468b1a4..9845bcc657 100644 --- a/sdks/python/src/opik/__init__.py +++ b/sdks/python/src/opik/__init__.py @@ -7,12 +7,14 @@ from . import _logging from . import package_version from .plugins.pytest.decorator import llm_unit +from .evaluation import evaluate _logging.setup() __version__ = package_version.VERSION __all__ = [ "__version__", + "evaluate", "track", "flush_tracker", "Opik", diff --git a/sdks/python/src/opik/api_objects/experiment/experiment.py b/sdks/python/src/opik/api_objects/experiment/experiment.py index 782da46eac..c27b0b9dae 100644 --- a/sdks/python/src/opik/api_objects/experiment/experiment.py +++ b/sdks/python/src/opik/api_objects/experiment/experiment.py @@ -18,16 +18,16 @@ def __init__( dataset_name: str, rest_client: rest_api_client.OpikApi, ) -> None: - self._id = id - self._name = name - self._dataset_name = dataset_name + self.id = id + self.name = name + self.dataset_name = dataset_name self._rest_client = rest_client def insert(self, experiment_items: List[experiment_item.ExperimentItem]) -> None: rest_experiment_items = [ rest_experiment_item.ExperimentItem( id=item.id if item.id is not None else helpers.generate_id(), - experiment_id=self._id, + experiment_id=self.id, dataset_item_id=item.dataset_item_id, trace_id=item.trace_id, ) diff --git a/sdks/python/src/opik/api_objects/opik_client.py b/sdks/python/src/opik/api_objects/opik_client.py index 860f01e710..ecd578bc07 100644 --- a/sdks/python/src/opik/api_objects/opik_client.py +++ b/sdks/python/src/opik/api_objects/opik_client.py @@ -17,7 +17,7 @@ ) from ..message_processing import streamer_constructors, messages from ..rest_api import client as rest_api_client -from ..rest_api.types import dataset_public +from ..rest_api.types import dataset_public, trace_public, span_public from .. import datetime_helpers, config, httpx_client @@ -377,6 +377,12 @@ def flush(self, timeout: Optional[int] = None) -> None: timeout = timeout if timeout is not None else self._flush_timeout self._streamer.flush(timeout) + def get_trace_content(self, id: str) -> trace_public.TracePublic: + return self._rest_client.traces.get_trace_by_id(id) + + def get_span_content(self, id: str) -> span_public.SpanPublic: + return self._rest_client.spans.get_span_by_id(id) + @functools.lru_cache() def get_client_cached() -> Opik: diff --git a/sdks/python/src/opik/evaluation/evaluation_result.py b/sdks/python/src/opik/evaluation/evaluation_result.py new file mode 100644 index 0000000000..09a8108645 --- /dev/null +++ b/sdks/python/src/opik/evaluation/evaluation_result.py @@ -0,0 +1,12 @@ +from typing import List + +import dataclasses + +from . 
import test_result + + +@dataclasses.dataclass +class EvaluationResult: + experiment_id: str + experiment_name: str + test_results: List[test_result.TestResult] diff --git a/sdks/python/src/opik/evaluation/evaluator.py b/sdks/python/src/opik/evaluation/evaluator.py index 2582ba83f9..94c1e6dfff 100644 --- a/sdks/python/src/opik/evaluation/evaluator.py +++ b/sdks/python/src/opik/evaluation/evaluator.py @@ -7,7 +7,7 @@ from ..api_objects.experiment import experiment_item from ..api_objects import opik_client -from . import tasks_scorer, test_result, scores_logger, report +from . import tasks_scorer, scores_logger, report, evaluation_result def evaluate( @@ -17,8 +17,7 @@ def evaluate( experiment_name: str, verbose: int = 1, task_threads: int = 16, - scoring_threads: int = 16, -) -> List[test_result.TestResult]: +) -> evaluation_result.EvaluationResult: """ Performs task evaluation on a given dataset. @@ -38,11 +37,6 @@ def evaluate( threads are created, all tasks executed in the current thread sequentially. are executed sequentially in the current thread. Use more than 1 worker if your task object is compatible with sharing across threads. - - scoring_threads: amount of thread workers to compute metric scores. If set to 1, - no additional threads are created, all metrics are computed in the - current thread sequentially. - Use more than 1 worker if your metrics are compatible with sharing across threads. """ client = opik_client.get_client_cached() start_time = time.time() @@ -77,4 +71,10 @@ def evaluate( experiment.insert(experiment_items=experiment_items) client.flush() - return test_results + + evaluation_result_ = evaluation_result.EvaluationResult( + experiment_id=experiment.id, + experiment_name=experiment.name, + test_results=test_results, + ) + return evaluation_result_ diff --git a/sdks/python/src/opik/synchronization.py b/sdks/python/src/opik/synchronization.py index 2bc4cf9b53..c61dc5b7b1 100644 --- a/sdks/python/src/opik/synchronization.py +++ b/sdks/python/src/opik/synchronization.py @@ -1,9 +1,13 @@ import time -from typing import Callable, Optional +import logging +from typing import Callable, Optional, Any + + +LOGGER = logging.getLogger(__name__) def wait_for_done( - check_function: Callable, + check_function: Callable[[], bool], timeout: Optional[float], progress_callback: Optional[Callable] = None, sleep_time: float = 1, @@ -19,3 +23,44 @@ end_sleep_time = time.time() + sleep_time while check_function() is False and time.time() < end_sleep_time: time.sleep(sleep_time / 20.0) + + +def until( + function: Callable[[], bool], + sleep: float = 0.5, + max_try_seconds: float = 10, + allow_errors: bool = False, +) -> bool: + start_time = time.time() + while True: + try: + if function(): + break + except Exception: + LOGGER.debug( + f"{function.__name__} raised error in 'until' function.", exc_info=True + ) + if not allow_errors: + raise + finally: + if (time.time() - start_time) > max_try_seconds: + return False + time.sleep(sleep) + return True + + +def try_get_until( + function: Callable[[], Any], sleep: float = 0.5, max_try_seconds: float = 10 +) -> Any: + """ + Calls `function` and returns its result, retrying every `sleep` seconds while it raises an exception; once `max_try_seconds` is exceeded, the last exception is re-raised. + """ + start_time = time.time() + while True: + try: + return function() + except Exception: + if (time.time() - start_time) > max_try_seconds: + raise + time.sleep(sleep) + return True diff --git a/sdks/python/tests/conftest.py b/sdks/python/tests/conftest.py index 9ae6f18c18..26358f8ba9 100644 --- a/sdks/python/tests/conftest.py +++ b/sdks/python/tests/conftest.py @@ -1,7 +1,7 @@ import 
pytest from opik import context_storage from opik.api_objects import opik_client -from .testlib import fake_message_processor +from .testlib import backend_emulator_message_processor from opik.message_processing import streamer_constructors @@ -12,7 +12,7 @@ def clear_context_storage(): @pytest.fixture(autouse=True) -def shutdown_cached_client(): +def shutdown_cached_client_after_test(): yield if opik_client.get_client_cached.cache_info().currsize > 0: opik_client.get_client_cached().end() @@ -22,7 +22,9 @@ def shutdown_cached_client(): @pytest.fixture def fake_streamer(): try: - fake_message_processor_ = fake_message_processor.FakeMessageProcessor() + fake_message_processor_ = ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor() + ) streamer = streamer_constructors.construct_streamer( message_processor=fake_message_processor_, n_consumers=1, diff --git a/sdks/python/tests/e2e/__init__.py b/sdks/python/tests/e2e/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdks/python/tests/e2e/conftest.py b/sdks/python/tests/e2e/conftest.py new file mode 100644 index 0000000000..ebdbe52262 --- /dev/null +++ b/sdks/python/tests/e2e/conftest.py @@ -0,0 +1,43 @@ +import os +import random +import string + +import opik +import opik.api_objects.opik_client + +import pytest + + +def _random_chars(n: int = 6) -> str: + return "".join(random.choice(string.ascii_letters) for _ in range(n)) + + +@pytest.fixture(scope="session") +def configure_e2e_tests_env(): + os.environ["OPIK_PROJECT_NAME"] = "e2e-tests" + # os.environ["OPIK_URL_OVERRIDE"] = "http://localhost:5173/api" + + +@pytest.fixture() +def opik_client(configure_e2e_tests_env, shutdown_cached_client_after_test): + opik_client_ = opik.api_objects.opik_client.Opik() + + yield opik_client_ + + opik_client_.end() + + +@pytest.fixture +def dataset_name(opik_client: opik.Opik): + name = f"e2e-tests-dataset-{ _random_chars()}" + yield name + + opik_client.delete_dataset(name) + + +@pytest.fixture +def experiment_name(opik_client: opik.Opik): + name = f"e2e-tests-experiment-{ _random_chars()}" + yield name + + # TODO: delete the experiment diff --git a/sdks/python/tests/e2e/test_dataset.py b/sdks/python/tests/e2e/test_dataset.py new file mode 100644 index 0000000000..cb4fe16bdc --- /dev/null +++ b/sdks/python/tests/e2e/test_dataset.py @@ -0,0 +1,50 @@ +import opik +from . 
import verifiers +from opik.api_objects.dataset import dataset_item + + +def test_create_and_populate_dataset__happyflow( + opik_client: opik.Opik, dataset_name: str +): + DESCRIPTION = "E2E test dataset" + + dataset = opik_client.create_dataset(dataset_name, description=DESCRIPTION) + + dataset.insert( + [ + { + "input": {"question": "What is the capital of France?"}, + "expected_output": {"output": "Paris"}, + }, + { + "input": {"question": "What is the capital of Germany?"}, + "expected_output": {"output": "Berlin"}, + }, + { + "input": {"question": "What is the capital of Poland?"}, + "expected_output": {"output": "Warsaw"}, + }, + ] + ) + + EXPECTED_DATASET_ITEMS = [ + dataset_item.DatasetItem( + input={"question": "What is the capital of France?"}, + expected_output={"output": "Paris"}, + ), + dataset_item.DatasetItem( + input={"question": "What is the capital of Germany?"}, + expected_output={"output": "Berlin"}, + ), + dataset_item.DatasetItem( + input={"question": "What is the capital of Poland?"}, + expected_output={"output": "Warsaw"}, + ), + ] + + verifiers.verify_dataset( + opik_client=opik_client, + name=dataset_name, + description=DESCRIPTION, + dataset_items=EXPECTED_DATASET_ITEMS, + ) diff --git a/sdks/python/tests/e2e/test_experiment.py b/sdks/python/tests/e2e/test_experiment.py new file mode 100644 index 0000000000..cc4cdcbb1e --- /dev/null +++ b/sdks/python/tests/e2e/test_experiment.py @@ -0,0 +1,77 @@ +import opik + +from opik.api_objects.dataset import dataset_item +from opik.evaluation import metrics +from . import verifiers + + +def test_experiment_creation_via_evaluate_function__happyflow( + opik_client: opik.Opik, dataset_name: str, experiment_name: str +): + # TODO: this test is not finished, it only checks that the script is not failing + + dataset = opik_client.create_dataset(dataset_name) + + dataset.insert( + [ + { + "input": {"question": "What is the capital of France?"}, + "expected_output": {"output": "Paris"}, + }, + { + "input": {"question": "What is the capital of Germany?"}, + "expected_output": {"output": "Berlin"}, + }, + { + "input": {"question": "What is the capital of Poland?"}, + "expected_output": {"output": "Warsaw"}, + }, + ] + ) + + def task(item: dataset_item.DatasetItem): + if item.input == {"question": "What is the capital of France?"}: + return {"output": "Paris", "reference": item.expected_output["output"]} + if item.input == {"question": "What is the capital of Germany?"}: + return {"output": "Berlin", "reference": item.expected_output["output"]} + if item.input == {"question": "What is the capital of Poland?"}: + return {"output": "Krakow", "reference": item.expected_output["output"]} + + raise AssertionError( + f"Task received dataset item with an unexpected input: {item.input}" + ) + + equals_metric = metrics.Equals() + evaluation_result = opik.evaluate( + dataset=dataset, + task=task, + scoring_metrics=[equals_metric], + experiment_name=experiment_name, + ) + + opik.flush_tracker() + + verifiers.verify_experiment( + opik_client=opik_client, + id=evaluation_result.experiment_id, + experiment_name=evaluation_result.experiment_name, + traces_amount=3, # one trace per dataset item + feedback_scores_amount=1, # an average value of all Equals metric scores + ) + + # TODO: check more content of the experiment + # + # EXPECTED_DATASET_ITEMS = [ + # dataset_item.DatasetItem( + # input={"question": "What is the of capital of France?"}, + # expected_output={"output": "Paris"}, + # ), + # 
dataset_item.DatasetItem( + # input={"question": "What is the of capital of Germany?"}, + # expected_output={"output": "Berlin"}, + # ), + # dataset_item.DatasetItem( + # input={"question": "What is the of capital of Poland?"}, + # expected_output={"output": "Warsaw"}, + # ), + # ] diff --git a/sdks/python/tests/e2e/test_feedback_scores.py b/sdks/python/tests/e2e/test_feedback_scores.py new file mode 100644 index 0000000000..0a0f295d37 --- /dev/null +++ b/sdks/python/tests/e2e/test_feedback_scores.py @@ -0,0 +1,137 @@ +from typing import List +import opik +from opik.types import FeedbackScoreDict +from . import verifiers + + +def test_feedbacks_are_logged_via_trace_and_span__happyflow(opik_client: opik.Opik): + trace = opik_client.trace( + name="trace-name", + ) + trace.log_feedback_score( + "trace-metric-1", value=0.5, category_name="category-1", reason="some-reason-1" + ) + trace.log_feedback_score( + "trace-metric-2", value=0.95, category_name="category-2", reason="some-reason-2" + ) + + span = trace.span( + name="span-name", + ) + span.log_feedback_score( + "span-metric-1", value=0.75, category_name="category-3", reason="some-reason-3" + ) + span.log_feedback_score( + "span-metric-2", value=0.25, category_name="category-4", reason="some-reason-4" + ) + span.end() + trace.end() + + opik_client.flush() + + EXPECTED_TRACE_FEEDBACK_SCORES: List[FeedbackScoreDict] = [ + { + "id": trace.id, + "name": "trace-metric-1", + "value": 0.5, + "category_name": "category-1", + "reason": "some-reason-1", + }, + { + "id": trace.id, + "name": "trace-metric-2", + "value": 0.95, + "category_name": "category-2", + "reason": "some-reason-2", + }, + ] + + EXPECTED_SPAN_FEEDBACK_SCORES: List[FeedbackScoreDict] = [ + { + "id": span.id, + "name": "span-metric-1", + "value": 0.75, + "category_name": "category-3", + "reason": "some-reason-3", + }, + { + "id": span.id, + "name": "span-metric-2", + "value": 0.25, + "category_name": "category-4", + "reason": "some-reason-4", + }, + ] + verifiers.verify_trace( + opik_client=opik_client, + trace_id=trace.id, + name="trace-name", + feedback_scores=EXPECTED_TRACE_FEEDBACK_SCORES, + ) + verifiers.verify_span( + opik_client=opik_client, + span_id=span.id, + trace_id=span.trace_id, + parent_span_id=None, + name="span-name", + feedback_scores=EXPECTED_SPAN_FEEDBACK_SCORES, + ) + + +def test_feedbacks_are_logged_via_client__happyflow(opik_client: opik.Opik): + trace = opik_client.trace(name="trace-name-1") + span = trace.span(name="span-name-1") + + EXPECTED_TRACE_FEEDBACK_SCORES: List[FeedbackScoreDict] = [ + { + "id": trace.id, + "name": "trace-metric-1", + "value": 0.5, + "category_name": "category-1", + "reason": "some-reason-1", + }, + { + "id": trace.id, + "name": "trace-metric-2", + "value": 0.95, + "category_name": "category-2", + "reason": "some-reason-2", + }, + ] + + EXPECTED_SPAN_FEEDBACK_SCORES: List[FeedbackScoreDict] = [ + { + "id": span.id, + "name": "span-metric-1", + "value": 0.75, + "category_name": "category-3", + "reason": "some-reason-3", + }, + { + "id": span.id, + "name": "span-metric-2", + "value": 0.25, + "category_name": "category-4", + "reason": "some-reason-4", + }, + ] + + opik_client.log_spans_feedback_scores(scores=EXPECTED_SPAN_FEEDBACK_SCORES) + opik_client.log_traces_feedback_scores(scores=EXPECTED_TRACE_FEEDBACK_SCORES) + + opik_client.flush() + + verifiers.verify_trace( + opik_client=opik_client, + trace_id=trace.id, + name="trace-name-1", + feedback_scores=EXPECTED_TRACE_FEEDBACK_SCORES, + ) + verifiers.verify_span( + 
opik_client=opik_client, + span_id=span.id, + trace_id=span.trace_id, + parent_span_id=None, + name="span-name-1", + feedback_scores=EXPECTED_SPAN_FEEDBACK_SCORES, + ) diff --git a/sdks/python/tests/e2e/test_tracing.py b/sdks/python/tests/e2e/test_tracing.py new file mode 100644 index 0000000000..eb0f240f17 --- /dev/null +++ b/sdks/python/tests/e2e/test_tracing.py @@ -0,0 +1,114 @@ +from . import verifiers +import opik + + +from opik import opik_context + + +def test_tracked_function__happyflow(opik_client): + # Setup + ID_STORAGE = {} + + @opik.track( + tags=["outer-tag1", "outer-tag2"], + metadata={"outer-metadata-key": "outer-metadata-value"}, + ) + def f_outer(x): + ID_STORAGE["f_outer-trace-id"] = opik_context.get_current_trace().id + ID_STORAGE["f_outer-span-id"] = opik_context.get_current_span().id + + f_inner("inner-input") + return "outer-output" + + @opik.track( + tags=["inner-tag1", "inner-tag2"], + metadata={"inner-metadata-key": "inner-metadata-value"}, + ) + def f_inner(y): + ID_STORAGE["f_inner-span-id"] = opik_context.get_current_span().id + return "inner-output" + + # Call + f_outer("outer-input") + opik.flush_tracker() + + # Verify trace + verifiers.verify_trace( + opik_client=opik_client, + trace_id=ID_STORAGE["f_outer-trace-id"], + name="f_outer", + input={"x": "outer-input"}, + output={"output": "outer-output"}, + metadata={"outer-metadata-key": "outer-metadata-value"}, + tags=["outer-tag1", "outer-tag2"], + ) + + # Verify top level span + verifiers.verify_span( + opik_client=opik_client, + span_id=ID_STORAGE["f_outer-span-id"], + parent_span_id=None, + trace_id=ID_STORAGE["f_outer-trace-id"], + name="f_outer", + input={"x": "outer-input"}, + output={"output": "outer-output"}, + metadata={"outer-metadata-key": "outer-metadata-value"}, + tags=["outer-tag1", "outer-tag2"], + ) + + # Verify nested span + verifiers.verify_span( + opik_client=opik_client, + span_id=ID_STORAGE["f_inner-span-id"], + parent_span_id=ID_STORAGE["f_outer-span-id"], + trace_id=ID_STORAGE["f_outer-trace-id"], + name="f_inner", + input={"y": "inner-input"}, + output={"output": "inner-output"}, + metadata={"inner-metadata-key": "inner-metadata-value"}, + tags=["inner-tag1", "inner-tag2"], + ) + + +def test_manually_created_trace_and_span__happyflow(opik_client: opik.Opik): + # Call + trace = opik_client.trace( + name="trace-name", + input={"input": "trace-input"}, + output={"output": "trace-output"}, + tags=["trace-tag"], + metadata={"trace-metadata-key": "trace-metadata-value"}, + ) + span = trace.span( + name="span-name", + input={"input": "span-input"}, + output={"output": "span-output"}, + tags=["span-tag"], + metadata={"span-metadata-key": "span-metadata-value"}, + ) + + opik_client.flush() + + # Verify trace + verifiers.verify_trace( + opik_client=opik_client, + trace_id=trace.id, + name="trace-name", + input={"input": "trace-input"}, + output={"output": "trace-output"}, + tags=["trace-tag"], + metadata={"trace-metadata-key": "trace-metadata-value"}, + ) + + # Verify span + verifiers.verify_span( + opik_client=opik_client, + span_id=span.id, + parent_span_id=None, + trace_id=span.trace_id, + name="span-name", + input={"input": "span-input"}, + output={"output": "span-output"}, + tags=["span-tag"], + metadata={"span-metadata-key": "span-metadata-value"}, + ) diff --git a/sdks/python/tests/e2e/verifiers.py b/sdks/python/tests/e2e/verifiers.py new file mode 100644 index 0000000000..ea9dae88f4 --- /dev/null +++ b/sdks/python/tests/e2e/verifiers.py @@ -0,0 +1,216 @@ +from typing import Optional, 
Dict, Any, List +import opik +import json + +from opik.types import FeedbackScoreDict +from opik.api_objects.dataset import dataset_item +from opik import synchronization + +from .. import testlib +import mock + + +def verify_trace( + opik_client: opik.Opik, + trace_id: str, + name: str = mock.ANY, # type: ignore + metadata: Dict[str, Any] = mock.ANY, # type: ignore + input: Dict[str, Any] = mock.ANY, # type: ignore + output: Dict[str, Any] = mock.ANY, # type: ignore + tags: List[str] = mock.ANY, # type: ignore + feedback_scores: List[FeedbackScoreDict] = mock.ANY, # type: ignore +): + if not synchronization.until( + lambda: (opik_client.get_trace_content(id=trace_id) is not None), + allow_errors=True, + ): + raise AssertionError(f"Failed to get trace with id {trace_id}.") + + trace = opik_client.get_trace_content(id=trace_id) + + assert trace.name == name, f"{trace.name} != {name}" + assert trace.input == input, testlib.prepare_difference_report(trace.input, input) + assert trace.output == output, testlib.prepare_difference_report( + trace.output, output + ) + assert trace.metadata == metadata, testlib.prepare_difference_report( + trace.metadata, metadata + ) + assert trace.tags == tags, testlib.prepare_difference_report(trace.tags, tags) + + if feedback_scores is not mock.ANY: + actual_feedback_scores = ( + [] if trace.feedback_scores is None else trace.feedback_scores + ) + assert ( + len(actual_feedback_scores) == len(feedback_scores) + ), f"Expected amount of trace feedback scores ({len(feedback_scores)}) is not equal to actual amount ({len(actual_feedback_scores)})" + + actual_feedback_scores: List[FeedbackScoreDict] = [ + { + "category_name": score.category_name, + "id": trace_id, + "name": score.name, + "reason": score.reason, + "value": score.value, + } + for score in trace.feedback_scores + ] + + sorted_actual_feedback_scores = sorted( + actual_feedback_scores, key=lambda item: json.dumps(item, sort_keys=True) + ) + sorted_expected_feedback_scores = sorted( + feedback_scores, key=lambda item: json.dumps(item, sort_keys=True) + ) + for actual_score, expected_score in zip( + sorted_actual_feedback_scores, sorted_expected_feedback_scores + ): + testlib.assert_dicts_equal(actual_score, expected_score) + + +def verify_span( + opik_client: opik.Opik, + span_id: str, + trace_id: str, + parent_span_id: Optional[str], + name: str = mock.ANY, # type: ignore + metadata: Dict[str, Any] = mock.ANY, # type: ignore + input: Dict[str, Any] = mock.ANY, # type: ignore + output: Dict[str, Any] = mock.ANY, # type: ignore + tags: List[str] = mock.ANY, # type: ignore + type: str = mock.ANY, # type: ignore + feedback_scores: List[FeedbackScoreDict] = mock.ANY, # type: ignore, +): + if not synchronization.until( + lambda: (opik_client.get_span_content(id=span_id) is not None), + allow_errors=True, + ): + raise AssertionError(f"Failed to get span with id {span_id}.") + + span = opik_client.get_span_content(id=span_id) + + assert span.trace_id == trace_id, f"{span.trace_id} != {trace_id}" + + if parent_span_id is None: + assert span.parent_span_id is None, f"{span.parent_span_id} != {parent_span_id}" + else: + assert ( + span.parent_span_id == parent_span_id + ), f"{span.parent_span_id} != {parent_span_id}" + + assert span.name == name, f"{span.name} != {name}" + assert span.type == type, f"{span.type} != {type}" + + assert span.input == input, testlib.prepare_difference_report(span.input, input) + assert span.output == output, testlib.prepare_difference_report(span.output, output) + assert span.metadata 
== metadata, testlib.prepare_difference_report( + span.metadata, metadata + ) + assert span.tags == tags, testlib.prepare_difference_report(span.tags, tags) + + if feedback_scores is not mock.ANY: + actual_feedback_scores = ( + [] if span.feedback_scores is None else span.feedback_scores + ) + assert ( + len(actual_feedback_scores) == len(feedback_scores) + ), f"Expected amount of span feedback scores ({len(feedback_scores)}) is not equal to actual amount ({len(actual_feedback_scores)})" + + actual_feedback_scores: List[FeedbackScoreDict] = [ + { + "category_name": score.category_name, + "id": span_id, + "name": score.name, + "reason": score.reason, + "value": score.value, + } + for score in span.feedback_scores + ] + + sorted_actual_feedback_scores = sorted( + actual_feedback_scores, key=lambda item: json.dumps(item, sort_keys=True) + ) + sorted_expected_feedback_scores = sorted( + feedback_scores, key=lambda item: json.dumps(item, sort_keys=True) + ) + for actual_score, expected_score in zip( + sorted_actual_feedback_scores, sorted_expected_feedback_scores + ): + testlib.assert_dicts_equal(actual_score, expected_score) + + +def verify_dataset( + opik_client: opik.Opik, + name: str, + description: str = mock.ANY, + dataset_items: List[dataset_item.DatasetItem] = mock.ANY, +): + if not synchronization.until( + lambda: (opik_client.get_dataset(name=name) is not None), + allow_errors=True, + ): + raise AssertionError(f"Failed to get dataset with name {name}.") + + actual_dataset = opik_client.get_dataset(name=name) + assert actual_dataset.description == description + + actual_dataset_items = actual_dataset.get_all_items() + assert ( + len(actual_dataset_items) == len(dataset_items) + ), f"Amount of actual dataset items ({len(actual_dataset_items)}) is not the same as of expected ones ({len(dataset_items)})" + + actual_dataset_items_dicts = [item.__dict__ for item in actual_dataset_items] + expected_dataset_items_dicts = [item.__dict__ for item in dataset_items] + + sorted_actual_items = sorted( + actual_dataset_items_dicts, key=lambda item: json.dumps(item, sort_keys=True) + ) + sorted_expected_items = sorted( + expected_dataset_items_dicts, key=lambda item: json.dumps(item, sort_keys=True) + ) + + for actual_item, expected_item in zip(sorted_actual_items, sorted_expected_items): + testlib.assert_dicts_equal(actual_item, expected_item, ignore_keys=["id"]) + + +def verify_experiment( + opik_client: opik.Opik, + id: str, + experiment_name: str, + feedback_scores_amount: int, + traces_amount: int, +): + rest_client = ( + opik_client._rest_client + ) # temporary solution until backend prepares proper endpoints + + rest_client.datasets.find_dataset_items_with_experiment_items + + if not synchronization.until( + lambda: (rest_client.experiments.get_experiment_by_id(id) is not None), + allow_errors=True, + ): + raise AssertionError(f"Failed to get experiment with id {id}.") + + experiment_content = rest_client.experiments.get_experiment_by_id(id) + + assert ( + experiment_content.name == experiment_name + ), f"{experiment_content.name} != {experiment_name}" + + actual_scores_count = ( + 0 + if experiment_content.feedback_scores is None + else len(experiment_content.feedback_scores) + ) + assert ( + actual_scores_count == feedback_scores_amount + ), f"{actual_scores_count} != {feedback_scores_amount}" + + actual_trace_count = ( + 0 if experiment_content.trace_count is None else experiment_content.trace_count + ) + assert ( + actual_trace_count == traces_amount + ), f"{actual_trace_count} != 
{traces_amount}" diff --git a/sdks/python/tests/library_integration/langchain/test_langchain.py b/sdks/python/tests/library_integration/langchain/test_langchain.py index e4176035e1..11c944ee5d 100644 --- a/sdks/python/tests/library_integration/langchain/test_langchain.py +++ b/sdks/python/tests/library_integration/langchain/test_langchain.py @@ -1,12 +1,12 @@ import mock import os from opik.message_processing import streamer_constructors -from ...testlib import fake_message_processor +from ...testlib import backend_emulator_message_processor from ...testlib import ( SpanModel, TraceModel, ANY_BUT_NONE, - assert_traces_match, + assert_equal, ) import pytest import opik @@ -30,7 +30,9 @@ def ensure_openai_configured(): def test_langchain__happyflow( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -145,13 +147,15 @@ def test_langchain__happyflow( assert len(fake_message_processor_.trace_trees) == 1 assert len(callback.created_traces()) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_langchain__openai_llm_is_used__token_usage_is_logged__happyflow( fake_streamer, ensure_openai_configured ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -240,13 +244,15 @@ def test_langchain__openai_llm_is_used__token_usage_is_logged__happyflow( assert len(fake_message_processor_.trace_trees) == 1 assert len(callback.created_traces()) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_langchain_callback__used_inside_another_track_function__data_attached_to_existing_trace_tree( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -380,13 +386,15 @@ def f(x): assert len(fake_message_processor_.trace_trees) == 1 assert len(callback.created_traces()) == 0 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_langchain_callback__used_when_there_was_already_existing_trace_without_span__data_attached_to_existing_trace( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -517,13 +525,15 @@ def f(): assert len(fake_message_processor_.trace_trees) == 1 assert len(callback.created_traces()) == 0 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def 
test_langchain_callback__used_when_there_was_already_existing_span_without_trace__data_attached_to_existing_span( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -653,4 +663,4 @@ def f(): assert len(fake_message_processor_.span_trees) == 1 assert len(callback.created_traces()) == 0 - assert_traces_match(EXPECTED_SPANS_TREE, fake_message_processor_.span_trees[0]) + assert_equal(EXPECTED_SPANS_TREE, fake_message_processor_.span_trees[0]) diff --git a/sdks/python/tests/library_integration/openai/test_openai.py b/sdks/python/tests/library_integration/openai/test_openai.py index e84418ef93..82fa7b9347 100644 --- a/sdks/python/tests/library_integration/openai/test_openai.py +++ b/sdks/python/tests/library_integration/openai/test_openai.py @@ -7,12 +7,12 @@ import opik from opik.message_processing import streamer_constructors from opik.integrations.openai import track_openai -from ...testlib import fake_message_processor +from ...testlib import backend_emulator_message_processor from ...testlib import ( SpanModel, TraceModel, ANY_BUT_NONE, - assert_traces_match, + assert_equal, ) @@ -29,7 +29,9 @@ def ensure_openai_configured(): def test_openai_client_chat_completions_create__happyflow(fake_streamer): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -94,13 +96,15 @@ def test_openai_client_chat_completions_create__happyflow(fake_streamer): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_openai_client_chat_completions_create__create_raises_an_error__span_and_trace_finished_gracefully( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -159,13 +163,15 @@ def test_openai_client_chat_completions_create__create_raises_an_error__span_and assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_openai_client_chat_completions_create__openai_call_made_in_another_tracked_function__openai_span_attached_to_existing_trace( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -237,13 +243,15 @@ def f(): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_openai_client_chat_completions_create__async_openai_call_made_in_another_tracked_async_function__openai_span_attached_to_existing_trace( 
fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -315,13 +323,15 @@ async def async_f(): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_openai_client_chat_completions_create__stream_mode_is_on__generator_tracked_correctly( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -395,13 +405,15 @@ def test_openai_client_chat_completions_create__stream_mode_is_on__generator_tra assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_openai_client_chat_completions_create__async_openai_call_made_in_another_tracked_async_function__streaming_mode_enabled__openai_span_attached_to_existing_trace( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -479,4 +491,4 @@ async def async_f(): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) diff --git a/sdks/python/tests/testlib/__init__.py b/sdks/python/tests/testlib/__init__.py index 0c6b9e6c18..30ca39068c 100644 --- a/sdks/python/tests/testlib/__init__.py +++ b/sdks/python/tests/testlib/__init__.py @@ -1,10 +1,14 @@ -from .fake_message_processor import FakeMessageProcessor -from .testlib_dsl import SpanModel, TraceModel, ANY_BUT_NONE, assert_traces_match +from .backend_emulator_message_processor import BackendEmulatorMessageProcessor +from .models import SpanModel, TraceModel +from .assert_helpers import assert_dicts_equal, prepare_difference_report, assert_equal +from .any_but_none import ANY_BUT_NONE __all__ = [ "SpanModel", "TraceModel", "ANY_BUT_NONE", - "assert_traces_match", - "FakeMessageProcessor", + "assert_equal", + "assert_dicts_equal", + "prepare_difference_report", + "BackendEmulatorMessageProcessor", ] diff --git a/sdks/python/tests/testlib/any_but_none.py b/sdks/python/tests/testlib/any_but_none.py new file mode 100644 index 0000000000..babbe715a8 --- /dev/null +++ b/sdks/python/tests/testlib/any_but_none.py @@ -0,0 +1,17 @@ +class AnyButNone: + "A helper object that compares equal to everything but None." 
+ + def __eq__(self, other): + if other is None: + return False + + return True + + def __ne__(self, other): + return not self.__eq__(other) + + def __repr__(self): + return "" + + +ANY_BUT_NONE = AnyButNone() diff --git a/sdks/python/tests/testlib/assert_helpers.py b/sdks/python/tests/testlib/assert_helpers.py new file mode 100644 index 0000000000..e0b0d032f1 --- /dev/null +++ b/sdks/python/tests/testlib/assert_helpers.py @@ -0,0 +1,40 @@ +from typing import List, Any, Optional, Dict + +import logging +import mock +import deepdiff + +from . import any_but_none + +LOGGER = logging.getLogger(__name__) + + +def prepare_difference_report(expected: Any, actual: Any) -> str: + try: + diff_report = deepdiff.DeepDiff( + expected, actual, exclude_types=[any_but_none.AnyButNone, mock.mock._ANY] + ).pretty() + return diff_report + except Exception: + LOGGER.debug("Failed to prepare difference report", exc_info=True) + return "" + + +def assert_equal(expected, actual): + assert actual == expected, prepare_difference_report(actual, expected) + + +def assert_dicts_equal( + dict1: Dict[str, Any], + dict2: Dict[str, Any], + ignore_keys: Optional[List[str]] = None, +) -> bool: + dict1_copy, dict2_copy = {**dict1}, {**dict2} + + ignore_keys = [] if ignore_keys is None else ignore_keys + + for key in ignore_keys: + dict1_copy.pop(key, None) + dict2_copy.pop(key, None) + + assert dict1_copy == dict2_copy, prepare_difference_report(dict1_copy, dict2_copy) diff --git a/sdks/python/tests/testlib/fake_message_processor.py b/sdks/python/tests/testlib/backend_emulator_message_processor.py similarity index 96% rename from sdks/python/tests/testlib/fake_message_processor.py rename to sdks/python/tests/testlib/backend_emulator_message_processor.py index 8d3fa35889..db4ac90e05 100644 --- a/sdks/python/tests/testlib/fake_message_processor.py +++ b/sdks/python/tests/testlib/backend_emulator_message_processor.py @@ -1,10 +1,10 @@ from opik.message_processing import message_processors, messages from typing import List, Tuple, Type, Dict, Union -from .testlib_dsl import TraceModel, SpanModel, FeedbackScoreModel +from .models import TraceModel, SpanModel, FeedbackScoreModel -class FakeMessageProcessor(message_processors.BaseMessageProcessor): +class BackendEmulatorMessageProcessor(message_processors.BaseMessageProcessor): def __init__(self) -> None: self.processed_messages: List[messages.BaseMessage] = [] self.trace_trees: List[TraceModel] = [] diff --git a/sdks/python/tests/testlib/models.py b/sdks/python/tests/testlib/models.py new file mode 100644 index 0000000000..c4c6823e87 --- /dev/null +++ b/sdks/python/tests/testlib/models.py @@ -0,0 +1,47 @@ +from typing import List, Any, Optional, Dict + +import dataclasses +import datetime + + +@dataclasses.dataclass +class SpanModel: + id: str + start_time: datetime.datetime + name: Optional[str] = None + input: Any = None + output: Any = None + tags: Optional[List[str]] = None + metadata: Optional[Dict[str, Any]] = None + type: str = "general" + usage: Optional[Dict[str, Any]] = None + end_time: Optional[datetime.datetime] = None + spans: List["SpanModel"] = dataclasses.field(default_factory=list) + feedback_scores: List["FeedbackScoreModel"] = dataclasses.field( + default_factory=list + ) + + +@dataclasses.dataclass +class TraceModel: + id: str + start_time: datetime.datetime + name: Optional[str] + input: Any = None + output: Any = None + tags: Optional[List[str]] = None + metadata: Optional[Dict[str, Any]] = None + end_time: Optional[datetime.datetime] = None + spans: 
List["SpanModel"] = dataclasses.field(default_factory=list) + feedback_scores: List["FeedbackScoreModel"] = dataclasses.field( + default_factory=list + ) + + +@dataclasses.dataclass +class FeedbackScoreModel: + id: str + name: str + value: float + category_name: Optional[str] = None + reason: Optional[str] = None diff --git a/sdks/python/tests/testlib/testlib_dsl.py b/sdks/python/tests/testlib/testlib_dsl.py deleted file mode 100644 index 88729fe509..0000000000 --- a/sdks/python/tests/testlib/testlib_dsl.py +++ /dev/null @@ -1,101 +0,0 @@ -from typing import List, Any, Optional, Dict - -import dataclasses -import logging -import unittest -import mock -import deepdiff -import datetime - - -LOGGER = logging.getLogger(__name__) - -# TODO: expand classes to have more attributes, current ones are considered to be -# a bare minimum for tests to check that traces have correct structure - - -@dataclasses.dataclass -class SpanModel: - id: str - start_time: datetime.datetime - name: Optional[str] = None - input: Any = None - output: Any = None - tags: Optional[List[str]] = None - metadata: Optional[Dict[str, Any]] = None - type: str = "general" - usage: Optional[Dict[str, Any]] = None - end_time: Optional[datetime.datetime] = None - spans: List["SpanModel"] = dataclasses.field(default_factory=list) - feedback_scores: List["FeedbackScoreModel"] = dataclasses.field( - default_factory=list - ) - - -@dataclasses.dataclass -class TraceModel: - id: str - start_time: datetime.datetime - name: Optional[str] - input: Any = None - output: Any = None - tags: Optional[List[str]] = None - metadata: Optional[Dict[str, Any]] = None - end_time: Optional[datetime.datetime] = None - spans: List["SpanModel"] = dataclasses.field(default_factory=list) - feedback_scores: List["FeedbackScoreModel"] = dataclasses.field( - default_factory=list - ) - - -@dataclasses.dataclass -class FeedbackScoreModel: - id: str - name: str - value: float - category_name: Optional[str] = None - reason: Optional[str] = None - - -class _AnyButNone: - "A helper object that compares equal to everything but None." 
- - def __eq__(self, other): - if other is None: - return False - - return True - - def __ne__(self, other): - return not self.__eq__(other) - - def __repr__(self): - return "" - - -def _prepare_difference_report(expected: Any, actual: Any) -> str: - try: - diff_report = deepdiff.DeepDiff( - expected, actual, exclude_types=[_AnyButNone, mock.mock._ANY] - ).pretty() - return diff_report - except Exception: - LOGGER.debug("Failed to prepare difference report", exc_info=True) - return "" - - -def assert_traces_match(trace_expected, trace_actual): - trace_expected = trace_expected.__dict__ - trace_actual = trace_actual.__dict__ - - test_case = unittest.TestCase() - test_case.maxDiff = None - - test_case.assertDictEqual( - trace_expected, - trace_actual, - msg="\n" + _prepare_difference_report(trace_expected, trace_actual), - ) - - -ANY_BUT_NONE = _AnyButNone() diff --git a/sdks/python/tests/unit/decorator/test_tracker_outputs.py b/sdks/python/tests/unit/decorator/test_tracker_outputs.py index 69edb9237b..7563d716ad 100644 --- a/sdks/python/tests/unit/decorator/test_tracker_outputs.py +++ b/sdks/python/tests/unit/decorator/test_tracker_outputs.py @@ -7,17 +7,19 @@ from opik import context_storage, opik_context from opik.api_objects import opik_client -from ...testlib import fake_message_processor +from ...testlib import backend_emulator_message_processor from ...testlib import ( SpanModel, TraceModel, ANY_BUT_NONE, - assert_traces_match, + assert_equal, ) def test_track__one_nested_function__happyflow(fake_streamer): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -74,13 +76,15 @@ def f_outer(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_track__one_nested_function__inputs_and_outputs_not_captured__inputs_and_outputs_initialized_with_Nones( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -122,13 +126,15 @@ def f(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_track__one_nested_function__output_is_dict__output_is_wrapped_by_tracker( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -170,11 +176,13 @@ def f(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_track__two_nested_functions__happyflow(fake_streamer): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) 
streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -246,13 +254,15 @@ def f_outer(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_track__outer_function_has_two_separate_nested_function__happyflow( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -323,11 +333,13 @@ def f_outer(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_track__two_traces__happyflow(fake_streamer): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -396,18 +408,16 @@ def f_2(x): assert len(fake_message_processor_.trace_trees) == 2 - assert_traces_match( - EXPECTED_TRACE_TREES[0], fake_message_processor_.trace_trees[0] - ) - assert_traces_match( - EXPECTED_TRACE_TREES[1], fake_message_processor_.trace_trees[1] - ) + assert_equal(EXPECTED_TRACE_TREES[0], fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREES[1], fake_message_processor_.trace_trees[1]) def test_track__one_function__error_raised__trace_and_span_finished_correctly__outputs_are_None( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -451,13 +461,15 @@ def f(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_track__one_async_function__error_raised__trace_and_span_finished_correctly__outputs_are_None( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -502,11 +514,13 @@ async def async_f(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_track__nested_calls_in_separate_threads__3_traces_in_result(fake_streamer): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -599,21 +613,17 @@ def f_outer(x): assert len(fake_message_processor_.trace_trees) == 3 - assert_traces_match( - EXPECTED_TRACE_TREES[0], fake_message_processor_.trace_trees[0] - ) - assert_traces_match( - 
EXPECTED_TRACE_TREES[1], fake_message_processor_.trace_trees[1] - ) - assert_traces_match( - EXPECTED_TRACE_TREES[2], fake_message_processor_.trace_trees[2] - ) + assert_equal(EXPECTED_TRACE_TREES[0], fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREES[1], fake_message_processor_.trace_trees[1]) + assert_equal(EXPECTED_TRACE_TREES[2], fake_message_processor_.trace_trees[2]) def test_track__single_generator_function_tracked__generator_exhausted__happyflow( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -660,13 +670,15 @@ def f(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_track__generator_function_tracked__generator_exhausted_in_another_tracked_function__trace_tree_remains_correct( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -744,13 +756,15 @@ def f_outer(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_track__single_async_function_tracked__happyflow( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -794,13 +808,15 @@ async def async_f(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_track__nested_async_function_tracked__happyflow( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -859,13 +875,15 @@ async def async_f_outer(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_track__single_async_generator_function_tracked__generator_exhausted__happyflow( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -915,13 +933,15 @@ async def async_generator_user(): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def 
test_track__async_generator_inside_another_tracked_function__happyflow( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -986,13 +1006,15 @@ async def async_generator_user(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_track__distributed_tracing_with_headers__tracing_is_performed_in_2_threads__all_data_is_saved_in_1_trace_tree( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -1061,13 +1083,15 @@ def f_outer(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_track__trace_already_created_not_by_decorator__decorator_just_attaches_new_span_to_it__trace_is_not_popped_from_context_in_the_end( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -1121,4 +1145,4 @@ def f(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) diff --git a/sdks/python/tests/unit/evaluation/test_evaluate.py b/sdks/python/tests/unit/evaluation/test_evaluate.py index cb13cadddc..3a4f52556f 100644 --- a/sdks/python/tests/unit/evaluation/test_evaluate.py +++ b/sdks/python/tests/unit/evaluation/test_evaluate.py @@ -3,18 +3,18 @@ from opik.api_objects import opik_client from opik import evaluation from opik.evaluation import metrics -from ...testlib import fake_message_processor -from ...testlib.testlib_dsl import ( +from ...testlib import backend_emulator_message_processor, ANY_BUT_NONE, assert_equal +from ...testlib.models import ( TraceModel, FeedbackScoreModel, - ANY_BUT_NONE, - assert_traces_match, ) from opik.message_processing import streamer_constructors def test_evaluate_happyflow(fake_streamer): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_dataset = mock.Mock() @@ -68,7 +68,6 @@ def say_task(dataset_item: dataset_item.DatasetItem): experiment_name="the-experiment-name", scoring_metrics=[metrics.Equals()], task_threads=1, - scoring_threads=1, ) mock_create_experiment.assert_called_once_with( @@ -121,4 +120,4 @@ def say_task(dataset_item: dataset_item.DatasetItem): for expected_trace, actual_trace in zip( EXPECTED_TRACE_TREES, fake_message_processor_.trace_trees ): - assert_traces_match(expected_trace, actual_trace) + assert_equal(expected_trace, actual_trace)
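
An illustrative usage sketch (not part of the diff above): with these changes, `evaluate` is exported from the package root, the `scoring_threads` parameter is gone, and the function returns an `EvaluationResult` object instead of a bare list of test results. Assuming a configured Opik backend, and with the dataset and experiment names below as hypothetical placeholders, the new public API would be used roughly like this:

    import opik
    from opik.evaluation import metrics

    client = opik.Opik()
    dataset = client.create_dataset("usage-sketch-dataset")  # hypothetical dataset name
    dataset.insert(
        [{"input": {"question": "What is 2 + 2?"}, "expected_output": {"output": "4"}}]
    )

    def task(item):
        # Toy task: echo the expected answer so the Equals metric scores 1.0.
        return {
            "output": item.expected_output["output"],
            "reference": item.expected_output["output"],
        }

    result = opik.evaluate(
        dataset=dataset,
        task=task,
        scoring_metrics=[metrics.Equals()],
        experiment_name="usage-sketch-experiment",  # hypothetical experiment name
    )

    # EvaluationResult carries the experiment id and name alongside the test results.
    print(result.experiment_id, result.experiment_name, len(result.test_results))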