From 6bbb60102dc760a79ab58c85c3dd9d09e472637a Mon Sep 17 00:00:00 2001
From: Aliaksandr Kuzmik
Date: Thu, 5 Sep 2024 15:23:46 +0200
Subject: [PATCH 01/10] Add e2e test for running tracked function

---
 sdks/python/src/opik/synchronization.py      | 49 ++++++++++++-
 sdks/python/tests/e2e/__init__.py            |  0
 sdks/python/tests/e2e/conftest.py            | 23 ++++++
 .../e2e/test_track_decorator_function.py     | 70 +++++++++++++++++++
 sdks/python/tests/e2e/verifiers/__init__.py  |  7 ++
 sdks/python/tests/e2e/verifiers/verifiers.py | 65 +++++++++++++++++
 6 files changed, 212 insertions(+), 2 deletions(-)
 create mode 100644 sdks/python/tests/e2e/__init__.py
 create mode 100644 sdks/python/tests/e2e/conftest.py
 create mode 100644 sdks/python/tests/e2e/test_track_decorator_function.py
 create mode 100644 sdks/python/tests/e2e/verifiers/__init__.py
 create mode 100644 sdks/python/tests/e2e/verifiers/verifiers.py

diff --git a/sdks/python/src/opik/synchronization.py b/sdks/python/src/opik/synchronization.py
index 2bc4cf9b53..c61dc5b7b1 100644
--- a/sdks/python/src/opik/synchronization.py
+++ b/sdks/python/src/opik/synchronization.py
@@ -1,9 +1,13 @@
 import time
-from typing import Callable, Optional
+import logging
+from typing import Callable, Optional, Any
+
+
+LOGGER = logging.getLogger(__name__)
 
 
 def wait_for_done(
-    check_function: Callable,
+    check_function: Callable[[], bool],
     timeout: Optional[float],
     progress_callback: Optional[Callable] = None,
     sleep_time: float = 1,
@@ -19,3 +23,44 @@ def wait_for_done(
         end_sleep_time = time.time() + sleep_time
         while check_function() is False and time.time() < end_sleep_time:
             time.sleep(sleep_time / 20.0)
+
+
+def until(
+    function: Callable[[], bool],
+    sleep: float = 0.5,
+    max_try_seconds: float = 10,
+    allow_errors: bool = False,
+) -> bool:
+    start_time = time.time()
+    while True:
+        try:
+            if function():
+                break
+        except Exception:
+            LOGGER.debug(
+                f"{function.__name__} raised error in 'until' function.", exc_info=True
+            )
+            if not allow_errors:
+                raise
+        finally:
+            if (time.time() - start_time) > max_try_seconds:
+                return False
+        time.sleep(sleep)
+    return True
+
+
+def try_get_until(
+    function: Callable[[], Any], sleep: float = 0.5, max_try_seconds: float = 10
+) -> Any:
+    """
+    As soon as `function` stops raising errors, its result is returned; if errors persist for longer than `max_try_seconds`, the last one is re-raised.
+    """
+    start_time = time.time()
+    while True:
+        try:
+            return function()
+        except Exception:
+            if (time.time() - start_time) > max_try_seconds:
+                raise
+            time.sleep(sleep)
+    return True

diff --git a/sdks/python/tests/e2e/__init__.py b/sdks/python/tests/e2e/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/sdks/python/tests/e2e/conftest.py b/sdks/python/tests/e2e/conftest.py
new file mode 100644
index 0000000000..f8a6029690
--- /dev/null
+++ b/sdks/python/tests/e2e/conftest.py
@@ -0,0 +1,23 @@
+import os
+from opik import config
+from opik.rest_api import client
+from opik import httpx_client
+
+import pytest
+
+
+@pytest.fixture(scope="session")
+def configure_e2e_tests_env():
+    os.environ["OPIK_PROJECT_NAME"] = "e2e-tests"
+    os.environ["OPIK_URL_OVERRIDE"] = "http://localhost:5173/api"
+
+
+@pytest.fixture(scope="session")
+def rest_api_client(configure_e2e_tests_env):
+    config_ = config.OpikConfig()
+    httpx_client_ = httpx_client.get(workspace=config_.workspace, api_key=None)
+
+    rest_api_client_ = client.OpikApi(
+        base_url=config_.url_override, httpx_client=httpx_client_
+    )
+    return rest_api_client_
diff --git a/sdks/python/tests/e2e/test_track_decorator_function.py b/sdks/python/tests/e2e/test_track_decorator_function.py
new file mode 100644
index 
0000000000..f2e79a20e6 --- /dev/null +++ b/sdks/python/tests/e2e/test_track_decorator_function.py @@ -0,0 +1,70 @@ +from . import verifiers +import opik + + +from opik import opik_context + + +def test_tracked_function__happyflow(rest_api_client): + # Setup + ID_STORAGE = {} + + @opik.track( + tags=["outer-tag1", "outer-tag2"], + metadata={"outer-metadata-key": "outer-metadata-value"}, + ) + def f_outer(x): + ID_STORAGE["f_outer-trace-id"] = opik_context.get_current_trace().id + ID_STORAGE["f_outer-span-id"] = opik_context.get_current_span().id + + f_inner("inner-input") + return "outer-output" + + @opik.track( + tags=["inner-tag1", "inner-tag2"], + metadata={"inner-metadata-key": "inner-metadata-value"}, + ) + def f_inner(y): + ID_STORAGE["f_inner-span-id"] = opik_context.get_current_span().id + return "inner-output" + + # Call + f_outer("outer-input") + opik.flush_tracker() + + # Verify trace + verifiers.verify_trace( + rest_client=rest_api_client, + trace_id=ID_STORAGE["f_outer-trace-id"], + name="f_outer", + input={"x": "outer-input"}, + output={"output": "outer-output"}, + metadata={"outer-metadata-key": "outer-metadata-value"}, + tags=["outer-tag1", "outer-tag2"], + ) + + # Verify top level span + verifiers.verify_span( + rest_client=rest_api_client, + span_id=ID_STORAGE["f_outer-span-id"], + parent_span_id=None, + trace_id=ID_STORAGE["f_outer-trace-id"], + name="f_outer", + input={"x": "outer-input"}, + output={"output": "outer-output"}, + metadata={"outer-metadata-key": "outer-metadata-value"}, + tags=["outer-tag1", "outer-tag2"], + ) + + # Verify nested span + verifiers.verify_span( + rest_client=rest_api_client, + span_id=ID_STORAGE["f_inner-span-id"], + parent_span_id=ID_STORAGE["f_outer-span-id"], + trace_id=ID_STORAGE["f_outer-trace-id"], + name="f_inner", + input={"y": "inner-input"}, + output={"output": "inner-output"}, + metadata={"inner-metadata-key": "inner-metadata-value"}, + tags=["inner-tag1", "inner-tag2"], + ) diff --git a/sdks/python/tests/e2e/verifiers/__init__.py b/sdks/python/tests/e2e/verifiers/__init__.py new file mode 100644 index 0000000000..07552be02a --- /dev/null +++ b/sdks/python/tests/e2e/verifiers/__init__.py @@ -0,0 +1,7 @@ +from .verifiers import verify_trace, verify_span + + +__all__ = [ + "verify_trace", + "verify_span", +] diff --git a/sdks/python/tests/e2e/verifiers/verifiers.py b/sdks/python/tests/e2e/verifiers/verifiers.py new file mode 100644 index 0000000000..aad7f4aeba --- /dev/null +++ b/sdks/python/tests/e2e/verifiers/verifiers.py @@ -0,0 +1,65 @@ +from typing import Optional, Dict, Any, List +from opik.rest_api import client as rest_api_client +from opik import synchronization +import mock + + +def verify_trace( + rest_client: rest_api_client.OpikApi, + trace_id: str, + name: str = mock.ANY, # type: ignore + metadata: Dict[str, Any] = mock.ANY, # type: ignore + input: Dict[str, Any] = mock.ANY, # type: ignore + output: Dict[str, Any] = mock.ANY, # type: ignore + tags: List[str] = mock.ANY, # type: ignore +): + if not synchronization.until( + lambda: (rest_client.traces.get_trace_by_id(id=trace_id) is not None), + allow_errors=True, + ): + raise AssertionError(f"Failed to get trace with id {trace_id}.") + + trace = rest_client.traces.get_trace_by_id(id=trace_id) + + assert trace.name == name, f"{trace.name} != {name}" + assert trace.input == input, f"{trace.input} != {input}" + assert trace.output == output, f"{trace.output} != {output}" + assert trace.tags == tags, f"{trace.tags} != {tags}" + assert trace.metadata == metadata, 
f"{trace.metadata} != {metadata}" + + +def verify_span( + rest_client: rest_api_client.OpikApi, + span_id: str, + trace_id: str, + parent_span_id: Optional[str], + name: str = mock.ANY, # type: ignore + metadata: Dict[str, Any] = mock.ANY, # type: ignore + input: Dict[str, Any] = mock.ANY, # type: ignore + output: Dict[str, Any] = mock.ANY, # type: ignore + tags: List[str] = mock.ANY, # type: ignore + type: str = mock.ANY, # type: ignore +): + if not synchronization.until( + lambda: (rest_client.spans.get_span_by_id(id=span_id) is not None), + allow_errors=True, + ): + raise AssertionError(f"Failed to get span with id {id}.") + + span = rest_client.spans.get_span_by_id(id=span_id) + + assert span.trace_id == trace_id, f"{span.trace_id} != {trace_id}" + + if parent_span_id is None: + assert span.parent_span_id is None, f"{span.parent_span_id} != {parent_span_id}" + else: + assert ( + span.parent_span_id == parent_span_id + ), f"{span.parent_span_id} != {parent_span_id}" + + assert span.name == name, f"{span.name} != {name}" + assert span.input == input, f"{span.input} != {input}" + assert span.output == output, f"{span.output} != {output}" + assert span.tags == tags, f"{span.tags} != {tags}" + assert span.metadata == metadata, f"{span.metadata} != {metadata}" + assert span.type == type, f"{span.type} != {type}" From fea4b07e28a4dac1f242082d7692dd68eed63104 Mon Sep 17 00:00:00 2001 From: Aliaksandr Kuzmik Date: Thu, 5 Sep 2024 16:39:07 +0200 Subject: [PATCH 02/10] Replace low level api client usage with Opik class usage in e2e tests. Add test for dataset creation and population. --- .../src/opik/api_objects/opik_client.py | 8 +- sdks/python/tests/e2e/conftest.py | 15 +-- .../e2e/test_track_decorator_function.py | 8 +- sdks/python/tests/e2e/verifiers/__init__.py | 3 +- sdks/python/tests/e2e/verifiers/verifiers.py | 96 +++++++++++++++---- sdks/python/tests/testlib/testlib_dsl.py | 4 +- 6 files changed, 100 insertions(+), 34 deletions(-) diff --git a/sdks/python/src/opik/api_objects/opik_client.py b/sdks/python/src/opik/api_objects/opik_client.py index 860f01e710..ecd578bc07 100644 --- a/sdks/python/src/opik/api_objects/opik_client.py +++ b/sdks/python/src/opik/api_objects/opik_client.py @@ -17,7 +17,7 @@ ) from ..message_processing import streamer_constructors, messages from ..rest_api import client as rest_api_client -from ..rest_api.types import dataset_public +from ..rest_api.types import dataset_public, trace_public, span_public from .. 
import datetime_helpers, config, httpx_client @@ -377,6 +377,12 @@ def flush(self, timeout: Optional[int] = None) -> None: timeout = timeout if timeout is not None else self._flush_timeout self._streamer.flush(timeout) + def get_trace_content(self, id: str) -> trace_public.TracePublic: + return self._rest_client.traces.get_trace_by_id(id) + + def get_span_content(self, id: str) -> span_public.SpanPublic: + return self._rest_client.spans.get_span_by_id(id) + @functools.lru_cache() def get_client_cached() -> Opik: diff --git a/sdks/python/tests/e2e/conftest.py b/sdks/python/tests/e2e/conftest.py index f8a6029690..56618e48bc 100644 --- a/sdks/python/tests/e2e/conftest.py +++ b/sdks/python/tests/e2e/conftest.py @@ -1,7 +1,6 @@ import os -from opik import config -from opik.rest_api import client -from opik import httpx_client + +import opik.api_objects.opik_client import pytest @@ -13,11 +12,7 @@ def configure_e2e_tests_env(): @pytest.fixture(scope="session") -def rest_api_client(configure_e2e_tests_env): - config_ = config.OpikConfig() - httpx_client_ = httpx_client.get(workspace=config_.workspace, api_key=None) +def opik_client(configure_e2e_tests_env): + opik_client_ = opik.api_objects.opik_client.get_client_cached() - rest_api_client_ = client.OpikApi( - base_url=config_.url_override, httpx_client=httpx_client_ - ) - return rest_api_client_ + return opik_client_ diff --git a/sdks/python/tests/e2e/test_track_decorator_function.py b/sdks/python/tests/e2e/test_track_decorator_function.py index f2e79a20e6..0f3dfcfd89 100644 --- a/sdks/python/tests/e2e/test_track_decorator_function.py +++ b/sdks/python/tests/e2e/test_track_decorator_function.py @@ -5,7 +5,7 @@ from opik import opik_context -def test_tracked_function__happyflow(rest_api_client): +def test_tracked_function__happyflow(opik_client): # Setup ID_STORAGE = {} @@ -34,7 +34,7 @@ def f_inner(y): # Verify trace verifiers.verify_trace( - rest_client=rest_api_client, + opik_client=opik_client, trace_id=ID_STORAGE["f_outer-trace-id"], name="f_outer", input={"x": "outer-input"}, @@ -45,7 +45,7 @@ def f_inner(y): # Verify top level span verifiers.verify_span( - rest_client=rest_api_client, + opik_client=opik_client, span_id=ID_STORAGE["f_outer-span-id"], parent_span_id=None, trace_id=ID_STORAGE["f_outer-trace-id"], @@ -58,7 +58,7 @@ def f_inner(y): # Verify nested span verifiers.verify_span( - rest_client=rest_api_client, + opik_client=opik_client, span_id=ID_STORAGE["f_inner-span-id"], parent_span_id=ID_STORAGE["f_outer-span-id"], trace_id=ID_STORAGE["f_outer-trace-id"], diff --git a/sdks/python/tests/e2e/verifiers/__init__.py b/sdks/python/tests/e2e/verifiers/__init__.py index 07552be02a..b83846bb52 100644 --- a/sdks/python/tests/e2e/verifiers/__init__.py +++ b/sdks/python/tests/e2e/verifiers/__init__.py @@ -1,7 +1,8 @@ -from .verifiers import verify_trace, verify_span +from .verifiers import verify_trace, verify_span, verify_dataset __all__ = [ "verify_trace", "verify_span", + "verify_dataset", ] diff --git a/sdks/python/tests/e2e/verifiers/verifiers.py b/sdks/python/tests/e2e/verifiers/verifiers.py index aad7f4aeba..116a87c30a 100644 --- a/sdks/python/tests/e2e/verifiers/verifiers.py +++ b/sdks/python/tests/e2e/verifiers/verifiers.py @@ -1,11 +1,16 @@ from typing import Optional, Dict, Any, List -from opik.rest_api import client as rest_api_client +import opik +import json + +from opik.api_objects.dataset import dataset_item from opik import synchronization + +from ...testlib import testlib_dsl import mock def verify_trace( - rest_client: 
rest_api_client.OpikApi, + opik_client: opik.Opik, trace_id: str, name: str = mock.ANY, # type: ignore metadata: Dict[str, Any] = mock.ANY, # type: ignore @@ -14,22 +19,28 @@ def verify_trace( tags: List[str] = mock.ANY, # type: ignore ): if not synchronization.until( - lambda: (rest_client.traces.get_trace_by_id(id=trace_id) is not None), + lambda: (opik_client.get_trace_content(id=trace_id) is not None), allow_errors=True, ): raise AssertionError(f"Failed to get trace with id {trace_id}.") - trace = rest_client.traces.get_trace_by_id(id=trace_id) + trace = opik_client.get_trace_content(id=trace_id) assert trace.name == name, f"{trace.name} != {name}" - assert trace.input == input, f"{trace.input} != {input}" - assert trace.output == output, f"{trace.output} != {output}" - assert trace.tags == tags, f"{trace.tags} != {tags}" - assert trace.metadata == metadata, f"{trace.metadata} != {metadata}" + assert trace.input == input, testlib_dsl.prepare_difference_report( + trace.input, input + ) + assert trace.output == output, testlib_dsl.prepare_difference_report( + trace.output, output + ) + assert trace.metadata == metadata, testlib_dsl.prepare_difference_report( + trace.metadata, metadata + ) + assert trace.tags == tags, testlib_dsl.prepare_difference_report(trace.tags, tags) def verify_span( - rest_client: rest_api_client.OpikApi, + opik_client: opik.Opik, span_id: str, trace_id: str, parent_span_id: Optional[str], @@ -41,12 +52,12 @@ def verify_span( type: str = mock.ANY, # type: ignore ): if not synchronization.until( - lambda: (rest_client.spans.get_span_by_id(id=span_id) is not None), + lambda: (opik_client.get_span_content(id=span_id) is not None), allow_errors=True, ): - raise AssertionError(f"Failed to get span with id {id}.") + raise AssertionError(f"Failed to get span with id {span_id}.") - span = rest_client.spans.get_span_by_id(id=span_id) + span = opik_client.get_span_content(id=span_id) assert span.trace_id == trace_id, f"{span.trace_id} != {trace_id}" @@ -58,8 +69,61 @@ def verify_span( ), f"{span.parent_span_id} != {parent_span_id}" assert span.name == name, f"{span.name} != {name}" - assert span.input == input, f"{span.input} != {input}" - assert span.output == output, f"{span.output} != {output}" - assert span.tags == tags, f"{span.tags} != {tags}" - assert span.metadata == metadata, f"{span.metadata} != {metadata}" assert span.type == type, f"{span.type} != {type}" + + assert span.input == input, testlib_dsl.prepare_difference_report(span.input, input) + assert span.output == output, testlib_dsl.prepare_difference_report( + span.output, output + ) + assert span.metadata == metadata, testlib_dsl.prepare_difference_report( + span.metadata, metadata + ) + assert span.tags == tags, testlib_dsl.prepare_difference_report(span.tags, tags) + + +def verify_dataset( + opik_client: opik.Opik, + name: str, + description: str = mock.ANY, + dataset_items: List[dataset_item.DatasetItem] = mock.ANY, +): + if not synchronization.until( + lambda: (opik_client.get_dataset(name=name) is not None), + allow_errors=True, + ): + raise AssertionError(f"Failed to get dataset with name {name}.") + + actual_dataset = opik_client.get_dataset(name=name) + assert actual_dataset.description == description + + actual_dataset_items = actual_dataset.get_all_items() + actual_dataset_items_dicts = [item.__dict__ for item in actual_dataset_items] + expected_dataset_items_dicts = [item.__dict__ for item in dataset_items] + + sorted_actual_items = sorted( + actual_dataset_items_dicts, key=lambda item: 
json.dumps(item, sort_keys=True) + ) + sorted_expected_items = sorted( + expected_dataset_items_dicts, key=lambda item: json.dumps(item, sort_keys=True) + ) + + for actual_item, expected_item in zip(sorted_actual_items, sorted_expected_items): + _assert_dicts_equal(actual_item, expected_item, ignore_keys=["id"]) + + +def _assert_dicts_equal( + dict1: Dict[str, Any], + dict2: Dict[str, Any], + ignore_keys: Optional[List[str]] = None, +) -> bool: + dict1_copy, dict2_copy = {**dict1}, {**dict2} + + ignore_keys = [] if ignore_keys is None else ignore_keys + + for key in ignore_keys: + dict1_copy.pop(key, None) + dict2_copy.pop(key, None) + + assert dict1_copy == dict2_copy, testlib_dsl.prepare_difference_report( + dict1_copy, dict2_copy + ) diff --git a/sdks/python/tests/testlib/testlib_dsl.py b/sdks/python/tests/testlib/testlib_dsl.py index 88729fe509..e12b8d2243 100644 --- a/sdks/python/tests/testlib/testlib_dsl.py +++ b/sdks/python/tests/testlib/testlib_dsl.py @@ -73,7 +73,7 @@ def __repr__(self): return "" -def _prepare_difference_report(expected: Any, actual: Any) -> str: +def prepare_difference_report(expected: Any, actual: Any) -> str: try: diff_report = deepdiff.DeepDiff( expected, actual, exclude_types=[_AnyButNone, mock.mock._ANY] @@ -94,7 +94,7 @@ def assert_traces_match(trace_expected, trace_actual): test_case.assertDictEqual( trace_expected, trace_actual, - msg="\n" + _prepare_difference_report(trace_expected, trace_actual), + msg="\n" + prepare_difference_report(trace_expected, trace_actual), ) From c6a9cd1db58d863beb3db6a863e246a22357a2f9 Mon Sep 17 00:00:00 2001 From: Aliaksandr Kuzmik Date: Thu, 5 Sep 2024 16:51:57 +0200 Subject: [PATCH 03/10] Add missing files, add e2e test for dataset --- sdks/python/tests/e2e/test_dataset.py | 53 +++++++++++++++++++ ..._decorator_function.py => test_tracing.py} | 44 +++++++++++++++ 2 files changed, 97 insertions(+) create mode 100644 sdks/python/tests/e2e/test_dataset.py rename sdks/python/tests/e2e/{test_track_decorator_function.py => test_tracing.py} (62%) diff --git a/sdks/python/tests/e2e/test_dataset.py b/sdks/python/tests/e2e/test_dataset.py new file mode 100644 index 0000000000..c1f4956793 --- /dev/null +++ b/sdks/python/tests/e2e/test_dataset.py @@ -0,0 +1,53 @@ +import opik +import random +import string +from . 
import verifiers
+from opik.api_objects.dataset import dataset_item
+
+
+def test_create_and_populate_dataset__happyflow(opik_client: opik.Opik):
+    DATASET_NAME = "e2e-tests-dataset-".join(
+        random.choice(string.ascii_letters) for _ in range(6)
+    )
+    DESCRIPTION = "E2E test dataset"
+
+    dataset = opik_client.create_dataset(DATASET_NAME, description=DESCRIPTION)
+
+    dataset.insert(
+        [
+            {
+                "input": {"question": "What is the capital of France?"},
+                "expected_output": {"output": "Paris"},
+            },
+            {
+                "input": {"question": "What is the capital of Germany?"},
+                "expected_output": {"output": "Berlin"},
+            },
+            {
+                "input": {"question": "What is the capital of Poland?"},
+                "expected_output": {"output": "Warsaw"},
+            },
+        ]
+    )
+
+    EXPECTED_DATASET_ITEMS = [
+        dataset_item.DatasetItem(
+            input={"question": "What is the capital of France?"},
+            expected_output={"output": "Paris"},
+        ),
+        dataset_item.DatasetItem(
+            input={"question": "What is the capital of Germany?"},
+            expected_output={"output": "Berlin"},
+        ),
+        dataset_item.DatasetItem(
+            input={"question": "What is the capital of Poland?"},
+            expected_output={"output": "Warsaw"},
+        ),
+    ]
+
+    verifiers.verify_dataset(
+        opik_client=opik_client,
+        name=DATASET_NAME,
+        description=DESCRIPTION,
+        dataset_items=EXPECTED_DATASET_ITEMS,
+    )
diff --git a/sdks/python/tests/e2e/test_track_decorator_function.py b/sdks/python/tests/e2e/test_tracing.py
similarity index 62%
rename from sdks/python/tests/e2e/test_track_decorator_function.py
rename to sdks/python/tests/e2e/test_tracing.py
index 0f3dfcfd89..eb0f240f17 100644
--- a/sdks/python/tests/e2e/test_track_decorator_function.py
+++ b/sdks/python/tests/e2e/test_tracing.py
@@ -68,3 +68,47 @@
         metadata={"inner-metadata-key": "inner-metadata-value"},
         tags=["inner-tag1", "inner-tag2"],
     )
+
+
+def test_manually_created_trace_and_span__happyflow(opik_client: opik.Opik):
+    # Call
+    trace = opik_client.trace(
+        name="trace-name",
+        input={"input": "trace-input"},
+        output={"output": "trace-output"},
+        tags=["trace-tag"],
+        metadata={"trace-metadata-key": "trace-metadata-value"},
+    )
+    span = trace.span(
+        name="span-name",
+        input={"input": "span-input"},
+        output={"output": "span-output"},
+        tags=["span-tag"],
+        metadata={"span-metadata-key": "span-metadata-value"},
+    )
+
+    opik_client.flush()
+
+    # Verify trace
+    verifiers.verify_trace(
+        opik_client=opik_client,
+        trace_id=trace.id,
+        name="trace-name",
+        input={"input": "trace-input"},
+        output={"output": "trace-output"},
+        tags=["trace-tag"],
+        metadata={"trace-metadata-key": "trace-metadata-value"},
+    )
+
+    # Verify span
+    verifiers.verify_span(
+        opik_client=opik_client,
+        span_id=span.id,
+        parent_span_id=None,
+        trace_id=span.trace_id,
+        name="span-name",
+        input={"input": "span-input"},
+        output={"output": "span-output"},
+        tags=["span-tag"],
+        metadata={"span-metadata-key": "span-metadata-value"},
+    )
From c8d3113640ce909c26656e7ff1029509457242a8 Mon Sep 17 00:00:00 2001
From: Aliaksandr Kuzmik
Date: Thu, 5 Sep 2024 16:56:42 +0200
Subject: [PATCH 04/10] Add comparison for the amount of dataset items

---
 sdks/python/tests/e2e/verifiers/verifiers.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/sdks/python/tests/e2e/verifiers/verifiers.py b/sdks/python/tests/e2e/verifiers/verifiers.py
index 116a87c30a..0a5418c4bb 100644
--- a/sdks/python/tests/e2e/verifiers/verifiers.py
+++ b/sdks/python/tests/e2e/verifiers/verifiers.py
@@ -97,6 +97,10 @@ def verify_dataset(
     assert actual_dataset.description == description
actual_dataset_items = actual_dataset.get_all_items() + assert ( + len(actual_dataset_items) == len(dataset_items) + ), f"Amount of actual dataset items ({len(actual_dataset_items)}) is not the same as of expected ones ({len(dataset_items)})" + actual_dataset_items_dicts = [item.__dict__ for item in actual_dataset_items] expected_dataset_items_dicts = [item.__dict__ for item in dataset_items] From c22b4914c94dc05ec1ab03b1866f8f9197f04823 Mon Sep 17 00:00:00 2001 From: Aliaksandr Kuzmik Date: Fri, 6 Sep 2024 11:05:54 +0200 Subject: [PATCH 05/10] Refactor testlib --- sdks/python/tests/conftest.py | 6 +- sdks/python/tests/e2e/verifiers/verifiers.py | 42 ++---- .../langchain/test_langchain.py | 34 +++-- .../library_integration/openai/test_openai.py | 40 ++++-- sdks/python/tests/testlib/__init__.py | 12 +- sdks/python/tests/testlib/any_but_none.py | 17 +++ sdks/python/tests/testlib/assert_helpers.py | 40 ++++++ ... => backend_emulator_message_processor.py} | 4 +- sdks/python/tests/testlib/models.py | 47 +++++++ sdks/python/tests/testlib/testlib_dsl.py | 101 --------------- .../unit/decorator/test_tracker_outputs.py | 122 +++++++++++------- .../tests/unit/evaluation/test_evaluate.py | 12 +- 12 files changed, 256 insertions(+), 221 deletions(-) create mode 100644 sdks/python/tests/testlib/any_but_none.py create mode 100644 sdks/python/tests/testlib/assert_helpers.py rename sdks/python/tests/testlib/{fake_message_processor.py => backend_emulator_message_processor.py} (96%) create mode 100644 sdks/python/tests/testlib/models.py delete mode 100644 sdks/python/tests/testlib/testlib_dsl.py diff --git a/sdks/python/tests/conftest.py b/sdks/python/tests/conftest.py index 9ae6f18c18..8f388da27c 100644 --- a/sdks/python/tests/conftest.py +++ b/sdks/python/tests/conftest.py @@ -1,7 +1,7 @@ import pytest from opik import context_storage from opik.api_objects import opik_client -from .testlib import fake_message_processor +from .testlib import backend_emulator_message_processor from opik.message_processing import streamer_constructors @@ -22,7 +22,9 @@ def shutdown_cached_client(): @pytest.fixture def fake_streamer(): try: - fake_message_processor_ = fake_message_processor.FakeMessageProcessor() + fake_message_processor_ = ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor() + ) streamer = streamer_constructors.construct_streamer( message_processor=fake_message_processor_, n_consumers=1, diff --git a/sdks/python/tests/e2e/verifiers/verifiers.py b/sdks/python/tests/e2e/verifiers/verifiers.py index 0a5418c4bb..ec3cfd6f13 100644 --- a/sdks/python/tests/e2e/verifiers/verifiers.py +++ b/sdks/python/tests/e2e/verifiers/verifiers.py @@ -5,7 +5,7 @@ from opik.api_objects.dataset import dataset_item from opik import synchronization -from ...testlib import testlib_dsl +from ... 
import testlib import mock @@ -27,16 +27,14 @@ def verify_trace( trace = opik_client.get_trace_content(id=trace_id) assert trace.name == name, f"{trace.name} != {name}" - assert trace.input == input, testlib_dsl.prepare_difference_report( - trace.input, input - ) - assert trace.output == output, testlib_dsl.prepare_difference_report( + assert trace.input == input, testlib.prepare_difference_report(trace.input, input) + assert trace.output == output, testlib.prepare_difference_report( trace.output, output ) - assert trace.metadata == metadata, testlib_dsl.prepare_difference_report( + assert trace.metadata == metadata, testlib.prepare_difference_report( trace.metadata, metadata ) - assert trace.tags == tags, testlib_dsl.prepare_difference_report(trace.tags, tags) + assert trace.tags == tags, testlib.prepare_difference_report(trace.tags, tags) def verify_span( @@ -71,14 +69,12 @@ def verify_span( assert span.name == name, f"{span.name} != {name}" assert span.type == type, f"{span.type} != {type}" - assert span.input == input, testlib_dsl.prepare_difference_report(span.input, input) - assert span.output == output, testlib_dsl.prepare_difference_report( - span.output, output - ) - assert span.metadata == metadata, testlib_dsl.prepare_difference_report( + assert span.input == input, testlib.prepare_difference_report(span.input, input) + assert span.output == output, testlib.prepare_difference_report(span.output, output) + assert span.metadata == metadata, testlib.prepare_difference_report( span.metadata, metadata ) - assert span.tags == tags, testlib_dsl.prepare_difference_report(span.tags, tags) + assert span.tags == tags, testlib.prepare_difference_report(span.tags, tags) def verify_dataset( @@ -112,22 +108,4 @@ def verify_dataset( ) for actual_item, expected_item in zip(sorted_actual_items, sorted_expected_items): - _assert_dicts_equal(actual_item, expected_item, ignore_keys=["id"]) - - -def _assert_dicts_equal( - dict1: Dict[str, Any], - dict2: Dict[str, Any], - ignore_keys: Optional[List[str]] = None, -) -> bool: - dict1_copy, dict2_copy = {**dict1}, {**dict2} - - ignore_keys = [] if ignore_keys is None else ignore_keys - - for key in ignore_keys: - dict1_copy.pop(key, None) - dict2_copy.pop(key, None) - - assert dict1_copy == dict2_copy, testlib_dsl.prepare_difference_report( - dict1_copy, dict2_copy - ) + testlib.assert_dicts_equal(actual_item, expected_item, ignore_keys=["id"]) diff --git a/sdks/python/tests/library_integration/langchain/test_langchain.py b/sdks/python/tests/library_integration/langchain/test_langchain.py index e4176035e1..11c944ee5d 100644 --- a/sdks/python/tests/library_integration/langchain/test_langchain.py +++ b/sdks/python/tests/library_integration/langchain/test_langchain.py @@ -1,12 +1,12 @@ import mock import os from opik.message_processing import streamer_constructors -from ...testlib import fake_message_processor +from ...testlib import backend_emulator_message_processor from ...testlib import ( SpanModel, TraceModel, ANY_BUT_NONE, - assert_traces_match, + assert_equal, ) import pytest import opik @@ -30,7 +30,9 @@ def ensure_openai_configured(): def test_langchain__happyflow( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -145,13 +147,15 @@ def test_langchain__happyflow( assert len(fake_message_processor_.trace_trees) == 
1 assert len(callback.created_traces()) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_langchain__openai_llm_is_used__token_usage_is_logged__happyflow( fake_streamer, ensure_openai_configured ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -240,13 +244,15 @@ def test_langchain__openai_llm_is_used__token_usage_is_logged__happyflow( assert len(fake_message_processor_.trace_trees) == 1 assert len(callback.created_traces()) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_langchain_callback__used_inside_another_track_function__data_attached_to_existing_trace_tree( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -380,13 +386,15 @@ def f(x): assert len(fake_message_processor_.trace_trees) == 1 assert len(callback.created_traces()) == 0 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_langchain_callback__used_when_there_was_already_existing_trace_without_span__data_attached_to_existing_trace( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -517,13 +525,15 @@ def f(): assert len(fake_message_processor_.trace_trees) == 1 assert len(callback.created_traces()) == 0 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_langchain_callback__used_when_there_was_already_existing_span_without_trace__data_attached_to_existing_span( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -653,4 +663,4 @@ def f(): assert len(fake_message_processor_.span_trees) == 1 assert len(callback.created_traces()) == 0 - assert_traces_match(EXPECTED_SPANS_TREE, fake_message_processor_.span_trees[0]) + assert_equal(EXPECTED_SPANS_TREE, fake_message_processor_.span_trees[0]) diff --git a/sdks/python/tests/library_integration/openai/test_openai.py b/sdks/python/tests/library_integration/openai/test_openai.py index e84418ef93..82fa7b9347 100644 --- a/sdks/python/tests/library_integration/openai/test_openai.py +++ b/sdks/python/tests/library_integration/openai/test_openai.py @@ -7,12 +7,12 @@ import opik from opik.message_processing import streamer_constructors from opik.integrations.openai import track_openai -from ...testlib import fake_message_processor +from ...testlib import backend_emulator_message_processor from ...testlib 
import ( SpanModel, TraceModel, ANY_BUT_NONE, - assert_traces_match, + assert_equal, ) @@ -29,7 +29,9 @@ def ensure_openai_configured(): def test_openai_client_chat_completions_create__happyflow(fake_streamer): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -94,13 +96,15 @@ def test_openai_client_chat_completions_create__happyflow(fake_streamer): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_openai_client_chat_completions_create__create_raises_an_error__span_and_trace_finished_gracefully( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -159,13 +163,15 @@ def test_openai_client_chat_completions_create__create_raises_an_error__span_and assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_openai_client_chat_completions_create__openai_call_made_in_another_tracked_function__openai_span_attached_to_existing_trace( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -237,13 +243,15 @@ def f(): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_openai_client_chat_completions_create__async_openai_call_made_in_another_tracked_async_function__openai_span_attached_to_existing_trace( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -315,13 +323,15 @@ async def async_f(): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_openai_client_chat_completions_create__stream_mode_is_on__generator_tracked_correctly( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -395,13 +405,15 @@ def test_openai_client_chat_completions_create__stream_mode_is_on__generator_tra assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def 
test_openai_client_chat_completions_create__async_openai_call_made_in_another_tracked_async_function__streaming_mode_enabled__openai_span_attached_to_existing_trace( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -479,4 +491,4 @@ async def async_f(): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) diff --git a/sdks/python/tests/testlib/__init__.py b/sdks/python/tests/testlib/__init__.py index 0c6b9e6c18..30ca39068c 100644 --- a/sdks/python/tests/testlib/__init__.py +++ b/sdks/python/tests/testlib/__init__.py @@ -1,10 +1,14 @@ -from .fake_message_processor import FakeMessageProcessor -from .testlib_dsl import SpanModel, TraceModel, ANY_BUT_NONE, assert_traces_match +from .backend_emulator_message_processor import BackendEmulatorMessageProcessor +from .models import SpanModel, TraceModel +from .assert_helpers import assert_dicts_equal, prepare_difference_report, assert_equal +from .any_but_none import ANY_BUT_NONE __all__ = [ "SpanModel", "TraceModel", "ANY_BUT_NONE", - "assert_traces_match", - "FakeMessageProcessor", + "assert_equal", + "assert_dicts_equal", + "prepare_difference_report", + "BackendEmulatorMessageProcessor", ] diff --git a/sdks/python/tests/testlib/any_but_none.py b/sdks/python/tests/testlib/any_but_none.py new file mode 100644 index 0000000000..babbe715a8 --- /dev/null +++ b/sdks/python/tests/testlib/any_but_none.py @@ -0,0 +1,17 @@ +class AnyButNone: + "A helper object that compares equal to everything but None." + + def __eq__(self, other): + if other is None: + return False + + return True + + def __ne__(self, other): + return not self.__eq__(other) + + def __repr__(self): + return "" + + +ANY_BUT_NONE = AnyButNone() diff --git a/sdks/python/tests/testlib/assert_helpers.py b/sdks/python/tests/testlib/assert_helpers.py new file mode 100644 index 0000000000..e0b0d032f1 --- /dev/null +++ b/sdks/python/tests/testlib/assert_helpers.py @@ -0,0 +1,40 @@ +from typing import List, Any, Optional, Dict + +import logging +import mock +import deepdiff + +from . 
import any_but_none + +LOGGER = logging.getLogger(__name__) + + +def prepare_difference_report(expected: Any, actual: Any) -> str: + try: + diff_report = deepdiff.DeepDiff( + expected, actual, exclude_types=[any_but_none.AnyButNone, mock.mock._ANY] + ).pretty() + return diff_report + except Exception: + LOGGER.debug("Failed to prepare difference report", exc_info=True) + return "" + + +def assert_equal(expected, actual): + assert actual == expected, prepare_difference_report(actual, expected) + + +def assert_dicts_equal( + dict1: Dict[str, Any], + dict2: Dict[str, Any], + ignore_keys: Optional[List[str]] = None, +) -> bool: + dict1_copy, dict2_copy = {**dict1}, {**dict2} + + ignore_keys = [] if ignore_keys is None else ignore_keys + + for key in ignore_keys: + dict1_copy.pop(key, None) + dict2_copy.pop(key, None) + + assert dict1_copy == dict2_copy, prepare_difference_report(dict1_copy, dict2_copy) diff --git a/sdks/python/tests/testlib/fake_message_processor.py b/sdks/python/tests/testlib/backend_emulator_message_processor.py similarity index 96% rename from sdks/python/tests/testlib/fake_message_processor.py rename to sdks/python/tests/testlib/backend_emulator_message_processor.py index 8d3fa35889..db4ac90e05 100644 --- a/sdks/python/tests/testlib/fake_message_processor.py +++ b/sdks/python/tests/testlib/backend_emulator_message_processor.py @@ -1,10 +1,10 @@ from opik.message_processing import message_processors, messages from typing import List, Tuple, Type, Dict, Union -from .testlib_dsl import TraceModel, SpanModel, FeedbackScoreModel +from .models import TraceModel, SpanModel, FeedbackScoreModel -class FakeMessageProcessor(message_processors.BaseMessageProcessor): +class BackendEmulatorMessageProcessor(message_processors.BaseMessageProcessor): def __init__(self) -> None: self.processed_messages: List[messages.BaseMessage] = [] self.trace_trees: List[TraceModel] = [] diff --git a/sdks/python/tests/testlib/models.py b/sdks/python/tests/testlib/models.py new file mode 100644 index 0000000000..c4c6823e87 --- /dev/null +++ b/sdks/python/tests/testlib/models.py @@ -0,0 +1,47 @@ +from typing import List, Any, Optional, Dict + +import dataclasses +import datetime + + +@dataclasses.dataclass +class SpanModel: + id: str + start_time: datetime.datetime + name: Optional[str] = None + input: Any = None + output: Any = None + tags: Optional[List[str]] = None + metadata: Optional[Dict[str, Any]] = None + type: str = "general" + usage: Optional[Dict[str, Any]] = None + end_time: Optional[datetime.datetime] = None + spans: List["SpanModel"] = dataclasses.field(default_factory=list) + feedback_scores: List["FeedbackScoreModel"] = dataclasses.field( + default_factory=list + ) + + +@dataclasses.dataclass +class TraceModel: + id: str + start_time: datetime.datetime + name: Optional[str] + input: Any = None + output: Any = None + tags: Optional[List[str]] = None + metadata: Optional[Dict[str, Any]] = None + end_time: Optional[datetime.datetime] = None + spans: List["SpanModel"] = dataclasses.field(default_factory=list) + feedback_scores: List["FeedbackScoreModel"] = dataclasses.field( + default_factory=list + ) + + +@dataclasses.dataclass +class FeedbackScoreModel: + id: str + name: str + value: float + category_name: Optional[str] = None + reason: Optional[str] = None diff --git a/sdks/python/tests/testlib/testlib_dsl.py b/sdks/python/tests/testlib/testlib_dsl.py deleted file mode 100644 index e12b8d2243..0000000000 --- a/sdks/python/tests/testlib/testlib_dsl.py +++ /dev/null @@ -1,101 +0,0 @@ -from 
typing import List, Any, Optional, Dict - -import dataclasses -import logging -import unittest -import mock -import deepdiff -import datetime - - -LOGGER = logging.getLogger(__name__) - -# TODO: expand classes to have more attributes, current ones are considered to be -# a bare minimum for tests to check that traces have correct structure - - -@dataclasses.dataclass -class SpanModel: - id: str - start_time: datetime.datetime - name: Optional[str] = None - input: Any = None - output: Any = None - tags: Optional[List[str]] = None - metadata: Optional[Dict[str, Any]] = None - type: str = "general" - usage: Optional[Dict[str, Any]] = None - end_time: Optional[datetime.datetime] = None - spans: List["SpanModel"] = dataclasses.field(default_factory=list) - feedback_scores: List["FeedbackScoreModel"] = dataclasses.field( - default_factory=list - ) - - -@dataclasses.dataclass -class TraceModel: - id: str - start_time: datetime.datetime - name: Optional[str] - input: Any = None - output: Any = None - tags: Optional[List[str]] = None - metadata: Optional[Dict[str, Any]] = None - end_time: Optional[datetime.datetime] = None - spans: List["SpanModel"] = dataclasses.field(default_factory=list) - feedback_scores: List["FeedbackScoreModel"] = dataclasses.field( - default_factory=list - ) - - -@dataclasses.dataclass -class FeedbackScoreModel: - id: str - name: str - value: float - category_name: Optional[str] = None - reason: Optional[str] = None - - -class _AnyButNone: - "A helper object that compares equal to everything but None." - - def __eq__(self, other): - if other is None: - return False - - return True - - def __ne__(self, other): - return not self.__eq__(other) - - def __repr__(self): - return "" - - -def prepare_difference_report(expected: Any, actual: Any) -> str: - try: - diff_report = deepdiff.DeepDiff( - expected, actual, exclude_types=[_AnyButNone, mock.mock._ANY] - ).pretty() - return diff_report - except Exception: - LOGGER.debug("Failed to prepare difference report", exc_info=True) - return "" - - -def assert_traces_match(trace_expected, trace_actual): - trace_expected = trace_expected.__dict__ - trace_actual = trace_actual.__dict__ - - test_case = unittest.TestCase() - test_case.maxDiff = None - - test_case.assertDictEqual( - trace_expected, - trace_actual, - msg="\n" + prepare_difference_report(trace_expected, trace_actual), - ) - - -ANY_BUT_NONE = _AnyButNone() diff --git a/sdks/python/tests/unit/decorator/test_tracker_outputs.py b/sdks/python/tests/unit/decorator/test_tracker_outputs.py index 69edb9237b..7563d716ad 100644 --- a/sdks/python/tests/unit/decorator/test_tracker_outputs.py +++ b/sdks/python/tests/unit/decorator/test_tracker_outputs.py @@ -7,17 +7,19 @@ from opik import context_storage, opik_context from opik.api_objects import opik_client -from ...testlib import fake_message_processor +from ...testlib import backend_emulator_message_processor from ...testlib import ( SpanModel, TraceModel, ANY_BUT_NONE, - assert_traces_match, + assert_equal, ) def test_track__one_nested_function__happyflow(fake_streamer): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -74,13 +76,15 @@ def f_outer(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, 
fake_message_processor_.trace_trees[0]) def test_track__one_nested_function__inputs_and_outputs_not_captured__inputs_and_outputs_initialized_with_Nones( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -122,13 +126,15 @@ def f(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_track__one_nested_function__output_is_dict__output_is_wrapped_by_tracker( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -170,11 +176,13 @@ def f(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_track__two_nested_functions__happyflow(fake_streamer): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -246,13 +254,15 @@ def f_outer(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_track__outer_function_has_two_separate_nested_function__happyflow( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -323,11 +333,13 @@ def f_outer(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_track__two_traces__happyflow(fake_streamer): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -396,18 +408,16 @@ def f_2(x): assert len(fake_message_processor_.trace_trees) == 2 - assert_traces_match( - EXPECTED_TRACE_TREES[0], fake_message_processor_.trace_trees[0] - ) - assert_traces_match( - EXPECTED_TRACE_TREES[1], fake_message_processor_.trace_trees[1] - ) + assert_equal(EXPECTED_TRACE_TREES[0], fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREES[1], fake_message_processor_.trace_trees[1]) def test_track__one_function__error_raised__trace_and_span_finished_correctly__outputs_are_None( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, 
fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -451,13 +461,15 @@ def f(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_track__one_async_function__error_raised__trace_and_span_finished_correctly__outputs_are_None( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -502,11 +514,13 @@ async def async_f(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_track__nested_calls_in_separate_threads__3_traces_in_result(fake_streamer): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -599,21 +613,17 @@ def f_outer(x): assert len(fake_message_processor_.trace_trees) == 3 - assert_traces_match( - EXPECTED_TRACE_TREES[0], fake_message_processor_.trace_trees[0] - ) - assert_traces_match( - EXPECTED_TRACE_TREES[1], fake_message_processor_.trace_trees[1] - ) - assert_traces_match( - EXPECTED_TRACE_TREES[2], fake_message_processor_.trace_trees[2] - ) + assert_equal(EXPECTED_TRACE_TREES[0], fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREES[1], fake_message_processor_.trace_trees[1]) + assert_equal(EXPECTED_TRACE_TREES[2], fake_message_processor_.trace_trees[2]) def test_track__single_generator_function_tracked__generator_exhausted__happyflow( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -660,13 +670,15 @@ def f(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_track__generator_function_tracked__generator_exhausted_in_another_tracked_function__trace_tree_remains_correct( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -744,13 +756,15 @@ def f_outer(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_track__single_async_function_tracked__happyflow( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ 
-794,13 +808,15 @@ async def async_f(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_track__nested_async_function_tracked__happyflow( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -859,13 +875,15 @@ async def async_f_outer(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_track__single_async_generator_function_tracked__generator_exhausted__happyflow( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -915,13 +933,15 @@ async def async_generator_user(): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_track__async_generator_inside_another_tracked_function__happyflow( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -986,13 +1006,15 @@ async def async_generator_user(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_track__distributed_tracing_with_headers__tracing_is_performed_in_2_threads__all_data_is_saved_in_1_trace_tree( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -1061,13 +1083,15 @@ def f_outer(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) def test_track__trace_already_created_not_by_decorator__decorator_just_attaches_new_span_to_it__trace_is_not_popped_from_context_in_the_end( fake_streamer, ): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_construct_online_streamer = mock.Mock() @@ -1121,4 +1145,4 @@ def f(x): assert len(fake_message_processor_.trace_trees) == 1 - assert_traces_match(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) + assert_equal(EXPECTED_TRACE_TREE, fake_message_processor_.trace_trees[0]) diff --git a/sdks/python/tests/unit/evaluation/test_evaluate.py 
b/sdks/python/tests/unit/evaluation/test_evaluate.py index cb13cadddc..584f6e9194 100644 --- a/sdks/python/tests/unit/evaluation/test_evaluate.py +++ b/sdks/python/tests/unit/evaluation/test_evaluate.py @@ -3,18 +3,20 @@ from opik.api_objects import opik_client from opik import evaluation from opik.evaluation import metrics -from ...testlib import fake_message_processor -from ...testlib.testlib_dsl import ( +from ...testlib import backend_emulator_message_processor +from ...testlib.models import ( TraceModel, FeedbackScoreModel, ANY_BUT_NONE, - assert_traces_match, + assert_equal, ) from opik.message_processing import streamer_constructors def test_evaluate_happyflow(fake_streamer): - fake_message_processor_: fake_message_processor.FakeMessageProcessor + fake_message_processor_: ( + backend_emulator_message_processor.BackendEmulatorMessageProcessor + ) streamer, fake_message_processor_ = fake_streamer mock_dataset = mock.Mock() @@ -121,4 +123,4 @@ def say_task(dataset_item: dataset_item.DatasetItem): for expected_trace, actual_trace in zip( EXPECTED_TRACE_TREES, fake_message_processor_.trace_trees ): - assert_traces_match(expected_trace, actual_trace) + assert_equal(expected_trace, actual_trace) From 5cbe735ddd8b16309ed7b0d204b8dbca0b812459 Mon Sep 17 00:00:00 2001 From: Aliaksandr Kuzmik Date: Fri, 6 Sep 2024 15:30:19 +0200 Subject: [PATCH 06/10] Refactor e2e tests, implement new feedback tests and experiment test --- sdks/python/examples/manual_chain_building.py | 4 +- sdks/python/src/opik/__init__.py | 2 + sdks/python/src/opik/evaluation/evaluator.py | 6 - sdks/python/tests/conftest.py | 2 +- sdks/python/tests/e2e/conftest.py | 29 +++- sdks/python/tests/e2e/test_dataset.py | 13 +- sdks/python/tests/e2e/test_experiment.py | 76 ++++++++++ sdks/python/tests/e2e/test_feedback_scores.py | 137 ++++++++++++++++++ .../tests/e2e/{verifiers => }/verifiers.py | 93 +++++++++++- sdks/python/tests/e2e/verifiers/__init__.py | 8 - 10 files changed, 340 insertions(+), 30 deletions(-) create mode 100644 sdks/python/tests/e2e/test_experiment.py create mode 100644 sdks/python/tests/e2e/test_feedback_scores.py rename sdks/python/tests/e2e/{verifiers => }/verifiers.py (52%) delete mode 100644 sdks/python/tests/e2e/verifiers/__init__.py diff --git a/sdks/python/examples/manual_chain_building.py b/sdks/python/examples/manual_chain_building.py index 515a6a3852..aa171cf9e9 100644 --- a/sdks/python/examples/manual_chain_building.py +++ b/sdks/python/examples/manual_chain_building.py @@ -1,8 +1,10 @@ import opik +import os +os.environ["OPIK_URL_OVERRIDE"] = "http://localhost:5173/api" client = opik.Opik() -trace = client.trace(name="trace-name") +trace = client.trace() span1 = trace.span(name="span-1") span2 = span1.span(name="span-2") span2.end() diff --git a/sdks/python/src/opik/__init__.py b/sdks/python/src/opik/__init__.py index 490468b1a4..9313338104 100644 --- a/sdks/python/src/opik/__init__.py +++ b/sdks/python/src/opik/__init__.py @@ -7,7 +7,9 @@ from . import _logging from . 
import package_version from .plugins.pytest.decorator import llm_unit +import os +os.environ["OPIK_FILE_LOGGING_LEVEL"] = "DEBUG" _logging.setup() __version__ = package_version.VERSION diff --git a/sdks/python/src/opik/evaluation/evaluator.py b/sdks/python/src/opik/evaluation/evaluator.py index 2582ba83f9..da44c5325b 100644 --- a/sdks/python/src/opik/evaluation/evaluator.py +++ b/sdks/python/src/opik/evaluation/evaluator.py @@ -17,7 +17,6 @@ def evaluate( experiment_name: str, verbose: int = 1, task_threads: int = 16, - scoring_threads: int = 16, ) -> List[test_result.TestResult]: """ Performs task evaluation on a given dataset. @@ -38,11 +37,6 @@ def evaluate( threads are created, all tasks executed in the current thread sequentially. are executed sequentially in the current thread. Use more than 1 worker if your task object is compatible with sharing across threads. - - scoring_threads: amount of thread workers to compute metric scores. If set to 1, - no additional threads are created, all metrics are computed in the - current thread sequentially. - Use more than 1 worker if your metrics are compatible with sharing across threads. """ client = opik_client.get_client_cached() start_time = time.time() diff --git a/sdks/python/tests/conftest.py b/sdks/python/tests/conftest.py index 8f388da27c..26358f8ba9 100644 --- a/sdks/python/tests/conftest.py +++ b/sdks/python/tests/conftest.py @@ -12,7 +12,7 @@ def clear_context_storage(): @pytest.fixture(autouse=True) -def shutdown_cached_client(): +def shutdown_cached_client_after_test(): yield if opik_client.get_client_cached.cache_info().currsize > 0: opik_client.get_client_cached().end() diff --git a/sdks/python/tests/e2e/conftest.py b/sdks/python/tests/e2e/conftest.py index 56618e48bc..bf792d3c82 100644 --- a/sdks/python/tests/e2e/conftest.py +++ b/sdks/python/tests/e2e/conftest.py @@ -1,18 +1,37 @@ import os +import random +import string +import opik import opik.api_objects.opik_client import pytest +def _random_chars(n: int = 6) -> str: + return "".join(random.choice(string.ascii_letters) for _ in range(n)) + + @pytest.fixture(scope="session") def configure_e2e_tests_env(): - os.environ["OPIK_PROJECT_NAME"] = "e2e-tests" + os.environ["OPIK_PROJECT_NAME"] = ( + "e2e-tests" # -{opik.__version__}-{_random_chars()}" + ) os.environ["OPIK_URL_OVERRIDE"] = "http://localhost:5173/api" -@pytest.fixture(scope="session") -def opik_client(configure_e2e_tests_env): - opik_client_ = opik.api_objects.opik_client.get_client_cached() +@pytest.fixture() +def opik_client(configure_e2e_tests_env, shutdown_cached_client_after_test): + opik_client_ = opik.api_objects.opik_client.Opik() + + yield opik_client_ + + opik_client_.end() + + +@pytest.fixture +def dataset_name(opik_client: opik.Opik): + name = f"e2e-tests-dataset-{ _random_chars()}" + yield name - return opik_client_ + opik_client.delete_dataset(name) diff --git a/sdks/python/tests/e2e/test_dataset.py b/sdks/python/tests/e2e/test_dataset.py index c1f4956793..cb4fe16bdc 100644 --- a/sdks/python/tests/e2e/test_dataset.py +++ b/sdks/python/tests/e2e/test_dataset.py @@ -1,17 +1,14 @@ import opik -import random -import string from . 
import verifiers from opik.api_objects.dataset import dataset_item -def test_create_and_populate_dataset__happyflow(opik_client: opik.Opik): - DATASET_NAME = "e2e-tests-dataset-".join( - random.choice(string.ascii_letters) for _ in range(6) - ) +def test_create_and_populate_dataset__happyflow( + opik_client: opik.Opik, dataset_name: str +): DESCRIPTION = "E2E test dataset" - dataset = opik_client.create_dataset(DATASET_NAME, description=DESCRIPTION) + dataset = opik_client.create_dataset(dataset_name, description=DESCRIPTION) dataset.insert( [ @@ -47,7 +44,7 @@ def test_create_and_populate_dataset__happyflow(opik_client: opik.Opik): verifiers.verify_dataset( opik_client=opik_client, - name=DATASET_NAME, + name=dataset_name, description=DESCRIPTION, dataset_items=EXPECTED_DATASET_ITEMS, ) diff --git a/sdks/python/tests/e2e/test_experiment.py b/sdks/python/tests/e2e/test_experiment.py new file mode 100644 index 0000000000..ab8f5f6ddc --- /dev/null +++ b/sdks/python/tests/e2e/test_experiment.py @@ -0,0 +1,76 @@ +import opik +import random +import string + +import opik.evaluation +from opik.api_objects.dataset import dataset_item +from opik.evaluation import metrics +import pytest + + +@pytest.fixture +def experiment_name(opik_client: opik.Opik): + name = "e2e-tests-experiment-".join( + random.choice(string.ascii_letters) for _ in range(6) + ) + yield name + + +def test_experiment_creation_via_evaluate_function__happyflow( + opik_client: opik.Opik, dataset_name: str, experiment_name: str +): + # TODO: this test is not finished, it only checks that the script is not failing + + dataset = opik_client.create_dataset(dataset_name) + + dataset.insert( + [ + { + "input": {"question": "What is the of capital of France?"}, + "expected_output": {"output": "Paris"}, + }, + { + "input": {"question": "What is the of capital of Germany?"}, + "expected_output": {"output": "Berlin"}, + }, + { + "input": {"question": "What is the of capital of Poland?"}, + "expected_output": {"output": "Warsaw"}, + }, + ] + ) + + def task(item: dataset_item.DatasetItem): + if item.input == {"question": "What is the of capital of France?"}: + return {"output": "Paris", "reference": item.expected_output["output"]} + if item.input == {"question": "What is the of capital of Germany?"}: + return {"output": "Berlin", "reference": item.expected_output["output"]} + if item.input == {"question": "What is the of capital of Poland?"}: + return {"output": "Krakow", "reference": item.expected_output["output"]} + + raise AssertionError( + f"Task received dataset item with an unexpected input: {item.input}" + ) + + equals_metric = metrics.Equals() + opik.evaluation.evaluate( + dataset=dataset, + task=task, + scoring_metrics=[equals_metric], + experiment_name=experiment_name, + ) + + # EXPECTED_DATASET_ITEMS = [ + # dataset_item.DatasetItem( + # input={"question": "What is the of capital of France?"}, + # expected_output={"output": "Paris"}, + # ), + # dataset_item.DatasetItem( + # input={"question": "What is the of capital of Germany?"}, + # expected_output={"output": "Berlin"}, + # ), + # dataset_item.DatasetItem( + # input={"question": "What is the of capital of Poland?"}, + # expected_output={"output": "Warsaw"}, + # ), + # ] diff --git a/sdks/python/tests/e2e/test_feedback_scores.py b/sdks/python/tests/e2e/test_feedback_scores.py new file mode 100644 index 0000000000..0a0f295d37 --- /dev/null +++ b/sdks/python/tests/e2e/test_feedback_scores.py @@ -0,0 +1,137 @@ +from typing import List +import opik +from opik.types import 
FeedbackScoreDict +from . import verifiers + + +def test_feedbacks_are_logged_via_trace_and_span__happyflow(opik_client: opik.Opik): + trace = opik_client.trace( + name="trace-name", + ) + trace.log_feedback_score( + "trace-metric-1", value=0.5, category_name="category-1", reason="some-reason-1" + ) + trace.log_feedback_score( + "trace-metric-2", value=0.95, category_name="category-2", reason="some-reason-2" + ) + + span = trace.span( + name="span-name", + ) + span.log_feedback_score( + "span-metric-1", value=0.75, category_name="category-3", reason="some-reason-3" + ) + span.log_feedback_score( + "span-metric-2", value=0.25, category_name="category-4", reason="some-reason-4" + ) + span.end() + trace.end() + + opik_client.flush() + + EXPECTED_TRACE_FEEDBACK_SCORES: List[FeedbackScoreDict] = [ + { + "id": trace.id, + "name": "trace-metric-1", + "value": 0.5, + "category_name": "category-1", + "reason": "some-reason-1", + }, + { + "id": trace.id, + "name": "trace-metric-2", + "value": 0.95, + "category_name": "category-2", + "reason": "some-reason-2", + }, + ] + + EXPECTED_SPAN_FEEDBACK_SCORES: List[FeedbackScoreDict] = [ + { + "id": span.id, + "name": "span-metric-1", + "value": 0.75, + "category_name": "category-3", + "reason": "some-reason-3", + }, + { + "id": span.id, + "name": "span-metric-2", + "value": 0.25, + "category_name": "category-4", + "reason": "some-reason-4", + }, + ] + verifiers.verify_trace( + opik_client=opik_client, + trace_id=trace.id, + name="trace-name", + feedback_scores=EXPECTED_TRACE_FEEDBACK_SCORES, + ) + verifiers.verify_span( + opik_client=opik_client, + span_id=span.id, + trace_id=span.trace_id, + parent_span_id=None, + name="span-name", + feedback_scores=EXPECTED_SPAN_FEEDBACK_SCORES, + ) + + +def test_feedbacks_are_logged_via_client__happyflow(opik_client: opik.Opik): + trace = opik_client.trace(name="trace-name-1") + span = trace.span(name="span-name-1") + + EXPECTED_TRACE_FEEDBACK_SCORES: List[FeedbackScoreDict] = [ + { + "id": trace.id, + "name": "trace-metric-1", + "value": 0.5, + "category_name": "category-1", + "reason": "some-reason-1", + }, + { + "id": trace.id, + "name": "trace-metric-2", + "value": 0.95, + "category_name": "category-2", + "reason": "some-reason-2", + }, + ] + + EXPECTED_SPAN_FEEDBACK_SCORES: List[FeedbackScoreDict] = [ + { + "id": span.id, + "name": "span-metric-1", + "value": 0.75, + "category_name": "category-3", + "reason": "some-reason-3", + }, + { + "id": span.id, + "name": "span-metric-2", + "value": 0.25, + "category_name": "category-4", + "reason": "some-reason-4", + }, + ] + + opik_client.log_spans_feedback_scores(scores=EXPECTED_SPAN_FEEDBACK_SCORES) + opik_client.log_traces_feedback_scores(scores=EXPECTED_TRACE_FEEDBACK_SCORES) + + opik_client.flush() + + verifiers.verify_trace( + opik_client=opik_client, + trace_id=trace.id, + name="trace-name-1", + feedback_scores=EXPECTED_TRACE_FEEDBACK_SCORES, + ) + verifiers.verify_span( + opik_client=opik_client, + span_id=span.id, + trace_id=span.trace_id, + parent_span_id=None, + name="span-name-1", + feedback_scores=EXPECTED_SPAN_FEEDBACK_SCORES, + ) diff --git a/sdks/python/tests/e2e/verifiers/verifiers.py b/sdks/python/tests/e2e/verifiers.py similarity index 52% rename from sdks/python/tests/e2e/verifiers/verifiers.py rename to sdks/python/tests/e2e/verifiers.py index ec3cfd6f13..2ffed04dba 100644 --- a/sdks/python/tests/e2e/verifiers/verifiers.py +++ b/sdks/python/tests/e2e/verifiers.py @@ -2,10 +2,11 @@ import opik import json +from opik.types import FeedbackScoreDict from 
opik.api_objects.dataset import dataset_item from opik import synchronization -from ... import testlib +from .. import testlib import mock @@ -17,6 +18,7 @@ def verify_trace( input: Dict[str, Any] = mock.ANY, # type: ignore output: Dict[str, Any] = mock.ANY, # type: ignore tags: List[str] = mock.ANY, # type: ignore + feedback_scores: List[FeedbackScoreDict] = mock.ANY, # type: ignore ): if not synchronization.until( lambda: (opik_client.get_trace_content(id=trace_id) is not None), @@ -36,6 +38,36 @@ def verify_trace( ) assert trace.tags == tags, testlib.prepare_difference_report(trace.tags, tags) + if feedback_scores is not mock.ANY: + actual_feedback_scores = ( + [] if trace.feedback_scores is None else trace.feedback_scores + ) + assert ( + len(actual_feedback_scores) == len(feedback_scores) + ), f"Expected amount of trace feedback scores ({len(feedback_scores)}) is not equal to actual amount ({len(actual_feedback_scores)})" + + actual_feedback_scores: List[FeedbackScoreDict] = [ + { + "category_name": score.category_name, + "id": trace_id, + "name": score.name, + "reason": score.reason, + "value": score.value, + } + for score in trace.feedback_scores + ] + + sorted_actual_feedback_scores = sorted( + actual_feedback_scores, key=lambda item: json.dumps(item, sort_keys=True) + ) + sorted_expected_feedback_scores = sorted( + feedback_scores, key=lambda item: json.dumps(item, sort_keys=True) + ) + for actual_score, expected_score in zip( + sorted_actual_feedback_scores, sorted_expected_feedback_scores + ): + testlib.assert_dicts_equal(actual_score, expected_score) + def verify_span( opik_client: opik.Opik, @@ -48,6 +80,7 @@ def verify_span( output: Dict[str, Any] = mock.ANY, # type: ignore tags: List[str] = mock.ANY, # type: ignore type: str = mock.ANY, # type: ignore + feedback_scores: List[FeedbackScoreDict] = mock.ANY, # type: ignore, ): if not synchronization.until( lambda: (opik_client.get_span_content(id=span_id) is not None), @@ -76,6 +109,36 @@ def verify_span( ) assert span.tags == tags, testlib.prepare_difference_report(span.tags, tags) + if feedback_scores is not mock.ANY: + actual_feedback_scores = ( + [] if span.feedback_scores is None else span.feedback_scores + ) + assert ( + len(actual_feedback_scores) == len(feedback_scores) + ), f"Expected amount of span feedback scores ({len(feedback_scores)}) is not equal to actual amount ({len(actual_feedback_scores)})" + + actual_feedback_scores: List[FeedbackScoreDict] = [ + { + "category_name": score.category_name, + "id": span_id, + "name": score.name, + "reason": score.reason, + "value": score.value, + } + for score in span.feedback_scores + ] + + sorted_actual_feedback_scores = sorted( + actual_feedback_scores, key=lambda item: json.dumps(item, sort_keys=True) + ) + sorted_expected_feedback_scores = sorted( + feedback_scores, key=lambda item: json.dumps(item, sort_keys=True) + ) + for actual_score, expected_score in zip( + sorted_actual_feedback_scores, sorted_expected_feedback_scores + ): + testlib.assert_dicts_equal(actual_score, expected_score) + def verify_dataset( opik_client: opik.Opik, @@ -109,3 +172,31 @@ def verify_dataset( for actual_item, expected_item in zip(sorted_actual_items, sorted_expected_items): testlib.assert_dicts_equal(actual_item, expected_item, ignore_keys=["id"]) + + +def verify_experiment( + opik_client: opik.Opik, + id: str, + experiment_name: str, + dataset_name: str, + feedback_scores_amount: int, + traces_amount: int, +): + rest_client = ( + opik_client._rest_client + ) # temporary solution until 
backend prepares proper endpoints + + rest_client.datasets.find_dataset_items_with_experiment_items + + if not synchronization.until( + lambda: (rest_client.experiments.get_experiment_by_id(id) is not None), + allow_errors=True, + ): + raise AssertionError(f"Failed to get experiment with id {id}.") + + experiment_content = rest_client.experiments.get_experiment_by_id(id) + + assert experiment_content.name == experiment_name + assert len(experiment_content.feedback_scores) == feedback_scores_amount + assert len(experiment_content.trace_count) == traces_amount + assert experiment_content.dataset_id == opik_client.get_dataset(dataset_name).name diff --git a/sdks/python/tests/e2e/verifiers/__init__.py b/sdks/python/tests/e2e/verifiers/__init__.py deleted file mode 100644 index b83846bb52..0000000000 --- a/sdks/python/tests/e2e/verifiers/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from .verifiers import verify_trace, verify_span, verify_dataset - - -__all__ = [ - "verify_trace", - "verify_span", - "verify_dataset", -] From 367ee7d311aab1d3e000a4b4655624b28c935f24 Mon Sep 17 00:00:00 2001 From: Aliaksandr Kuzmik Date: Fri, 6 Sep 2024 15:40:58 +0200 Subject: [PATCH 07/10] Remove debug statement from opik.__init__, add evaluate to __all__ --- sdks/python/src/opik/__init__.py | 4 +-- sdks/python/tests/e2e/verifiers.py | 52 +++++++++++++++--------------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/sdks/python/src/opik/__init__.py b/sdks/python/src/opik/__init__.py index 9313338104..9845bcc657 100644 --- a/sdks/python/src/opik/__init__.py +++ b/sdks/python/src/opik/__init__.py @@ -7,14 +7,14 @@ from . import _logging from . import package_version from .plugins.pytest.decorator import llm_unit -import os +from .evaluation import evaluate -os.environ["OPIK_FILE_LOGGING_LEVEL"] = "DEBUG" _logging.setup() __version__ = package_version.VERSION __all__ = [ "__version__", + "evaluate", "track", "flush_tracker", "Opik", diff --git a/sdks/python/tests/e2e/verifiers.py b/sdks/python/tests/e2e/verifiers.py index 2ffed04dba..04ee19c92f 100644 --- a/sdks/python/tests/e2e/verifiers.py +++ b/sdks/python/tests/e2e/verifiers.py @@ -174,29 +174,29 @@ def verify_dataset( testlib.assert_dicts_equal(actual_item, expected_item, ignore_keys=["id"]) -def verify_experiment( - opik_client: opik.Opik, - id: str, - experiment_name: str, - dataset_name: str, - feedback_scores_amount: int, - traces_amount: int, -): - rest_client = ( - opik_client._rest_client - ) # temporary solution until backend prepares proper endpoints - - rest_client.datasets.find_dataset_items_with_experiment_items - - if not synchronization.until( - lambda: (rest_client.experiments.get_experiment_by_id(id) is not None), - allow_errors=True, - ): - raise AssertionError(f"Failed to get experiment with id {id}.") - - experiment_content = rest_client.experiments.get_experiment_by_id(id) - - assert experiment_content.name == experiment_name - assert len(experiment_content.feedback_scores) == feedback_scores_amount - assert len(experiment_content.trace_count) == traces_amount - assert experiment_content.dataset_id == opik_client.get_dataset(dataset_name).name +# def verify_experiment( +# opik_client: opik.Opik, +# id: str, +# experiment_name: str, +# dataset_name: str, +# feedback_scores_amount: int, +# traces_amount: int, +# ): +# rest_client = ( +# opik_client._rest_client +# ) # temporary solution until backend prepares proper endpoints + +# rest_client.datasets.find_dataset_items_with_experiment_items + +# if not synchronization.until( +# 
lambda: (rest_client.experiments.get_experiment_by_id(id) is not None), +# allow_errors=True, +# ): +# raise AssertionError(f"Failed to get experiment with id {id}.") + +# experiment_content = rest_client.experiments.get_experiment_by_id(id) + +# assert experiment_content.name == experiment_name +# assert len(experiment_content.feedback_scores) == feedback_scores_amount +# assert len(experiment_content.trace_count) == traces_amount +# assert experiment_content.dataset_id == opik_client.get_dataset(dataset_name).name From 14c3def995fdd42ae495e60a0a29750f861d0bfe Mon Sep 17 00:00:00 2001 From: Aliaksandr Kuzmik Date: Fri, 6 Sep 2024 17:14:09 +0200 Subject: [PATCH 08/10] Update experiment e2e test, add EvaluationResult object as return value of evaluate --- .../opik/api_objects/experiment/experiment.py | 8 +-- .../src/opik/evaluation/evaluation_result.py | 12 ++++ sdks/python/src/opik/evaluation/evaluator.py | 12 +++- sdks/python/tests/e2e/conftest.py | 8 +++ sdks/python/tests/e2e/test_experiment.py | 27 ++++---- sdks/python/tests/e2e/verifiers.py | 66 +++++++++++-------- 6 files changed, 87 insertions(+), 46 deletions(-) create mode 100644 sdks/python/src/opik/evaluation/evaluation_result.py diff --git a/sdks/python/src/opik/api_objects/experiment/experiment.py b/sdks/python/src/opik/api_objects/experiment/experiment.py index 782da46eac..c27b0b9dae 100644 --- a/sdks/python/src/opik/api_objects/experiment/experiment.py +++ b/sdks/python/src/opik/api_objects/experiment/experiment.py @@ -18,16 +18,16 @@ def __init__( dataset_name: str, rest_client: rest_api_client.OpikApi, ) -> None: - self._id = id - self._name = name - self._dataset_name = dataset_name + self.id = id + self.name = name + self.dataset_name = dataset_name self._rest_client = rest_client def insert(self, experiment_items: List[experiment_item.ExperimentItem]) -> None: rest_experiment_items = [ rest_experiment_item.ExperimentItem( id=item.id if item.id is not None else helpers.generate_id(), - experiment_id=self._id, + experiment_id=self.id, dataset_item_id=item.dataset_item_id, trace_id=item.trace_id, ) diff --git a/sdks/python/src/opik/evaluation/evaluation_result.py b/sdks/python/src/opik/evaluation/evaluation_result.py new file mode 100644 index 0000000000..09a8108645 --- /dev/null +++ b/sdks/python/src/opik/evaluation/evaluation_result.py @@ -0,0 +1,12 @@ +from typing import List + +import dataclasses + +from . import test_result + + +@dataclasses.dataclass +class EvaluationResult: + experiment_id: str + experiment_name: str + test_results: List[test_result.TestResult] diff --git a/sdks/python/src/opik/evaluation/evaluator.py b/sdks/python/src/opik/evaluation/evaluator.py index da44c5325b..94c1e6dfff 100644 --- a/sdks/python/src/opik/evaluation/evaluator.py +++ b/sdks/python/src/opik/evaluation/evaluator.py @@ -7,7 +7,7 @@ from ..api_objects.experiment import experiment_item from ..api_objects import opik_client -from . import tasks_scorer, test_result, scores_logger, report +from . import tasks_scorer, scores_logger, report, evaluation_result def evaluate( @@ -17,7 +17,7 @@ def evaluate( experiment_name: str, verbose: int = 1, task_threads: int = 16, -) -> List[test_result.TestResult]: +) -> evaluation_result.EvaluationResult: """ Performs task evaluation on a given dataset. 
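# The hunk below changes evaluate() to return the new EvaluationResult object
# instead of a bare list of test results. A minimal consumer sketch, assuming
# `dataset` and `task` are already set up as in the e2e experiment test (the
# experiment name here is illustrative):
import opik
from opik.evaluation import metrics

evaluation_result = opik.evaluate(
    dataset=dataset,  # assumed: a dataset created via opik_client.create_dataset()
    task=task,  # assumed: a callable taking a DatasetItem, as in the e2e test
    scoring_metrics=[metrics.Equals()],
    experiment_name="my-experiment",
)
print(evaluation_result.experiment_id)  # backend id of the created experiment
print(evaluation_result.experiment_name)
print(len(evaluation_result.test_results))  # one TestResult per dataset item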
@@ -71,4 +71,10 @@ def evaluate( experiment.insert(experiment_items=experiment_items) client.flush() - return test_results + + evaluation_result_ = evaluation_result.EvaluationResult( + experiment_id=experiment.id, + experiment_name=experiment.name, + test_results=test_results, + ) + return evaluation_result_ diff --git a/sdks/python/tests/e2e/conftest.py b/sdks/python/tests/e2e/conftest.py index bf792d3c82..7380647ad6 100644 --- a/sdks/python/tests/e2e/conftest.py +++ b/sdks/python/tests/e2e/conftest.py @@ -35,3 +35,11 @@ def dataset_name(opik_client: opik.Opik): yield name opik_client.delete_dataset(name) + + +@pytest.fixture +def experiment_name(opik_client: opik.Opik): + name = f"e2e-tests-experiment-{ _random_chars()}" + yield name + + # TODO: delete the experiment diff --git a/sdks/python/tests/e2e/test_experiment.py b/sdks/python/tests/e2e/test_experiment.py index ab8f5f6ddc..cc4cdcbb1e 100644 --- a/sdks/python/tests/e2e/test_experiment.py +++ b/sdks/python/tests/e2e/test_experiment.py @@ -1,19 +1,8 @@ import opik -import random -import string -import opik.evaluation from opik.api_objects.dataset import dataset_item from opik.evaluation import metrics -import pytest - - -@pytest.fixture -def experiment_name(opik_client: opik.Opik): - name = "e2e-tests-experiment-".join( - random.choice(string.ascii_letters) for _ in range(6) - ) - yield name +from . import verifiers def test_experiment_creation_via_evaluate_function__happyflow( @@ -53,13 +42,25 @@ def task(item: dataset_item.DatasetItem): ) equals_metric = metrics.Equals() - opik.evaluation.evaluate( + evaluation_result = opik.evaluate( dataset=dataset, task=task, scoring_metrics=[equals_metric], experiment_name=experiment_name, ) + opik.flush_tracker() + + verifiers.verify_experiment( + opik_client=opik_client, + id=evaluation_result.experiment_id, + experiment_name=evaluation_result.experiment_name, + traces_amount=3, # one trace per dataset item + feedback_scores_amount=1, # an average value of all Equals metric scores + ) + + # TODO: check more content of the experiment + # # EXPECTED_DATASET_ITEMS = [ # dataset_item.DatasetItem( # input={"question": "What is the of capital of France?"}, diff --git a/sdks/python/tests/e2e/verifiers.py b/sdks/python/tests/e2e/verifiers.py index 04ee19c92f..ea9dae88f4 100644 --- a/sdks/python/tests/e2e/verifiers.py +++ b/sdks/python/tests/e2e/verifiers.py @@ -174,29 +174,43 @@ def verify_dataset( testlib.assert_dicts_equal(actual_item, expected_item, ignore_keys=["id"]) -# def verify_experiment( -# opik_client: opik.Opik, -# id: str, -# experiment_name: str, -# dataset_name: str, -# feedback_scores_amount: int, -# traces_amount: int, -# ): -# rest_client = ( -# opik_client._rest_client -# ) # temporary solution until backend prepares proper endpoints - -# rest_client.datasets.find_dataset_items_with_experiment_items - -# if not synchronization.until( -# lambda: (rest_client.experiments.get_experiment_by_id(id) is not None), -# allow_errors=True, -# ): -# raise AssertionError(f"Failed to get experiment with id {id}.") - -# experiment_content = rest_client.experiments.get_experiment_by_id(id) - -# assert experiment_content.name == experiment_name -# assert len(experiment_content.feedback_scores) == feedback_scores_amount -# assert len(experiment_content.trace_count) == traces_amount -# assert experiment_content.dataset_id == opik_client.get_dataset(dataset_name).name +def verify_experiment( + opik_client: opik.Opik, + id: str, + experiment_name: str, + feedback_scores_amount: int, + traces_amount: 
int, +): + rest_client = ( + opik_client._rest_client + ) # temporary solution until backend prepares proper endpoints + + rest_client.datasets.find_dataset_items_with_experiment_items + + if not synchronization.until( + lambda: (rest_client.experiments.get_experiment_by_id(id) is not None), + allow_errors=True, + ): + raise AssertionError(f"Failed to get experiment with id {id}.") + + experiment_content = rest_client.experiments.get_experiment_by_id(id) + + assert ( + experiment_content.name == experiment_name + ), f"{experiment_content.name} != {experiment_name}" + + actual_scores_count = ( + 0 + if experiment_content.feedback_scores is None + else len(experiment_content.feedback_scores) + ) + assert ( + actual_scores_count == feedback_scores_amount + ), f"{actual_scores_count} != {feedback_scores_amount}" + + actual_trace_count = ( + 0 if experiment_content.trace_count is None else experiment_content.trace_count + ) + assert ( + actual_trace_count == traces_amount + ), f"{actual_trace_count} != {traces_amount}" From 872aeaf1444ef7f19496f82279a0b490403c41c7 Mon Sep 17 00:00:00 2001 From: Aliaksandr Kuzmik Date: Fri, 6 Sep 2024 17:15:31 +0200 Subject: [PATCH 09/10] Make e2e tests run against any configured backend --- sdks/python/tests/e2e/conftest.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sdks/python/tests/e2e/conftest.py b/sdks/python/tests/e2e/conftest.py index 7380647ad6..ef229c485c 100644 --- a/sdks/python/tests/e2e/conftest.py +++ b/sdks/python/tests/e2e/conftest.py @@ -14,10 +14,8 @@ def _random_chars(n: int = 6) -> str: @pytest.fixture(scope="session") def configure_e2e_tests_env(): - os.environ["OPIK_PROJECT_NAME"] = ( - "e2e-tests" # -{opik.__version__}-{_random_chars()}" - ) - os.environ["OPIK_URL_OVERRIDE"] = "http://localhost:5173/api" + os.environ["OPIK_PROJECT_NAME"] = "e2e-tests" + #os.environ["OPIK_URL_OVERRIDE"] = "http://localhost:5173/api" @pytest.fixture() From 57b9468558386c959adb3d49cb62d16533f20fa6 Mon Sep 17 00:00:00 2001 From: Aliaksandr Kuzmik Date: Fri, 6 Sep 2024 17:33:53 +0200 Subject: [PATCH 10/10] Fix import error --- sdks/python/tests/e2e/conftest.py | 2 +- sdks/python/tests/unit/evaluation/test_evaluate.py | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/sdks/python/tests/e2e/conftest.py b/sdks/python/tests/e2e/conftest.py index ef229c485c..ebdbe52262 100644 --- a/sdks/python/tests/e2e/conftest.py +++ b/sdks/python/tests/e2e/conftest.py @@ -15,7 +15,7 @@ def configure_e2e_tests_env(): os.environ["OPIK_PROJECT_NAME"] = "e2e-tests" - #os.environ["OPIK_URL_OVERRIDE"] = "http://localhost:5173/api" + # os.environ["OPIK_URL_OVERRIDE"] = "http://localhost:5173/api" @pytest.fixture() diff --git a/sdks/python/tests/unit/evaluation/test_evaluate.py b/sdks/python/tests/unit/evaluation/test_evaluate.py index 584f6e9194..3a4f52556f 100644 --- a/sdks/python/tests/unit/evaluation/test_evaluate.py +++ b/sdks/python/tests/unit/evaluation/test_evaluate.py @@ -3,12 +3,10 @@ from opik.api_objects import opik_client from opik import evaluation from opik.evaluation import metrics -from ...testlib import backend_emulator_message_processor +from ...testlib import backend_emulator_message_processor, ANY_BUT_NONE, assert_equal from ...testlib.models import ( TraceModel, FeedbackScoreModel, - ANY_BUT_NONE, - assert_equal, ) from opik.message_processing import streamer_constructors @@ -70,7 +68,6 @@ def say_task(dataset_item: dataset_item.DatasetItem):
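# The e2e verifiers above poll the backend before asserting, because logged
# data becomes readable asynchronously. A sketch of the same pattern for any
# eventually-consistent read, assuming a configured Opik client and an
# existing trace id:
import opik
from opik import synchronization

def wait_for_trace(opik_client: opik.Opik, trace_id: str) -> None:
    # Retry the read until it succeeds or the helper's default timeout elapses;
    # allow_errors=True swallows transient errors while the data is still in flight.
    if not synchronization.until(
        lambda: opik_client.get_trace_content(id=trace_id) is not None,
        allow_errors=True,
    ):
        raise AssertionError(f"Failed to get trace with id {trace_id}.")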
experiment_name="the-experiment-name", scoring_metrics=[metrics.Equals()], task_threads=1, - scoring_threads=1, ) mock_create_experiment.assert_called_once_with(