From ccdd0f3d2438000b8975c0d4aa6a5841b7f700d0 Mon Sep 17 00:00:00 2001
From: Andres Cruz
Date: Wed, 18 Dec 2024 17:29:41 +0100
Subject: [PATCH 1/2] OPIK-556: Add embedded python service POC

---
 .../opik-embedded-python-backend/README.md    | 23 ++
 .../requirements.txt                          | 61 +++++
 .../src/opik_embedded_backend/__init__.py     | 21 ++
 .../src/opik_embedded_backend/evaluator.py    | 88 +++++++
 .../helpers/id_helpers.py                     |  5 +
 .../tests/conftest.py                         | 27 ++
 .../tests/test_evaluator.py                   | 246 ++++++++++++++++++
 .../tests/test_requirements.txt               |  1 +
 8 files changed, 472 insertions(+)
 create mode 100644 apps/opik-backend/opik-embedded-python-backend/README.md
 create mode 100644 apps/opik-backend/opik-embedded-python-backend/requirements.txt
 create mode 100644 apps/opik-backend/opik-embedded-python-backend/src/opik_embedded_backend/__init__.py
 create mode 100644 apps/opik-backend/opik-embedded-python-backend/src/opik_embedded_backend/evaluator.py
 create mode 100644 apps/opik-backend/opik-embedded-python-backend/src/opik_embedded_backend/helpers/id_helpers.py
 create mode 100644 apps/opik-backend/opik-embedded-python-backend/tests/conftest.py
 create mode 100644 apps/opik-backend/opik-embedded-python-backend/tests/test_evaluator.py
 create mode 100644 apps/opik-backend/opik-embedded-python-backend/tests/test_requirements.txt

diff --git a/apps/opik-backend/opik-embedded-python-backend/README.md b/apps/opik-backend/opik-embedded-python-backend/README.md
new file mode 100644
index 0000000000..f0f5c2b942
--- /dev/null
+++ b/apps/opik-backend/opik-embedded-python-backend/README.md
@@ -0,0 +1,23 @@
+# Opik Embedded Python Backend
+
+## Requirements
+
+- Install Python: at least the minimum version compatible with the Opik Python SDK.
+- Create and activate a Python virtual environment.
+- Install all dependencies from `requirements.txt`.
+- For running tests, also install dependencies from `tests/test_requirements.txt`.
+
+## Running the Flask service
+
+> [!TIP]
+> Run in debug mode for development purposes; it reloads the code automatically.
+
+- From the `apps/opik-backend/opik-embedded-python-backend` directory.
+- Run the `opik_embedded_backend` module.
+- Debug mode is enabled with `--debug`.
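+
+Before the first run, set up the environment (a minimal sketch of the Requirements above; it assumes `python3` on a POSIX shell):
+
+```bash
+# Create and activate a virtual environment, then install the dependencies.
+python3 -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+pip install -r tests/test_requirements.txt
+```
+
+Then start the service: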
+
+```bash
+flask --app src/opik_embedded_backend --debug run
+```
+
+Service is reachable at: `http://localhost:5000`
diff --git a/apps/opik-backend/opik-embedded-python-backend/requirements.txt b/apps/opik-backend/opik-embedded-python-backend/requirements.txt
new file mode 100644
index 0000000000..83ccf1f693
--- /dev/null
+++ b/apps/opik-backend/opik-embedded-python-backend/requirements.txt
@@ -0,0 +1,61 @@
+aiohappyeyeballs==2.4.4
+aiohttp==3.11.11
+aiosignal==1.3.2
+annotated-types==0.7.0
+anyio==4.7.0
+attrs==24.3.0
+blinker==1.9.0
+certifi==2024.12.14
+charset-normalizer==3.4.0
+click==8.1.7
+distro==1.9.0
+filelock==3.16.1
+Flask==3.1.0
+frozenlist==1.5.0
+fsspec==2024.12.0
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.27.2
+huggingface-hub==0.27.0
+idna==3.10
+importlib_metadata==8.5.0
+iniconfig==2.0.0
+itsdangerous==2.2.0
+Jinja2==3.1.4
+jiter==0.8.2
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+Levenshtein==0.26.1
+litellm==1.55.7
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+mdurl==0.1.2
+multidict==6.1.0
+openai==1.58.1
+opik==1.3.0
+packaging==24.2
+pluggy==1.5.0
+propcache==0.2.1
+pydantic==2.10.4
+pydantic-settings==2.7.0
+pydantic_core==2.27.2
+Pygments==2.18.0
+python-dotenv==1.0.1
+PyYAML==6.0.2
+RapidFuzz==3.11.0
+referencing==0.35.1
+regex==2024.11.6
+requests==2.32.3
+rich==13.9.4
+rpds-py==0.22.3
+sniffio==1.3.1
+tenacity==9.0.0
+tiktoken==0.8.0
+tokenizers==0.21.0
+tqdm==4.67.1
+typing_extensions==4.12.2
+urllib3==2.2.3
+uuid7==0.1.0
+Werkzeug==3.1.3
+yarl==1.18.3
+zipp==3.21.0
diff --git a/apps/opik-backend/opik-embedded-python-backend/src/opik_embedded_backend/__init__.py b/apps/opik-backend/opik-embedded-python-backend/src/opik_embedded_backend/__init__.py
new file mode 100644
index 0000000000..c890aed9a5
--- /dev/null
+++ b/apps/opik-backend/opik-embedded-python-backend/src/opik_embedded_backend/__init__.py
@@ -0,0 +1,21 @@
+import logging
+
+from flask import Flask
+
+logger = logging.getLogger(__name__)
+
+
+def create_app(test_config=None):
+    app = Flask(__name__, instance_relative_config=True)
+
+    if test_config is None:
+        # load the instance config, if it exists, when not testing
+        app.config.from_pyfile('config.py', silent=True)
+    else:
+        # load the test config if passed in
+        app.config.from_mapping(test_config)
+
+    from opik_embedded_backend.evaluator import evaluator
+    app.register_blueprint(evaluator)
+
+    return app
diff --git a/apps/opik-backend/opik-embedded-python-backend/src/opik_embedded_backend/evaluator.py b/apps/opik-backend/opik-embedded-python-backend/src/opik_embedded_backend/evaluator.py
new file mode 100644
index 0000000000..3b1f81e032
--- /dev/null
+++ b/apps/opik-backend/opik-embedded-python-backend/src/opik_embedded_backend/evaluator.py
@@ -0,0 +1,88 @@
+import inspect
+from types import ModuleType
+from typing import Optional, Type, Union, List, Any, Dict
+
+from flask import request, abort, jsonify, Blueprint, current_app
+from opik.evaluation.metrics import BaseMetric
+from opik.evaluation.metrics.score_result import ScoreResult
+from werkzeug.exceptions import HTTPException
+
+from .helpers.id_helpers import uuid4_str
+
+evaluator = Blueprint('evaluator', __name__, url_prefix='/v1/private/evaluators')
+
+
+def get_module(code: str, module_name: Optional[str] = None) -> ModuleType:
+    # Generate the random module name inside the function: a default argument
+    # would be evaluated only once, at definition time, and shared by every call.
+    if module_name is None:
+        module_name = uuid4_str()
+    module: ModuleType = ModuleType(module_name)
+    # Execute the user-provided code inside the fresh module's namespace.
+    exec(code, module.__dict__)
+    return module
+
+
+def get_metric_class(module: ModuleType) -> Optional[Type[BaseMetric]]:
+    # Return the first proper subclass of BaseMetric found in the module, or None.
+    for _, cls in inspect.getmembers(module, inspect.isclass):
+        if issubclass(cls, BaseMetric) and cls is not BaseMetric:
+            return cls
+    return None
+
+
+def evaluate_metric(metric_class: Type[BaseMetric], data: Dict[Any, Any]) -> Union[ScoreResult, List[ScoreResult]]:
+    base_metric: BaseMetric = metric_class()
+    return base_metric.score(**data)
+
+
+def to_scores(score_result: Union[ScoreResult, List[ScoreResult]]) -> List[ScoreResult]:
+    scores: List[ScoreResult] = []
+    if isinstance(score_result, ScoreResult):
+        scores = [score_result]
+    elif isinstance(score_result, list):
+        for item in score_result:
+            if isinstance(item, ScoreResult):
+                scores.append(item)
+    return scores
+
+
+@evaluator.errorhandler(400)
+def bad_request(exception: HTTPException):
+    return jsonify(error=str(exception)), 400
+
+
+@evaluator.route("", methods=["POST"])
+def execute_evaluator():
+    if request.method != "POST":
+        return
+
+    payload: Any = request.get_json(force=True)
+
+    code: str = payload.get("code")
+    if code is None:
+        abort(400, "Field 'code' is missing in the request")
+
+    data: Dict[Any, Any] = payload.get("data")
+    if data is None:
+        abort(400, "Field 'data' is missing in the request")
+
+    try:
+        module: ModuleType = get_module(code)
+        metric_class: Optional[Type[BaseMetric]] = get_metric_class(module)
+    except Exception as exception:
+        current_app.logger.info("Exception getting metric class, message '%s', code '%s'", exception, code)
+        abort(400, "Field 'code' contains invalid Python code")
+
+    if metric_class is None:
+        current_app.logger.info("Missing BaseMetric in code '%s'", code)
+        abort(400,
+              "Field 'code' in the request doesn't contain a subclass implementation of 'opik.evaluation.metrics.BaseMetric'")
+
+    score_result: Union[ScoreResult, List[ScoreResult]] = []
+    try:
+        score_result = evaluate_metric(metric_class, data)
+    except Exception as exception:
+        current_app.logger.info("Exception evaluating metric, message '%s', data '%s', code '%s'",
+                                exception, data, code)
+        abort(400, "The provided 'code' and 'data' fields can't be evaluated")
+
+    scores: List[ScoreResult] = to_scores(score_result)
+    if len(scores) == 0:
+        current_app.logger.info("Missing ScoreResult in code '%s'", code)
+        abort(400, "The provided 'code' field didn't return any 'opik.evaluation.metrics.ScoreResult'")
+
+    return jsonify({"scores": scores})
diff --git a/apps/opik-backend/opik-embedded-python-backend/src/opik_embedded_backend/helpers/id_helpers.py b/apps/opik-backend/opik-embedded-python-backend/src/opik_embedded_backend/helpers/id_helpers.py
new file mode 100644
index 0000000000..56b8cb476b
--- /dev/null
+++ b/apps/opik-backend/opik-embedded-python-backend/src/opik_embedded_backend/helpers/id_helpers.py
@@ -0,0 +1,5 @@
+import uuid
+
+
+def uuid4_str() -> str:
+    return str(uuid.uuid4())
diff --git a/apps/opik-backend/opik-embedded-python-backend/tests/conftest.py b/apps/opik-backend/opik-embedded-python-backend/tests/conftest.py
new file mode 100644
index 0000000000..5ac958340b
--- /dev/null
+++ b/apps/opik-backend/opik-embedded-python-backend/tests/conftest.py
@@ -0,0 +1,27 @@
+import pytest
+
+from opik_embedded_backend import create_app
+
+
+@pytest.fixture()
+def app():
+    app = create_app()
+    app.config.update({
+        "TESTING": True,
+    })
+
+    # other setup can go here
+
+    yield app
+
+    # clean up / reset resources here
+
+
+@pytest.fixture()
+def client(app):
+    return app.test_client()
+
+
+@pytest.fixture()
+def runner(app):
+    return app.test_cli_runner()
diff --git a/apps/opik-backend/opik-embedded-python-backend/tests/test_evaluator.py b/apps/opik-backend/opik-embedded-python-backend/tests/test_evaluator.py
new file mode 100644
index 0000000000..e86d67d777
--- /dev/null
+++ 
b/apps/opik-backend/opik-embedded-python-backend/tests/test_evaluator.py @@ -0,0 +1,246 @@ +import pytest +from opik.evaluation.metrics.score_result import ScoreResult + +EVALUATORS_URL = "/v1/private/evaluators" + +USER_DEFINED_METRIC = """ +from typing import Any + +from opik.evaluation.metrics import base_metric, score_result + + +class UserDefinedEquals(base_metric.BaseMetric): + def __init__( + self, + name: str = "user_defined_equals_metric", + ): + super().__init__( + name=name, + track=False, + ) + + def score( + self, output: str, reference: str, **ignored_kwargs: Any + ) -> score_result.ScoreResult: + value = 1.0 if output == reference else 0.0 + return score_result.ScoreResult(value=value, name=self.name) +""" + +LIST_RESPONSE_METRIC = """ +from typing import Any + +from opik.evaluation.metrics import base_metric, score_result + + +class UserDefinedEquals(base_metric.BaseMetric): + def __init__( + self, + name: str = "user_defined_equals_metric", + ): + super().__init__( + name=name, + track=False, + ) + + def score( + self, output: str, reference: str, **ignored_kwargs: Any + ) -> score_result.ScoreResult: + value = 1.0 if output == reference else 0.0 + return [score_result.ScoreResult(value=value, name=self.name), score_result.ScoreResult(value=0.5, name=self.name)] +""" + +INVALID_METRIC = """ +from typing import + +from opik.evaluation.metrics import base_metric, score_result + + +class UserDefinedEquals(base_metric.BaseMetric): + def __init__( + self, + name: str = "user_defined_equals_metric", + ): + super().__init__( + name=name, + track=False, + ) + + def score( + self, output: str, reference: str, **ignored_kwargs: Any + ) -> score_result.ScoreResult: + value = 1.0 if output == reference else 0.0 + return score_result.ScoreResult(value=value, name=self.name) +""" + +MISSING_BASE_METRIC = """ +from typing import Any + +from opik.evaluation.metrics import base_metric, score_result + + +class UserDefinedEquals(): + def __init__( + self, + name: str = "user_defined_equals_metric", + ): + super().__init__( + name=name, + track=False, + ) + + def score( + self, output: str, reference: str, **ignored_kwargs: Any + ) -> score_result.ScoreResult: + value = 1.0 if output == reference else 0.0 + return score_result.ScoreResult(value=value, name=self.name) +""" + +SCORE_EXCEPTION_METRIC = """ +from typing import Any + +from opik.evaluation.metrics import base_metric, score_result + + +class UserDefinedEquals(base_metric.BaseMetric): + def __init__( + self, + name: str = "user_defined_equals_metric", + ): + super().__init__( + name=name, + track=False, + ) + + def score( + self, output: str, reference: str, **ignored_kwargs: Any + ) -> score_result.ScoreResult: + raise Exception("Exception while scoring") +""" + +MISSING_SCORE_METRIC = """ +from typing import Any + +from opik.evaluation.metrics import base_metric, score_result + + +class UserDefinedEquals(base_metric.BaseMetric): + def __init__( + self, + name: str = "user_defined_equals_metric", + ): + super().__init__( + name=name, + track=False, + ) + + def score( + self, output: str, reference: str, **ignored_kwargs: Any + ) -> score_result.ScoreResult: + return None +""" + +DATA = { + "output": "abc", + "reference": "abc" +} + + +@pytest.mark.parametrize("data,code, expected", [ + ( + DATA, + USER_DEFINED_METRIC, + [ScoreResult(name="user_defined_equals_metric", value=1.0).__dict__]), + ( + {"output": "abc", "reference": "ab"}, + USER_DEFINED_METRIC, + [ScoreResult(name="user_defined_equals_metric", value=0.0).__dict__]), + ( + 
DATA, + LIST_RESPONSE_METRIC, + [ + ScoreResult(name="user_defined_equals_metric", value=1.0).__dict__, + ScoreResult(name="user_defined_equals_metric", value=0.5).__dict__, + ] + ), + +]) +def test_success(client, data, code, expected): + response = client.post(EVALUATORS_URL, json={ + "data": data, + "code": code + }) + + assert response.status_code == 200 + assert response.json['scores'] == expected + + +def test_other_method_returns_method_not_allowed(client): + response = client.get(EVALUATORS_URL) + assert response.status_code == 405 + + +def test_options_method_returns_ok(client): + response = client.options(EVALUATORS_URL) + assert response.status_code == 200 + assert response.get_json() is None + + +def test_missing_request_returns_bad_request(client): + response = client.post(EVALUATORS_URL, json=None) + assert response.status_code == 400 + assert response.json[ + "error"] == "400 Bad Request: The browser (or proxy) sent a request that this server could not understand." + + +def test_missing_code_returns_bad_request(client): + response = client.post(EVALUATORS_URL, json={ + "data": DATA + }) + assert response.status_code == 400 + assert response.json["error"] == "400 Bad Request: Field 'code' is missing in the request" + + +def test_missing_data_returns_bad_request(client): + response = client.post(EVALUATORS_URL, json={ + "code": USER_DEFINED_METRIC + }) + assert response.status_code == 400 + assert response.json["error"] == "400 Bad Request: Field 'data' is missing in the request" + + +def test_invalid_code_returns_bad_request(client): + response = client.post(EVALUATORS_URL, json={ + "data": DATA, + "code": INVALID_METRIC + }) + assert response.status_code == 400 + assert response.json["error"] == "400 Bad Request: Field 'code' contains invalid Python code" + + +def test_missing_metric_returns_bad_request(client): + response = client.post(EVALUATORS_URL, json={ + "data": DATA, + "code": MISSING_BASE_METRIC + }) + assert response.status_code == 400 + assert response.json[ + "error"] == "400 Bad Request: Field 'code' in the request doesn't contain a subclass implementation of 'opik.evaluation.metrics.BaseMetric'" + + +def test_evaluation_exception_returns_bad_request(client): + response = client.post(EVALUATORS_URL, json={ + "data": DATA, + "code": SCORE_EXCEPTION_METRIC + }) + assert response.status_code == 400 + assert response.json["error"] == "400 Bad Request: The provided 'code' and 'data' fields can't be evaluated" + + +def test_no_scores_returns_bad_request(client): + response = client.post(EVALUATORS_URL, json={ + "data": DATA, + "code": MISSING_SCORE_METRIC + }) + assert response.status_code == 400 + assert response.json[ + "error"] == "400 Bad Request: The provided 'code' field didn't return any 'opik.evaluation.metrics.ScoreResult'" diff --git a/apps/opik-backend/opik-embedded-python-backend/tests/test_requirements.txt b/apps/opik-backend/opik-embedded-python-backend/tests/test_requirements.txt new file mode 100644 index 0000000000..d197ada2ff --- /dev/null +++ b/apps/opik-backend/opik-embedded-python-backend/tests/test_requirements.txt @@ -0,0 +1 @@ +pytest==8.3.4 From 0588c52b01317301809e24ddaf619a13ed4b7dc7 Mon Sep 17 00:00:00 2001 From: Andres Cruz Date: Tue, 24 Dec 2024 09:23:33 +0100 Subject: [PATCH 2/2] Rev2: moved as top level app --- .../README.md | 8 ++++---- .../requirements.txt | 0 .../src/opik_backend}/__init__.py | 6 +----- .../src/opik_backend}/evaluator.py | 0 .../src/opik_backend/helpers/__init__.py | 0 .../src/opik_backend}/helpers/id_helpers.py | 0 
 .../tests/conftest.py                         |  2 +-
 .../tests/test_evaluator.py                   | 10 +++++-----
 .../tests/test_requirements.txt               |  0
 9 files changed, 11 insertions(+), 15 deletions(-)
 rename apps/{opik-backend/opik-embedded-python-backend => opik-python-backend}/README.md (72%)
 rename apps/{opik-backend/opik-embedded-python-backend => opik-python-backend}/requirements.txt (100%)
 rename apps/{opik-backend/opik-embedded-python-backend/src/opik_embedded_backend => opik-python-backend/src/opik_backend}/__init__.py (79%)
 rename apps/{opik-backend/opik-embedded-python-backend/src/opik_embedded_backend => opik-python-backend/src/opik_backend}/evaluator.py (100%)
 create mode 100644 apps/opik-python-backend/src/opik_backend/helpers/__init__.py
 rename apps/{opik-backend/opik-embedded-python-backend/src/opik_embedded_backend => opik-python-backend/src/opik_backend}/helpers/id_helpers.py (100%)
 rename apps/{opik-backend/opik-embedded-python-backend => opik-python-backend}/tests/conftest.py (88%)
 rename apps/{opik-backend/opik-embedded-python-backend => opik-python-backend}/tests/test_evaluator.py (100%)
 rename apps/{opik-backend/opik-embedded-python-backend => opik-python-backend}/tests/test_requirements.txt (100%)

diff --git a/apps/opik-backend/opik-embedded-python-backend/README.md b/apps/opik-python-backend/README.md
similarity index 72%
rename from apps/opik-backend/opik-embedded-python-backend/README.md
rename to apps/opik-python-backend/README.md
index f0f5c2b942..9a43dbe366 100644
--- a/apps/opik-backend/opik-embedded-python-backend/README.md
+++ b/apps/opik-python-backend/README.md
@@ -1,4 +1,4 @@
-# Opik Embedded Python Backend
+# Opik Python Backend
 
 ## Requirements
 
@@ -12,12 +12,12 @@
 > [!TIP]
 > Run in debug mode for development purposes; it reloads the code automatically.
 
-- From the `apps/opik-backend/opik-embedded-python-backend` directory.
+- From the `apps/opik-python-backend` directory.
-- Run the `opik_embedded_backend` module.
+- Run the `opik_backend` module.
 - Debug mode is enabled with `--debug`.
```bash -flask --app src/opik_embedded_backend --debug run +flask --app src/opik_backend --debug run ``` Service is reachable at: `http://localhost:5000` diff --git a/apps/opik-backend/opik-embedded-python-backend/requirements.txt b/apps/opik-python-backend/requirements.txt similarity index 100% rename from apps/opik-backend/opik-embedded-python-backend/requirements.txt rename to apps/opik-python-backend/requirements.txt diff --git a/apps/opik-backend/opik-embedded-python-backend/src/opik_embedded_backend/__init__.py b/apps/opik-python-backend/src/opik_backend/__init__.py similarity index 79% rename from apps/opik-backend/opik-embedded-python-backend/src/opik_embedded_backend/__init__.py rename to apps/opik-python-backend/src/opik_backend/__init__.py index c890aed9a5..3482ee389a 100644 --- a/apps/opik-backend/opik-embedded-python-backend/src/opik_embedded_backend/__init__.py +++ b/apps/opik-python-backend/src/opik_backend/__init__.py @@ -1,9 +1,5 @@ -import logging - from flask import Flask -logger = logging.getLogger(__name__) - def create_app(test_config=None): app = Flask(__name__, instance_relative_config=True) @@ -15,7 +11,7 @@ def create_app(test_config=None): # load the test config if passed in app.config.from_mapping(test_config) - from opik_embedded_backend.evaluator import evaluator + from opik_backend.evaluator import evaluator app.register_blueprint(evaluator) return app diff --git a/apps/opik-backend/opik-embedded-python-backend/src/opik_embedded_backend/evaluator.py b/apps/opik-python-backend/src/opik_backend/evaluator.py similarity index 100% rename from apps/opik-backend/opik-embedded-python-backend/src/opik_embedded_backend/evaluator.py rename to apps/opik-python-backend/src/opik_backend/evaluator.py diff --git a/apps/opik-python-backend/src/opik_backend/helpers/__init__.py b/apps/opik-python-backend/src/opik_backend/helpers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/apps/opik-backend/opik-embedded-python-backend/src/opik_embedded_backend/helpers/id_helpers.py b/apps/opik-python-backend/src/opik_backend/helpers/id_helpers.py similarity index 100% rename from apps/opik-backend/opik-embedded-python-backend/src/opik_embedded_backend/helpers/id_helpers.py rename to apps/opik-python-backend/src/opik_backend/helpers/id_helpers.py diff --git a/apps/opik-backend/opik-embedded-python-backend/tests/conftest.py b/apps/opik-python-backend/tests/conftest.py similarity index 88% rename from apps/opik-backend/opik-embedded-python-backend/tests/conftest.py rename to apps/opik-python-backend/tests/conftest.py index 5ac958340b..86d5b3c8e2 100644 --- a/apps/opik-backend/opik-embedded-python-backend/tests/conftest.py +++ b/apps/opik-python-backend/tests/conftest.py @@ -1,6 +1,6 @@ import pytest -from opik_embedded_backend import create_app +from opik_backend import create_app @pytest.fixture() diff --git a/apps/opik-backend/opik-embedded-python-backend/tests/test_evaluator.py b/apps/opik-python-backend/tests/test_evaluator.py similarity index 100% rename from apps/opik-backend/opik-embedded-python-backend/tests/test_evaluator.py rename to apps/opik-python-backend/tests/test_evaluator.py index e86d67d777..103600d707 100644 --- a/apps/opik-backend/opik-embedded-python-backend/tests/test_evaluator.py +++ b/apps/opik-python-backend/tests/test_evaluator.py @@ -174,17 +174,17 @@ def test_success(client, data, code, expected): assert response.json['scores'] == expected -def test_other_method_returns_method_not_allowed(client): - response = client.get(EVALUATORS_URL) - 
assert response.status_code == 405 - - def test_options_method_returns_ok(client): response = client.options(EVALUATORS_URL) assert response.status_code == 200 assert response.get_json() is None +def test_other_method_returns_method_not_allowed(client): + response = client.get(EVALUATORS_URL) + assert response.status_code == 405 + + def test_missing_request_returns_bad_request(client): response = client.post(EVALUATORS_URL, json=None) assert response.status_code == 400 diff --git a/apps/opik-backend/opik-embedded-python-backend/tests/test_requirements.txt b/apps/opik-python-backend/tests/test_requirements.txt similarity index 100% rename from apps/opik-backend/opik-embedded-python-backend/tests/test_requirements.txt rename to apps/opik-python-backend/tests/test_requirements.txt
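For illustration, calling the new endpoint could look like the sketch below. It assumes the service is running locally on port 5000 (as per the README) and uses the `requests` package, which is already pinned in `requirements.txt`; the metric code is the `USER_DEFINED_METRIC` fixture from `tests/test_evaluator.py`.

```python
import requests

# The USER_DEFINED_METRIC fixture from tests/test_evaluator.py, verbatim.
METRIC_CODE = '''
from typing import Any

from opik.evaluation.metrics import base_metric, score_result


class UserDefinedEquals(base_metric.BaseMetric):
    def __init__(
        self,
        name: str = "user_defined_equals_metric",
    ):
        super().__init__(
            name=name,
            track=False,
        )

    def score(
        self, output: str, reference: str, **ignored_kwargs: Any
    ) -> score_result.ScoreResult:
        value = 1.0 if output == reference else 0.0
        return score_result.ScoreResult(value=value, name=self.name)
'''

# POST the metric code plus the inputs to score; the service replies with the scores.
response = requests.post(
    "http://localhost:5000/v1/private/evaluators",
    json={"code": METRIC_CODE, "data": {"output": "abc", "reference": "abc"}},
)
response.raise_for_status()
print(response.json())  # e.g. {"scores": [{"name": "user_defined_equals_metric", "value": 1.0, ...}]}
```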