diff --git a/apps/opik-python-backend/README.md b/apps/opik-python-backend/README.md
new file mode 100644
index 0000000000..9a43dbe366
--- /dev/null
+++ b/apps/opik-python-backend/README.md
@@ -0,0 +1,23 @@
+# Opik Python Backend
+
+## Requirements
+
+- Install Python: at least the minimum version compatible with the Opik Python SDK.
+- Create and activate a Python virtual environment.
+- Install all dependencies from `requirements.txt`.
+- To run the tests, also install the dependencies from `tests/test_requirements.txt`.
+
+## Running the Flask service
+
+> [!TIP]
+> Run in debug mode during development; it reloads the code automatically.
+
+- Run from the `apps/opik-python-backend` directory.
+- Run the `opik_backend` module.
+- Enable debug mode with the `--debug` flag.
+
+```bash
+flask --app src/opik_backend --debug run
+```
+
+The service is reachable at `http://localhost:5000`.
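For convenience, the requirement list above translates into roughly the following shell session. This is a minimal sketch: the `python3` interpreter name and the `venv`-based workflow are assumptions, and any equivalent virtual-environment tool works just as well.

```bash
# Run from the apps/opik-python-backend directory.
python3 -m venv .venv
source .venv/bin/activate                    # on Windows: .venv\Scripts\activate
pip install -r requirements.txt              # runtime dependencies
pip install -r tests/test_requirements.txt   # only needed for running the tests
```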
diff --git a/apps/opik-python-backend/requirements.txt b/apps/opik-python-backend/requirements.txt
new file mode 100644
index 0000000000..83ccf1f693
--- /dev/null
+++ b/apps/opik-python-backend/requirements.txt
@@ -0,0 +1,61 @@
+aiohappyeyeballs==2.4.4
+aiohttp==3.11.11
+aiosignal==1.3.2
+annotated-types==0.7.0
+anyio==4.7.0
+attrs==24.3.0
+blinker==1.9.0
+certifi==2024.12.14
+charset-normalizer==3.4.0
+click==8.1.7
+distro==1.9.0
+filelock==3.16.1
+Flask==3.1.0
+frozenlist==1.5.0
+fsspec==2024.12.0
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.27.2
+huggingface-hub==0.27.0
+idna==3.10
+importlib_metadata==8.5.0
+iniconfig==2.0.0
+itsdangerous==2.2.0
+Jinja2==3.1.4
+jiter==0.8.2
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+Levenshtein==0.26.1
+litellm==1.55.7
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+mdurl==0.1.2
+multidict==6.1.0
+openai==1.58.1
+opik==1.3.0
+packaging==24.2
+pluggy==1.5.0
+propcache==0.2.1
+pydantic==2.10.4
+pydantic-settings==2.7.0
+pydantic_core==2.27.2
+Pygments==2.18.0
+python-dotenv==1.0.1
+PyYAML==6.0.2
+RapidFuzz==3.11.0
+referencing==0.35.1
+regex==2024.11.6
+requests==2.32.3
+rich==13.9.4
+rpds-py==0.22.3
+sniffio==1.3.1
+tenacity==9.0.0
+tiktoken==0.8.0
+tokenizers==0.21.0
+tqdm==4.67.1
+typing_extensions==4.12.2
+urllib3==2.2.3
+uuid7==0.1.0
+Werkzeug==3.1.3
+yarl==1.18.3
+zipp==3.21.0
diff --git a/apps/opik-python-backend/src/opik_backend/__init__.py b/apps/opik-python-backend/src/opik_backend/__init__.py
new file mode 100644
index 0000000000..3482ee389a
--- /dev/null
+++ b/apps/opik-python-backend/src/opik_backend/__init__.py
@@ -0,0 +1,17 @@
+from flask import Flask
+
+
+def create_app(test_config=None):
+    app = Flask(__name__, instance_relative_config=True)
+
+    if test_config is None:
+        # load the instance config, if it exists, when not testing
+        app.config.from_pyfile('config.py', silent=True)
+    else:
+        # load the test config if passed in
+        app.config.from_mapping(test_config)
+
+    from opik_backend.evaluator import evaluator
+    app.register_blueprint(evaluator)
+
+    return app
diff --git a/apps/opik-python-backend/src/opik_backend/evaluator.py b/apps/opik-python-backend/src/opik_backend/evaluator.py
new file mode 100644
index 0000000000..3b1f81e032
--- /dev/null
+++ b/apps/opik-python-backend/src/opik_backend/evaluator.py
@@ -0,0 +1,88 @@
+import inspect
+from types import ModuleType
+from typing import Type, Union, List, Any, Dict
+
+from flask import request, abort, jsonify, Blueprint, current_app
+from opik.evaluation.metrics import BaseMetric
+from opik.evaluation.metrics.score_result import ScoreResult
+from werkzeug.exceptions import HTTPException
+
+from .helpers.id_helpers import uuid4_str
+
+evaluator = Blueprint('evaluator', __name__, url_prefix='/v1/private/evaluators')
+
+
+def get_module(code: str, module_name: str = uuid4_str()) -> ModuleType:
+    module: ModuleType = ModuleType(module_name)
+    exec(code, module.__dict__)
+    return module
+
+
+def get_metric_class(module: ModuleType) -> Type[BaseMetric]:
+    for _, cls in inspect.getmembers(module, inspect.isclass):
+        if issubclass(cls, BaseMetric):
+            return cls
+
+
+def evaluate_metric(metric_class: Type[BaseMetric], data: Dict[Any, Any]) -> Union[ScoreResult, List[ScoreResult]]:
+    base_metric: BaseMetric = metric_class()
+    return base_metric.score(**data)
+
+
+def to_scores(score_result: Union[ScoreResult, List[ScoreResult]]) -> List[ScoreResult]:
+    scores: List[ScoreResult] = []
+    if isinstance(score_result, ScoreResult):
+        scores = [score_result]
+    elif isinstance(score_result, list):
+        for item in score_result:
+            if isinstance(item, ScoreResult):
+                scores.append(item)
+    return scores
+
+
+@evaluator.errorhandler(400)
+def bad_request(exception: HTTPException):
+    return jsonify(error=str(exception)), 400
+
+
+@evaluator.route("", methods=["POST"])
+def execute_evaluator():
+    if request.method != "POST":
+        return
+
+    payload: Any = request.get_json(force=True)
+
+    code: str = payload.get("code")
+    if code is None:
+        abort(400, "Field 'code' is missing in the request")
+
+    data: Dict[Any, Any] = payload.get("data")
+    if data is None:
+        abort(400, "Field 'data' is missing in the request")
+
+    try:
+        module: ModuleType = get_module(code)
+        metric_class: Type[BaseMetric] = get_metric_class(module)
+    except Exception as exception:
+        current_app.logger.info("Exception getting metric class, message '%s', code '%s'", exception, code)
+        abort(400, "Field 'code' contains invalid Python code")
+
+    if metric_class is None:
+        current_app.logger.info("Missing BaseMetric in code '%s'", code)
+        abort(400,
+              "Field 'code' in the request doesn't contain a subclass implementation of 'opik.evaluation.metrics.BaseMetric'")
+
+    score_result: List[ScoreResult] = []
+    try:
+        score_result = evaluate_metric(metric_class, data)
+    except Exception as exception:
+        current_app.logger.info("Exception evaluating metric, message '%s', data '%s', code '%s'",
+                                exception, data, code)
+        abort(400, "The provided 'code' and 'data' fields can't be evaluated")
+
+    scores: List[ScoreResult] = to_scores(score_result)
+    if len(scores) == 0:
+        current_app.logger.info("Missing ScoreResult in code '%s'", code)
+        abort(400, "The provided 'code' field didn't return any 'opik.evaluation.metrics.ScoreResult'")
+
+    return jsonify({"scores": scores})
diff --git a/apps/opik-python-backend/src/opik_backend/helpers/__init__.py b/apps/opik-python-backend/src/opik_backend/helpers/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/apps/opik-python-backend/src/opik_backend/helpers/id_helpers.py b/apps/opik-python-backend/src/opik_backend/helpers/id_helpers.py
new file mode 100644
index 0000000000..56b8cb476b
--- /dev/null
+++ b/apps/opik-python-backend/src/opik_backend/helpers/id_helpers.py
@@ -0,0 +1,5 @@
+import uuid
+
+
+def uuid4_str() -> str:
+    return str(uuid.uuid4())
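To make the evaluator contract above concrete, here is a hypothetical request against a locally running instance (Flask's default `http://localhost:5000`, per the README). The `Equals` metric and its `equals_metric` name are illustrative, not part of the codebase; the request body simply has to provide `code` containing a `BaseMetric` subclass and `data` with the keyword arguments for its `score()` method.

```bash
# Hypothetical smoke test of POST /v1/private/evaluators; assumes the service is
# running locally. The "code" field carries the metric source as a JSON string.
curl -X POST http://localhost:5000/v1/private/evaluators \
  -H "Content-Type: application/json" \
  -d '{
    "code": "from typing import Any\nfrom opik.evaluation.metrics import base_metric, score_result\n\nclass Equals(base_metric.BaseMetric):\n    def __init__(self, name: str = \"equals_metric\"):\n        super().__init__(name=name, track=False)\n\n    def score(self, output: str, reference: str, **kwargs: Any) -> score_result.ScoreResult:\n        return score_result.ScoreResult(value=1.0 if output == reference else 0.0, name=self.name)",
    "data": {"output": "abc", "reference": "abc"}
  }'
# A successful call returns HTTP 200 with a body shaped like {"scores": [...]};
# invalid code, a missing BaseMetric subclass, or a failing score() yields HTTP 400.
```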
"TESTING": True, + }) + + # other setup can go here + + yield app + + # clean up / reset resources here + + +@pytest.fixture() +def client(app): + return app.test_client() + + +@pytest.fixture() +def runner(app): + return app.test_cli_runner() diff --git a/apps/opik-python-backend/tests/test_evaluator.py b/apps/opik-python-backend/tests/test_evaluator.py new file mode 100644 index 0000000000..103600d707 --- /dev/null +++ b/apps/opik-python-backend/tests/test_evaluator.py @@ -0,0 +1,246 @@ +import pytest +from opik.evaluation.metrics.score_result import ScoreResult + +EVALUATORS_URL = "/v1/private/evaluators" + +USER_DEFINED_METRIC = """ +from typing import Any + +from opik.evaluation.metrics import base_metric, score_result + + +class UserDefinedEquals(base_metric.BaseMetric): + def __init__( + self, + name: str = "user_defined_equals_metric", + ): + super().__init__( + name=name, + track=False, + ) + + def score( + self, output: str, reference: str, **ignored_kwargs: Any + ) -> score_result.ScoreResult: + value = 1.0 if output == reference else 0.0 + return score_result.ScoreResult(value=value, name=self.name) +""" + +LIST_RESPONSE_METRIC = """ +from typing import Any + +from opik.evaluation.metrics import base_metric, score_result + + +class UserDefinedEquals(base_metric.BaseMetric): + def __init__( + self, + name: str = "user_defined_equals_metric", + ): + super().__init__( + name=name, + track=False, + ) + + def score( + self, output: str, reference: str, **ignored_kwargs: Any + ) -> score_result.ScoreResult: + value = 1.0 if output == reference else 0.0 + return [score_result.ScoreResult(value=value, name=self.name), score_result.ScoreResult(value=0.5, name=self.name)] +""" + +INVALID_METRIC = """ +from typing import + +from opik.evaluation.metrics import base_metric, score_result + + +class UserDefinedEquals(base_metric.BaseMetric): + def __init__( + self, + name: str = "user_defined_equals_metric", + ): + super().__init__( + name=name, + track=False, + ) + + def score( + self, output: str, reference: str, **ignored_kwargs: Any + ) -> score_result.ScoreResult: + value = 1.0 if output == reference else 0.0 + return score_result.ScoreResult(value=value, name=self.name) +""" + +MISSING_BASE_METRIC = """ +from typing import Any + +from opik.evaluation.metrics import base_metric, score_result + + +class UserDefinedEquals(): + def __init__( + self, + name: str = "user_defined_equals_metric", + ): + super().__init__( + name=name, + track=False, + ) + + def score( + self, output: str, reference: str, **ignored_kwargs: Any + ) -> score_result.ScoreResult: + value = 1.0 if output == reference else 0.0 + return score_result.ScoreResult(value=value, name=self.name) +""" + +SCORE_EXCEPTION_METRIC = """ +from typing import Any + +from opik.evaluation.metrics import base_metric, score_result + + +class UserDefinedEquals(base_metric.BaseMetric): + def __init__( + self, + name: str = "user_defined_equals_metric", + ): + super().__init__( + name=name, + track=False, + ) + + def score( + self, output: str, reference: str, **ignored_kwargs: Any + ) -> score_result.ScoreResult: + raise Exception("Exception while scoring") +""" + +MISSING_SCORE_METRIC = """ +from typing import Any + +from opik.evaluation.metrics import base_metric, score_result + + +class UserDefinedEquals(base_metric.BaseMetric): + def __init__( + self, + name: str = "user_defined_equals_metric", + ): + super().__init__( + name=name, + track=False, + ) + + def score( + self, output: str, reference: str, **ignored_kwargs: Any + ) -> 
+    ) -> score_result.ScoreResult:
+        return None
+"""
+
+DATA = {
+    "output": "abc",
+    "reference": "abc"
+}
+
+
+@pytest.mark.parametrize("data,code, expected", [
+    (
+        DATA,
+        USER_DEFINED_METRIC,
+        [ScoreResult(name="user_defined_equals_metric", value=1.0).__dict__]),
+    (
+        {"output": "abc", "reference": "ab"},
+        USER_DEFINED_METRIC,
+        [ScoreResult(name="user_defined_equals_metric", value=0.0).__dict__]),
+    (
+        DATA,
+        LIST_RESPONSE_METRIC,
+        [
+            ScoreResult(name="user_defined_equals_metric", value=1.0).__dict__,
+            ScoreResult(name="user_defined_equals_metric", value=0.5).__dict__,
+        ]
+    ),
+
+])
+def test_success(client, data, code, expected):
+    response = client.post(EVALUATORS_URL, json={
+        "data": data,
+        "code": code
+    })
+
+    assert response.status_code == 200
+    assert response.json['scores'] == expected
+
+
+def test_options_method_returns_ok(client):
+    response = client.options(EVALUATORS_URL)
+    assert response.status_code == 200
+    assert response.get_json() is None
+
+
+def test_other_method_returns_method_not_allowed(client):
+    response = client.get(EVALUATORS_URL)
+    assert response.status_code == 405
+
+
+def test_missing_request_returns_bad_request(client):
+    response = client.post(EVALUATORS_URL, json=None)
+    assert response.status_code == 400
+    assert response.json[
+               "error"] == "400 Bad Request: The browser (or proxy) sent a request that this server could not understand."
+
+
+def test_missing_code_returns_bad_request(client):
+    response = client.post(EVALUATORS_URL, json={
+        "data": DATA
+    })
+    assert response.status_code == 400
+    assert response.json["error"] == "400 Bad Request: Field 'code' is missing in the request"
+
+
+def test_missing_data_returns_bad_request(client):
+    response = client.post(EVALUATORS_URL, json={
+        "code": USER_DEFINED_METRIC
+    })
+    assert response.status_code == 400
+    assert response.json["error"] == "400 Bad Request: Field 'data' is missing in the request"
+
+
+def test_invalid_code_returns_bad_request(client):
+    response = client.post(EVALUATORS_URL, json={
+        "data": DATA,
+        "code": INVALID_METRIC
+    })
+    assert response.status_code == 400
+    assert response.json["error"] == "400 Bad Request: Field 'code' contains invalid Python code"
+
+
+def test_missing_metric_returns_bad_request(client):
+    response = client.post(EVALUATORS_URL, json={
+        "data": DATA,
+        "code": MISSING_BASE_METRIC
+    })
+    assert response.status_code == 400
+    assert response.json[
+               "error"] == "400 Bad Request: Field 'code' in the request doesn't contain a subclass implementation of 'opik.evaluation.metrics.BaseMetric'"
+
+
+def test_evaluation_exception_returns_bad_request(client):
+    response = client.post(EVALUATORS_URL, json={
+        "data": DATA,
+        "code": SCORE_EXCEPTION_METRIC
+    })
+    assert response.status_code == 400
+    assert response.json["error"] == "400 Bad Request: The provided 'code' and 'data' fields can't be evaluated"
+
+
+def test_no_scores_returns_bad_request(client):
+    response = client.post(EVALUATORS_URL, json={
+        "data": DATA,
+        "code": MISSING_SCORE_METRIC
+    })
+    assert response.status_code == 400
+    assert response.json[
+               "error"] == "400 Bad Request: The provided 'code' field didn't return any 'opik.evaluation.metrics.ScoreResult'"
diff --git a/apps/opik-python-backend/tests/test_requirements.txt b/apps/opik-python-backend/tests/test_requirements.txt
new file mode 100644
index 0000000000..d197ada2ff
--- /dev/null
+++ b/apps/opik-python-backend/tests/test_requirements.txt
@@ -0,0 +1 @@
+pytest==8.3.4
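To close the loop on the README's testing note, a sketch of a local test run. The `PYTHONPATH=src` prefix is an assumption for the case where the `opik_backend` package is not installed into the virtual environment; adjust it to however the package is actually made importable in your setup.

```bash
# From apps/opik-python-backend, inside the virtual environment described in the README,
# with both requirements.txt and tests/test_requirements.txt installed.
PYTHONPATH=src python -m pytest tests/ -v
```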