diff --git a/apps/opik-python-backend/README.md b/apps/opik-python-backend/README.md
new file mode 100644
index 0000000000..9a43dbe366
--- /dev/null
+++ b/apps/opik-python-backend/README.md
@@ -0,0 +1,23 @@
+# Opik Python Backend
+
+## Requirements
+
+- Install Python: at least the minimum version compatible with the Opik Python SDK.
+- Create and activate a Python virtual environment.
+- Install all dependencies from `requirements.txt`.
+- To run the tests, also install the dependencies from `tests/test_requirements.txt`.
+
+## Running the Flask service
+
+> [!TIP]
+> Run in debug mode during development; it reloads the code automatically.
+
+- Run from the `apps/opik-python-backend` directory.
+- Run the `opik_backend` module.
+- Enable debug mode with the `--debug` flag.
+
+```bash
+flask --app src/opik_backend --debug run
+```
+
+The service is reachable at `http://localhost:5000`.
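For convenience, the requirement list above translates into roughly the following shell session. This is a minimal sketch: the `python3` interpreter name and the `venv`-based workflow are assumptions, and any equivalent virtual-environment tool works just as well.

```bash
# Run from the apps/opik-python-backend directory.
python3 -m venv .venv
source .venv/bin/activate                    # on Windows: .venv\Scripts\activate
pip install -r requirements.txt              # runtime dependencies
pip install -r tests/test_requirements.txt   # only needed for running the tests
```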
diff --git a/apps/opik-python-backend/requirements.txt b/apps/opik-python-backend/requirements.txt
new file mode 100644
index 0000000000..83ccf1f693
--- /dev/null
+++ b/apps/opik-python-backend/requirements.txt
@@ -0,0 +1,61 @@
+aiohappyeyeballs==2.4.4
+aiohttp==3.11.11
+aiosignal==1.3.2
+annotated-types==0.7.0
+anyio==4.7.0
+attrs==24.3.0
+blinker==1.9.0
+certifi==2024.12.14
+charset-normalizer==3.4.0
+click==8.1.7
+distro==1.9.0
+filelock==3.16.1
+Flask==3.1.0
+frozenlist==1.5.0
+fsspec==2024.12.0
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.27.2
+huggingface-hub==0.27.0
+idna==3.10
+importlib_metadata==8.5.0
+iniconfig==2.0.0
+itsdangerous==2.2.0
+Jinja2==3.1.4
+jiter==0.8.2
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+Levenshtein==0.26.1
+litellm==1.55.7
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+mdurl==0.1.2
+multidict==6.1.0
+openai==1.58.1
+opik==1.3.0
+packaging==24.2
+pluggy==1.5.0
+propcache==0.2.1
+pydantic==2.10.4
+pydantic-settings==2.7.0
+pydantic_core==2.27.2
+Pygments==2.18.0
+python-dotenv==1.0.1
+PyYAML==6.0.2
+RapidFuzz==3.11.0
+referencing==0.35.1
+regex==2024.11.6
+requests==2.32.3
+rich==13.9.4
+rpds-py==0.22.3
+sniffio==1.3.1
+tenacity==9.0.0
+tiktoken==0.8.0
+tokenizers==0.21.0
+tqdm==4.67.1
+typing_extensions==4.12.2
+urllib3==2.2.3
+uuid7==0.1.0
+Werkzeug==3.1.3
+yarl==1.18.3
+zipp==3.21.0
diff --git a/apps/opik-python-backend/src/opik_backend/__init__.py b/apps/opik-python-backend/src/opik_backend/__init__.py
new file mode 100644
index 0000000000..3482ee389a
--- /dev/null
+++ b/apps/opik-python-backend/src/opik_backend/__init__.py
@@ -0,0 +1,17 @@
+from flask import Flask
+
+
+def create_app(test_config=None):
+    app = Flask(__name__, instance_relative_config=True)
+
+    if test_config is None:
+        # load the instance config, if it exists, when not testing
+        app.config.from_pyfile('config.py', silent=True)
+    else:
+        # load the test config if passed in
+        app.config.from_mapping(test_config)
+
+    from opik_backend.evaluator import evaluator
+    app.register_blueprint(evaluator)
+
+    return app
diff --git a/apps/opik-python-backend/src/opik_backend/evaluator.py b/apps/opik-python-backend/src/opik_backend/evaluator.py
new file mode 100644
index 0000000000..3b1f81e032
--- /dev/null
+++ b/apps/opik-python-backend/src/opik_backend/evaluator.py
@@ -0,0 +1,88 @@
+import inspect
+from types import ModuleType
+from typing import Type, Union, List, Any, Dict
+
+from flask import request, abort, jsonify, Blueprint, current_app
+from opik.evaluation.metrics import BaseMetric
+from opik.evaluation.metrics.score_result import ScoreResult
+from werkzeug.exceptions import HTTPException
+
+from .helpers.id_helpers import uuid4_str
+
+evaluator = Blueprint('evaluator', __name__, url_prefix='/v1/private/evaluators')
+
+
+def get_module(code: str, module_name: str = uuid4_str()) -> ModuleType:
+    module: ModuleType = ModuleType(module_name)
+    exec(code, module.__dict__)
+    return module
+
+
+def get_metric_class(module: ModuleType) -> Type[BaseMetric]:
+    for _, cls in inspect.getmembers(module, inspect.isclass):
+        if issubclass(cls, BaseMetric):
+            return cls
+
+
+def evaluate_metric(metric_class: Type[BaseMetric], data: Dict[Any, Any]) -> Union[ScoreResult, List[ScoreResult]]:
+    base_metric: BaseMetric = metric_class()
+    return base_metric.score(**data)
+
+
+def to_scores(score_result: Union[ScoreResult, List[ScoreResult]]) -> List[ScoreResult]:
+    scores: List[ScoreResult] = []
+    if isinstance(score_result, ScoreResult):
+        scores = [score_result]
+    elif isinstance(score_result, list):
+        for item in score_result:
+            if isinstance(item, ScoreResult):
+                scores.append(item)
+    return scores
+
+
+@evaluator.errorhandler(400)
+def bad_request(exception: HTTPException):
+    return jsonify(error=str(exception)), 400
+
+
+@evaluator.route("", methods=["POST"])
+def execute_evaluator():
+    if request.method != "POST":
+        return
+
+    payload: Any = request.get_json(force=True)
+
+    code: str = payload.get("code")
+    if code is None:
+        abort(400, "Field 'code' is missing in the request")
+
+    data: Dict[Any, Any] = payload.get("data")
+    if data is None:
+        abort(400, "Field 'data' is missing in the request")
+
+    try:
+        module: ModuleType = get_module(code)
+        metric_class: Type[BaseMetric] = get_metric_class(module)
+    except Exception as exception:
+        current_app.logger.info("Exception getting metric class, message '%s', code '%s'", exception, code)
+        abort(400, "Field 'code' contains invalid Python code")
+
+    if metric_class is None:
+        current_app.logger.info("Missing BaseMetric in code '%s'", code)
+        abort(400,
+              "Field 'code' in the request doesn't contain a subclass implementation of 'opik.evaluation.metrics.BaseMetric'")
+
+    score_result: List[ScoreResult] = []
+    try:
+        score_result = evaluate_metric(metric_class, data)
+    except Exception as exception:
+        current_app.logger.info("Exception evaluating metric, message '%s', data '%s', code '%s'",
+                                exception, data, code)
+        abort(400, "The provided 'code' and 'data' fields can't be evaluated")
+
+    scores: List[ScoreResult] = to_scores(score_result)
+    if len(scores) == 0:
+        current_app.logger.info("Missing ScoreResult in code '%s'", code)
+        abort(400, "The provided 'code' field didn't return any 'opik.evaluation.metrics.ScoreResult'")
+
+    return jsonify({"scores": scores})
diff --git a/apps/opik-python-backend/src/opik_backend/helpers/__init__.py b/apps/opik-python-backend/src/opik_backend/helpers/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/apps/opik-python-backend/src/opik_backend/helpers/id_helpers.py b/apps/opik-python-backend/src/opik_backend/helpers/id_helpers.py
new file mode 100644
index 0000000000..56b8cb476b
--- /dev/null
+++ b/apps/opik-python-backend/src/opik_backend/helpers/id_helpers.py
@@ -0,0 +1,5 @@
+import uuid
+
+
+def uuid4_str() -> str:
+    return str(uuid.uuid4())
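To make the evaluator contract above concrete, here is a hypothetical request against a locally running instance (Flask's default `http://localhost:5000`, per the README). The `Equals` metric and its `equals_metric` name are illustrative, not part of the codebase; the request body simply has to provide `code` containing a `BaseMetric` subclass and `data` with the keyword arguments for its `score()` method.

```bash
# Hypothetical smoke test of POST /v1/private/evaluators; assumes the service is
# running locally. The "code" field carries the metric source as a JSON string.
curl -X POST http://localhost:5000/v1/private/evaluators \
  -H "Content-Type: application/json" \
  -d '{
    "code": "from typing import Any\nfrom opik.evaluation.metrics import base_metric, score_result\n\nclass Equals(base_metric.BaseMetric):\n    def __init__(self, name: str = \"equals_metric\"):\n        super().__init__(name=name, track=False)\n\n    def score(self, output: str, reference: str, **kwargs: Any) -> score_result.ScoreResult:\n        return score_result.ScoreResult(value=1.0 if output == reference else 0.0, name=self.name)",
    "data": {"output": "abc", "reference": "abc"}
  }'
# A successful call returns HTTP 200 with a body shaped like {"scores": [...]};
# invalid code, a missing BaseMetric subclass, or a failing score() yields HTTP 400.
```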
"TESTING": True, + }) + + # other setup can go here + + yield app + + # clean up / reset resources here + + +@pytest.fixture() +def client(app): + return app.test_client() + + +@pytest.fixture() +def runner(app): + return app.test_cli_runner() diff --git a/apps/opik-python-backend/tests/test_evaluator.py b/apps/opik-python-backend/tests/test_evaluator.py new file mode 100644 index 0000000000..103600d707 --- /dev/null +++ b/apps/opik-python-backend/tests/test_evaluator.py @@ -0,0 +1,246 @@ +import pytest +from opik.evaluation.metrics.score_result import ScoreResult + +EVALUATORS_URL = "/v1/private/evaluators" + +USER_DEFINED_METRIC = """ +from typing import Any + +from opik.evaluation.metrics import base_metric, score_result + + +class UserDefinedEquals(base_metric.BaseMetric): + def __init__( + self, + name: str = "user_defined_equals_metric", + ): + super().__init__( + name=name, + track=False, + ) + + def score( + self, output: str, reference: str, **ignored_kwargs: Any + ) -> score_result.ScoreResult: + value = 1.0 if output == reference else 0.0 + return score_result.ScoreResult(value=value, name=self.name) +""" + +LIST_RESPONSE_METRIC = """ +from typing import Any + +from opik.evaluation.metrics import base_metric, score_result + + +class UserDefinedEquals(base_metric.BaseMetric): + def __init__( + self, + name: str = "user_defined_equals_metric", + ): + super().__init__( + name=name, + track=False, + ) + + def score( + self, output: str, reference: str, **ignored_kwargs: Any + ) -> score_result.ScoreResult: + value = 1.0 if output == reference else 0.0 + return [score_result.ScoreResult(value=value, name=self.name), score_result.ScoreResult(value=0.5, name=self.name)] +""" + +INVALID_METRIC = """ +from typing import + +from opik.evaluation.metrics import base_metric, score_result + + +class UserDefinedEquals(base_metric.BaseMetric): + def __init__( + self, + name: str = "user_defined_equals_metric", + ): + super().__init__( + name=name, + track=False, + ) + + def score( + self, output: str, reference: str, **ignored_kwargs: Any + ) -> score_result.ScoreResult: + value = 1.0 if output == reference else 0.0 + return score_result.ScoreResult(value=value, name=self.name) +""" + +MISSING_BASE_METRIC = """ +from typing import Any + +from opik.evaluation.metrics import base_metric, score_result + + +class UserDefinedEquals(): + def __init__( + self, + name: str = "user_defined_equals_metric", + ): + super().__init__( + name=name, + track=False, + ) + + def score( + self, output: str, reference: str, **ignored_kwargs: Any + ) -> score_result.ScoreResult: + value = 1.0 if output == reference else 0.0 + return score_result.ScoreResult(value=value, name=self.name) +""" + +SCORE_EXCEPTION_METRIC = """ +from typing import Any + +from opik.evaluation.metrics import base_metric, score_result + + +class UserDefinedEquals(base_metric.BaseMetric): + def __init__( + self, + name: str = "user_defined_equals_metric", + ): + super().__init__( + name=name, + track=False, + ) + + def score( + self, output: str, reference: str, **ignored_kwargs: Any + ) -> score_result.ScoreResult: + raise Exception("Exception while scoring") +""" + +MISSING_SCORE_METRIC = """ +from typing import Any + +from opik.evaluation.metrics import base_metric, score_result + + +class UserDefinedEquals(base_metric.BaseMetric): + def __init__( + self, + name: str = "user_defined_equals_metric", + ): + super().__init__( + name=name, + track=False, + ) + + def score( + self, output: str, reference: str, **ignored_kwargs: Any + ) -> 
+    ) -> score_result.ScoreResult:
+        return None
+"""
+
+DATA = {
+    "output": "abc",
+    "reference": "abc"
+}
+
+
+@pytest.mark.parametrize("data,code, expected", [
+    (
+        DATA,
+        USER_DEFINED_METRIC,
+        [ScoreResult(name="user_defined_equals_metric", value=1.0).__dict__]),
+    (
+        {"output": "abc", "reference": "ab"},
+        USER_DEFINED_METRIC,
+        [ScoreResult(name="user_defined_equals_metric", value=0.0).__dict__]),
+    (
+        DATA,
+        LIST_RESPONSE_METRIC,
+        [
+            ScoreResult(name="user_defined_equals_metric", value=1.0).__dict__,
+            ScoreResult(name="user_defined_equals_metric", value=0.5).__dict__,
+        ]
+    ),
+
+])
+def test_success(client, data, code, expected):
+    response = client.post(EVALUATORS_URL, json={
+        "data": data,
+        "code": code
+    })
+
+    assert response.status_code == 200
+    assert response.json['scores'] == expected
+
+
+def test_options_method_returns_ok(client):
+    response = client.options(EVALUATORS_URL)
+    assert response.status_code == 200
+    assert response.get_json() is None
+
+
+def test_other_method_returns_method_not_allowed(client):
+    response = client.get(EVALUATORS_URL)
+    assert response.status_code == 405
+
+
+def test_missing_request_returns_bad_request(client):
+    response = client.post(EVALUATORS_URL, json=None)
+    assert response.status_code == 400
+    assert response.json[
+               "error"] == "400 Bad Request: The browser (or proxy) sent a request that this server could not understand."
+
+
+def test_missing_code_returns_bad_request(client):
+    response = client.post(EVALUATORS_URL, json={
+        "data": DATA
+    })
+    assert response.status_code == 400
+    assert response.json["error"] == "400 Bad Request: Field 'code' is missing in the request"
+
+
+def test_missing_data_returns_bad_request(client):
+    response = client.post(EVALUATORS_URL, json={
+        "code": USER_DEFINED_METRIC
+    })
+    assert response.status_code == 400
+    assert response.json["error"] == "400 Bad Request: Field 'data' is missing in the request"
+
+
+def test_invalid_code_returns_bad_request(client):
+    response = client.post(EVALUATORS_URL, json={
+        "data": DATA,
+        "code": INVALID_METRIC
+    })
+    assert response.status_code == 400
+    assert response.json["error"] == "400 Bad Request: Field 'code' contains invalid Python code"
+
+
+def test_missing_metric_returns_bad_request(client):
+    response = client.post(EVALUATORS_URL, json={
+        "data": DATA,
+        "code": MISSING_BASE_METRIC
+    })
+    assert response.status_code == 400
+    assert response.json[
+               "error"] == "400 Bad Request: Field 'code' in the request doesn't contain a subclass implementation of 'opik.evaluation.metrics.BaseMetric'"
+
+
+def test_evaluation_exception_returns_bad_request(client):
+    response = client.post(EVALUATORS_URL, json={
+        "data": DATA,
+        "code": SCORE_EXCEPTION_METRIC
+    })
+    assert response.status_code == 400
+    assert response.json["error"] == "400 Bad Request: The provided 'code' and 'data' fields can't be evaluated"
+
+
+def test_no_scores_returns_bad_request(client):
+    response = client.post(EVALUATORS_URL, json={
+        "data": DATA,
+        "code": MISSING_SCORE_METRIC
+    })
+    assert response.status_code == 400
+    assert response.json[
+               "error"] == "400 Bad Request: The provided 'code' field didn't return any 'opik.evaluation.metrics.ScoreResult'"
diff --git a/apps/opik-python-backend/tests/test_requirements.txt b/apps/opik-python-backend/tests/test_requirements.txt
new file mode 100644
index 0000000000..d197ada2ff
--- /dev/null
+++ b/apps/opik-python-backend/tests/test_requirements.txt
@@ -0,0 +1 @@
+pytest==8.3.4
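To close the loop on the README's testing note, a sketch of a local test run. The `PYTHONPATH=src` prefix is an assumption for the case where the `opik_backend` package is not installed into the virtual environment; adjust it to however the package is actually made importable in your setup.

```bash
# From apps/opik-python-backend, inside the virtual environment described in the README,
# with both requirements.txt and tests/test_requirements.txt installed.
PYTHONPATH=src python -m pytest tests/ -v
```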