diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 31ff92d0..d6cf4205 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -11,6 +11,10 @@ jobs:
   build-backend-docker:
     runs-on: self-hosted
     steps:
+      - name: Setup python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Setup prereqs
@@ -24,10 +28,23 @@ jobs:
           cp backend/.env.example backend/.env
           sed -i 's|{{GOOGLE_API_KEY}}|${{ secrets.GOOGLE_API_KEY }}|g' backend/.env
           sed -i 's|{{PATH_TO_GOOGLE_APPLICATION_CREDENTIALS}}|src/secret.json|g' backend/.env
+          cp backend/.env evaluation/.env
+          cp backend/.env frontend/.env
           cp ${{ secrets.PATH_TO_GOOGLE_APPLICATION_CREDENTIALS }} backend/src
+          cp ${{ secrets.PATH_TO_GOOGLE_APPLICATION_CREDENTIALS }} evaluation/auto_evaluation/src
       - name: Build Docker image
         run: |
           make docker
+          sleep 900 # TODO: Remove this once the docker-compose healthcheck timeout is restored/fixed.
+      - name: Run LLM CI
+        working-directory: evaluation
+        run: |
+          make llm-tests
+      - name: Create commit comment
+        uses: peter-evans/commit-comment@v3
+        with:
+          token: ${{ secrets.GH_PAT }}
+          body-path: evaluation/auto_evaluation/llm_tests_output.txt
       - name: Teardown
         if: always()
         run: |
diff --git a/.gitignore b/.gitignore
index 10bfa179..14a3dc29 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@ __pycache__/
 backend/data/*
 backend/src/*.json
 *.pyc
+*.egg-info/
 frontend/*.json
 evaluation/human_evaluation/*.json
 /*.json
@@ -21,7 +22,8 @@ documents.txt
 .venv
 
 # evaluations
-.deepeval_telemtry.txt
+**/.deepeval_telemtry.txt
 *.csv
-*.deepeval-cache.json
+**/.deepeval-cache.json
 temp_test_run_data.json
+**/llm_tests_output.txt
diff --git a/Makefile b/Makefile
index 1ebc3f65..1c6a81fa 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,6 @@
-FOLDERS=backend frontend
+.PHONY: init init-dev format check
+
+FOLDERS=backend frontend evaluation
 
 init:
 	@for folder in $(FOLDERS); do (cd $$folder && make init && cd ../); done
diff --git a/backend/Dockerfile b/backend/Dockerfile
index bc6e29f1..bc6737f8 100644
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@@ -28,4 +28,4 @@ RUN python /ORAssistant-backend/src/post_install.py
 
 EXPOSE 8000
 
-CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000"]
+CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
diff --git a/backend/src/api/routers/graphs.py b/backend/src/api/routers/graphs.py
index 0666ab78..93e8b13f 100644
--- a/backend/src/api/routers/graphs.py
+++ b/backend/src/api/routers/graphs.py
@@ -121,7 +121,7 @@ async def get_agent_response(user_input: UserInput) -> ChatResponse:
         tool_index = 1
         for tool in tools:
             urls.extend(list(output[tool_index].values())[0]["urls"])
-            context.extend(list(set(list(output[tool_index].values())[0]["context"])))
+            context.append(list(output[tool_index].values())[0]["context"])
             tool_index += 1
     else:
         llm_response = "LLM response extraction failed"
diff --git a/backend/src/tools/format_docs.py b/backend/src/tools/format_docs.py
index bcd9fbf8..a2376c41 100644
--- a/backend/src/tools/format_docs.py
+++ b/backend/src/tools/format_docs.py
@@ -5,7 +5,7 @@ def format_docs(docs: list[Document]) -> tuple[str, list[str], list[str]]:
     doc_text = ""
-    doc_texts = ""
+    doc_texts = []
     doc_urls = []
     doc_srcs = []
 
@@ -19,10 +19,11 @@ def format_docs(docs: list[Document]) -> tuple[str, list[str], list[str]]:
             doc_text = f"{gh_discussion_prompt_template}\n\n{doc.page_content}"
         else:
             doc_text = doc.page_content
+
+        doc_texts.append(doc_text)
         if "url" in doc.metadata:
             doc_urls.append(doc.metadata["url"])
+
+    doc_output = "\n\n -------------------------- \n\n".join(doc_texts)
 
-        doc_texts += f"\n\n- - - - - - - - - - - - - - - \n\n{doc_text}"
-
-    return doc_texts, doc_srcs, doc_urls
+    return doc_output, doc_srcs, doc_urls
diff --git a/evaluation/Makefile b/evaluation/Makefile
index 72878508..d0dd015f 100644
--- a/evaluation/Makefile
+++ b/evaluation/Makefile
@@ -1,7 +1,10 @@
+.PHONY: init init-dev format check clean
+
 init:
 	@python3 -m venv .venv && \
 	. .venv/bin/activate && \
-	pip install -r requirements.txt
+	pip install -r requirements.txt && \
+	pip install -e .
 
 init-dev: init
 	@. .venv/bin/activate && \
@@ -15,3 +18,12 @@ format:
 check:
 	@. .venv/bin/activate && \
 	ruff check --fix
+
+clean:
+	@rm -f llm_tests_output.txt
+	@rm -f **/.deepeval-cache.json
+
+llm-tests: clean
+	@. .venv/bin/activate && \
+	cd auto_evaluation && \
+	./llm_tests.sh 2>&1 | tee llm_tests_output.txt
diff --git a/evaluation/auto_evaluation/__init__.py b/evaluation/auto_evaluation/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/evaluation/auto_evaluation/content_metrics.json b/evaluation/auto_evaluation/content_metrics.json
deleted file mode 100644
index 274e99ee..00000000
--- a/evaluation/auto_evaluation/content_metrics.json
+++ /dev/null
@@ -1 +0,0 @@
-{"test_cases_lookup_map": {"{\"actual_output\": \"We offer a 30-day full refund at no extra cost.\", \"context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"], \"expected_output\": \"You are eligible for a 30 day full refund at no extra cost.\", \"hyperparameters\": null, \"input\": \"What if these shoes don't fit?\", \"retrieval_context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"]}": {"cached_metrics_data": [{"metric_data": {"name": "Contextual Precision", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because all relevant information was retrieved and ranked appropriately.
Great job!", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context directly answers the input question about what happens if shoes don't fit by stating 'All customers are eligible for a 30 day full refund at no extra cost.'\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Recall", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the generated output perfectly reflects the information provided in node 1 in the retrieval context, regarding the 30-day full refund policy.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"This sentence is a paraphrase of the 1st node in the retrieval context, which states \\\"All customers are eligible for a 30 day full refund at no extra cost.\\\"\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Relevancy", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the retrieval context directly addresses the user's concern about the shoes not fitting by stating that 'All customers are eligible for a 30 day full refund at no extra cost.'", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdicts\": [\n {\n \"statement\": \"All customers are eligible for a 30 day full refund at no extra cost.\",\n \"verdict\": \"yes\",\n \"reason\": null\n }\n ]\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Faithfulness", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the actual output perfectly aligns with the retrieval context, as evidenced by the absence of any contradictions.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Truths (limit=None):\n[\n \"Customers are eligible for a full refund.\",\n \"The refund period lasts 30 days.\",\n \"There is no extra cost for the refund.\"\n] \n \nClaims:\n[\n \"We offer a 30-day full refund at no extra cost.\"\n] \n \nVerdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": null\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Hallucination", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the actual output is fully supported by the provided context and doesn't introduce any contradictory or unsubstantiated information.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The actual output agrees with the provided context. While the context mentions \\\"all customers\\\", the actual output implies the same by stating \\\"we offer\\\" a 30-day full refund at no extra cost. 
This phrasing suggests a general policy applicable to all customers.\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}]}, "{\"actual_output\": \"The capital of France is Paris.\", \"context\": null, \"expected_output\": \"Paris.\", \"hyperparameters\": null, \"input\": \"What is the capital of France?\", \"retrieval_context\": null}": {"cached_metrics_data": [{"metric_data": {"name": "Answer Relevancy", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the response is perfectly relevant, addressing the input directly and completely with no irrelevant information.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Statements:\n[\n \"The capital of France is Paris.\"\n] \n \nVerdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": null\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Bias", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the output demonstrates no discernible bias.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Opinions:\n[] \n \nVerdicts:\n[]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Toxicity", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the output is entirely harmless and positive.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Opinions:\n[] \n \nVerdicts:\n[]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}]}}}
\ No newline at end of file
diff --git a/evaluation/auto_evaluation/dataset/hf_pull.py b/evaluation/auto_evaluation/dataset/hf_pull.py
index c0541cbf..8df9a062 100644
--- a/evaluation/auto_evaluation/dataset/hf_pull.py
+++ b/evaluation/auto_evaluation/dataset/hf_pull.py
@@ -1,7 +1,8 @@
 from huggingface_hub import snapshot_download
 import os
 
-if __name__ == "__main__":
+
+def main():
     cur_dir = os.path.dirname(os.path.abspath(__file__))
     snapshot_download(
         "The-OpenROAD-Project/ORAssistant_Public_Evals",
@@ -13,3 +14,7 @@
             "README.md",
         ],
     )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/evaluation/auto_evaluation/dataset/preprocess.py b/evaluation/auto_evaluation/dataset/preprocess.py
new file mode 100644
index 00000000..3a7c6f9f
--- /dev/null
+++ b/evaluation/auto_evaluation/dataset/preprocess.py
@@ -0,0 +1,60 @@
+import csv
+import json
+from typing import Any
+
+
+def read_data(csv_file: str) -> list[dict]:
+    questions = []
+    with open(csv_file, "r") as f:
+        reader = csv.reader(f)
+        header = next(reader)  # Skip the header row
+        assert len(header) == 2, "CSV file must have exactly 2 columns"
+        for row in reader:
+            questions.append(
+                {"question": row[0].strip(), "ground_truth": row[1].strip()}
+            )
+    return questions
+
+
+def write_data(results_list: list[dict[str, Any]], results_path: str):
+    keys = results_list[0].keys()
+    with open(results_path, "w") as f:
+        writer = csv.writer(f)
+        writer.writerow(list(keys))
+        for result in results_list:
+            writer.writerow([result[key] for key in keys])
+    print(f"Results written to {results_path}")
+
+
+def read_deepeval_cache():
+    metric_scores = {
+        "Contextual Precision": [],
+        "Contextual Recall": [],
+        "Hallucination": [],
+    }
+    metric_passes = {
+        "Contextual Precision": [],
+        "Contextual Recall": [],
+        "Hallucination": [],
+    }
+    with open(".deepeval-cache.json") as f:
+        results = json.load(f)
+        for _, value in results["test_cases_lookup_map"].items():
+            for metric in value["cached_metrics_data"]:
+                metric_scores[metric["metric_data"]["name"]].append(
+                    metric["metric_data"]["score"]
+                )
+                metric_passes[metric["metric_data"]["name"]].append(
+                    metric["metric_data"]["success"]
+                )
+
+    print("Average Metric Scores: ")
+    for key, value in metric_scores.items():
+        print(key, sum(value) / len(value))
+    print("Metric Passrates: ")
+    for key, value in metric_passes.items():
+        print(key, value.count(True) / len(value))
+
+
+if __name__ == "__main__":
+    read_deepeval_cache()
diff --git a/evaluation/auto_evaluation/demo.py b/evaluation/auto_evaluation/demo.py
deleted file mode 100644
index 7b7b909f..00000000
--- a/evaluation/auto_evaluation/demo.py
+++ /dev/null
@@ -1,64 +0,0 @@
-import os
-
-from dotenv import load_dotenv
-from src.models.vertex_ai import GoogleVertexAILangChain
-
-# from src.metrics.geval import make_correctness_metric
-from src.metrics.content import (
-    make_bias_metric,
-    make_toxicity_metric,
-    make_answer_relevancy_metric,
-)
-from src.metrics.retrieval import (
-    make_contextual_precision_metric,
-    make_contextual_recall_metric,
-    make_contextual_relevancy_metric,
-    make_faithfulness_metric,
-    make_hallucination_metric,
-)
-from deepeval.test_case import LLMTestCase
-from deepeval import evaluate
-
-cur_dir = os.path.dirname(__file__)
-root_dir = os.path.join(cur_dir, "../../")
-load_dotenv(os.path.join(root_dir, ".env"))
-
-if __name__ == "__main__":
-    model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002")
-    print("Retrieval metrics")
-    precision, recall, relevancy, faithfulness, hallucination = (
-        make_contextual_precision_metric(model),
-        make_contextual_recall_metric(model),
-        make_contextual_relevancy_metric(model),
-        make_faithfulness_metric(model),
-        make_hallucination_metric(model),
-    )
-
-    test_case = LLMTestCase(
-        input="What if these shoes don't fit?",
-        actual_output="We offer a 30-day full refund at no extra cost.",
-        expected_output="You are eligible for a 30 day full refund at no extra cost.",
-        context=[
-            "All customers are eligible for a 30 day full refund at no extra cost."
-        ],
-        retrieval_context=[
-            "All customers are eligible for a 30 day full refund at no extra cost."
-        ],
-    )
-    evaluate([test_case], [precision, recall, relevancy, faithfulness, hallucination])
-    os.rename(".deepeval-cache.json", "retrieval_metrics.json")
-
-    print("Content metrics")
-    answer_relevancy, bias, toxicity = (
-        make_answer_relevancy_metric(model),
-        make_bias_metric(model),
-        make_toxicity_metric(model),
-    )
-
-    test_case = LLMTestCase(
-        input="What is the capital of France?",
-        actual_output="The capital of France is Paris.",
-        expected_output="Paris.",
-    )
-    evaluate([test_case], [answer_relevancy, bias, toxicity])
-    os.rename(".deepeval-cache.json", "content_metrics.json")
diff --git a/evaluation/auto_evaluation/eval_main.py b/evaluation/auto_evaluation/eval_main.py
new file mode 100644
index 00000000..aac81503
--- /dev/null
+++ b/evaluation/auto_evaluation/eval_main.py
@@ -0,0 +1,144 @@
+"""
+Evaluation script that takes as arguments a dataset and the endpoint of
+the model to evaluate on that dataset.
+"""
+""" + +import argparse +import time +import requests +import os + +from dotenv import load_dotenv +from deepeval.test_case import LLMTestCase +from deepeval import evaluate + +from auto_evaluation.src.models.vertex_ai import GoogleVertexAILangChain +from auto_evaluation.src.metrics.retrieval import ( + make_contextual_precision_metric, + make_contextual_recall_metric, + make_hallucination_metric, +) +from auto_evaluation.dataset import hf_pull, preprocess +from tqdm import tqdm # type: ignore + +eval_root_path = os.path.join(os.path.dirname(__file__), "..") +load_dotenv(dotenv_path=os.path.join(eval_root_path, ".env")) + +# List of all available retrievers +ALL_RETRIEVERS = { + "agent-retriever": "/graphs/agent-retriever", + "agent-retriever-reranker": "/graphs/agent-retriever", + "hybrid": "/graphs/hybrid", + "sim": "/graphs/sim", + "ensemble": "/graphs/ensemble", +} + + +class EvaluationHarness: + # TODO: Use async for EvaluationHarness. + # TODO: Also requires LLM Engine to be async + def __init__(self, base_url: str, dataset: str, reranker_base_url: str = ""): + self.base_url = base_url + self.dataset = dataset + self.reranker_base_url = reranker_base_url + self.qns = preprocess.read_data(self.dataset) + self.eval_model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002") + self.log_dir = "logs" + os.makedirs(self.log_dir, exist_ok=True) + self.sanity_check() + + def sanity_check(self): + if not requests.get(f"{self.base_url}/healthcheck").status_code == 200: + raise ValueError("Endpoint is not running") + if not os.path.exists(self.dataset): + raise ValueError("Dataset path does not exist") + if ( + self.reranker_base_url + and not requests.get(f"{self.reranker_base_url}/healthcheck").status_code + == 200 + ): + raise ValueError("Reranker endpoint is not running") + + def evaluate(self, retriever: str): + retrieval_tcs = [] + response_times = [] + + # metrics + precision, recall, hallucination = ( + make_contextual_precision_metric(self.eval_model), + make_contextual_recall_metric(self.eval_model), + make_hallucination_metric(self.eval_model), + ) + + # retrieval test cases + for i, qa_pair in enumerate(tqdm(self.qns, desc="Evaluating")): + question, ground_truth = qa_pair["question"], qa_pair["ground_truth"] + response, response_time = self.query(retriever, question) + response_text = response["response"] + context = response["context"] + context_list = context[0].split("--------------------------") + + # works for: precision, recall, hallucination + retrieval_tc = LLMTestCase( + input=question, + actual_output=response_text, + expected_output=ground_truth, + context=context_list, + retrieval_context=context_list, + ) + retrieval_tcs.append(retrieval_tc) + response_times.append(response_time) + + # parallel evaluate + evaluate( + retrieval_tcs, + [precision, recall, hallucination], + print_results=False, + ) + + # parse deepeval results + preprocess.read_deepeval_cache() + + def query(self, retriever: str, query: str) -> tuple[dict, float]: + """ + Returns the response json and the time taken to get the response (ms) + """ + endpoint = ALL_RETRIEVERS[retriever] + url = ( + f"{self.base_url}/{endpoint}" + if retriever != "agent-retriever-reranker" + else f"{self.reranker_base_url}/{endpoint}" + ) + payload = {"query": query, "list_context": True, "list_sources": False} + try: + time.sleep(5) + response = requests.post(url, json=payload) + return response.json(), response.elapsed.total_seconds() * 1000 + except Exception as e: + print(f"Error querying {retriever}: {e}") + return 
+            return {
+                "response": "invalid",
+                "sources": [],
+                "context": [],
+                "tool": "string",
+            }, -999999
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Evaluation script")
+    parser.add_argument(
+        "--base_url", type=str, help="Base URL of the model to evaluate"
+    )
+    parser.add_argument(
+        "--reranker_base_url", type=str, help="Base URL of the reranker", default=""
+    )
+    parser.add_argument("--dataset", type=str, help="Path to dataset to evaluate on")
+    parser.add_argument("--retriever", type=str, help="Retriever to evaluate on")
+    args = parser.parse_args()
+
+    # Pull the dataset from huggingface hub
+    hf_pull.main()
+
+    # Evaluate the model on the dataset
+    harness = EvaluationHarness(args.base_url, args.dataset, args.reranker_base_url)
+    harness.evaluate(args.retriever)
diff --git a/evaluation/auto_evaluation/llm_tests.sh b/evaluation/auto_evaluation/llm_tests.sh
new file mode 100755
index 00000000..d44cec1b
--- /dev/null
+++ b/evaluation/auto_evaluation/llm_tests.sh
@@ -0,0 +1,16 @@
+#!/bin/bash -eu
+
+retrievers=(
+    "agent-retriever" \
+)
+
+echo "==================================="
+echo "==> Dataset: EDA Corpus"
+for retriever in "${retrievers[@]}" ; do
+    echo "==> Running tests for $retriever"
+    python eval_main.py \
+        --base_url http://localhost:8000 \
+        --dataset ./dataset/EDA_Corpus_100_Question.csv \
+        --retriever $retriever
+done
+echo "==================================="
diff --git a/evaluation/auto_evaluation/retrieval_metrics.json b/evaluation/auto_evaluation/retrieval_metrics.json
deleted file mode 100644
index 085c26e7..00000000
--- a/evaluation/auto_evaluation/retrieval_metrics.json
+++ /dev/null
@@ -1 +0,0 @@
-{"test_cases_lookup_map": {"{\"actual_output\": \"We offer a 30-day full refund at no extra cost.\", \"context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"], \"expected_output\": \"You are eligible for a 30 day full refund at no extra cost.\", \"hyperparameters\": null, \"input\": \"What if these shoes don't fit?\", \"retrieval_context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"]}": {"cached_metrics_data": [{"metric_data": {"name": "Contextual Precision", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because all relevant information was retrieved and ranked appropriately.
Great job!", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context directly answers the input question about what happens if shoes don't fit by stating 'All customers are eligible for a 30 day full refund at no extra cost.'\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Recall", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the generated output perfectly reflects the information provided in node 1 in the retrieval context, regarding the 30-day full refund policy.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"This sentence is a paraphrase of the 1st node in the retrieval context, which states \\\"All customers are eligible for a 30 day full refund at no extra cost.\\\"\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Relevancy", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the retrieval context directly addresses the user's concern about the shoes not fitting by stating that 'All customers are eligible for a 30 day full refund at no extra cost.'", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdicts\": [\n {\n \"statement\": \"All customers are eligible for a 30 day full refund at no extra cost.\",\n \"verdict\": \"yes\",\n \"reason\": null\n }\n ]\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Faithfulness", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the actual output perfectly aligns with the retrieval context, as evidenced by the absence of any contradictions.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Truths (limit=None):\n[\n \"Customers are eligible for a full refund.\",\n \"The refund period lasts 30 days.\",\n \"There is no extra cost for the refund.\"\n] \n \nClaims:\n[\n \"We offer a 30-day full refund at no extra cost.\"\n] \n \nVerdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": null\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Hallucination", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the actual output is fully supported by the provided context and doesn't introduce any contradictory or unsubstantiated information.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The actual output agrees with the provided context. While the context mentions \\\"all customers\\\", the actual output implies the same by stating \\\"we offer\\\" a 30-day full refund at no extra cost. 
This phrasing suggests a general policy applicable to all customers.\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}]}}}
\ No newline at end of file
diff --git a/evaluation/auto_evaluation/src/metrics/retrieval.py b/evaluation/auto_evaluation/src/metrics/retrieval.py
index cd7d286d..fc6470df 100644
--- a/evaluation/auto_evaluation/src/metrics/retrieval.py
+++ b/evaluation/auto_evaluation/src/metrics/retrieval.py
@@ -35,18 +35,14 @@ def make_contextual_recall_metric(model: DeepEvalBaseLLM) -> ContextualRecallMet
 def make_contextual_relevancy_metric(
     model: DeepEvalBaseLLM,
 ) -> ContextualRelevancyMetric:
-    return ContextualRelevancyMetric(
-        threshold=RELEVANCY_THRESHOLD,
-        model=model,
-        include_reason=True,
+    raise NotImplementedError(
+        "ContextualRelevancyMetric is not implemented due to protobuf incompatibility"
     )
 
 
 def make_faithfulness_metric(model: DeepEvalBaseLLM) -> FaithfulnessMetric:
-    return FaithfulnessMetric(
-        threshold=FAITHFULNESS_THRESHOLD,
-        model=model,
-        include_reason=True,
+    raise NotImplementedError(
+        "FaithfulnessMetric is not implemented due to protobuf incompatibility"
     )
diff --git a/evaluation/auto_evaluation/src/models/vertex_ai.py b/evaluation/auto_evaluation/src/models/vertex_ai.py
index 9d72fcbb..31a64748 100644
--- a/evaluation/auto_evaluation/src/models/vertex_ai.py
+++ b/evaluation/auto_evaluation/src/models/vertex_ai.py
@@ -3,10 +3,16 @@
 Custom DeepEvalLLM wrapper.
 """
 
-from typing import Any
+import instructor
 
-from langchain_google_vertexai import ChatVertexAI, HarmBlockThreshold, HarmCategory
+from typing import Any
+from vertexai.generative_models import GenerativeModel, HarmBlockThreshold, HarmCategory  # type: ignore
 from deepeval.models.base_model import DeepEvalBaseLLM
+from pydantic import BaseModel
+
+
+class Response(BaseModel):
+    content: str
 
 
 class GoogleVertexAILangChain(DeepEvalBaseLLM):
@@ -26,17 +32,43 @@ def load_model(self, *args, **kwargs):
             HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
         }
 
-        return ChatVertexAI(
+        return GenerativeModel(
             model_name=self.model_name,
             safety_settings=safety_settings,
         )
 
-    def generate(self, prompt: str) -> Any:
-        return self.model.invoke(prompt).content
+    def generate(self, prompt: str, schema: BaseModel) -> Any:
+        instructor_client = instructor.from_vertexai(
+            client=self.load_model(),
+            mode=instructor.Mode.VERTEXAI_TOOLS,
+        )
+        resp = instructor_client.messages.create(  # type: ignore
+            messages=[
+                {
+                    "role": "user",
+                    "content": prompt,
+                }
+            ],
+            response_model=schema,
+        )
+        return resp
 
-    async def a_generate(self, prompt: str) -> Any:
-        response = await self.model.ainvoke(prompt)
-        return response.content
+    async def a_generate(self, prompt: str, schema: BaseModel) -> Any:
+        instructor_client = instructor.from_vertexai(
+            client=self.load_model(),
+            mode=instructor.Mode.VERTEXAI_TOOLS,
+            _async=True,
+        )
+        resp = await instructor_client.messages.create(  # type: ignore
+            messages=[
+                {
+                    "role": "user",
+                    "content": prompt,
+                }
+            ],
+            response_model=schema,
+        )
+        return resp
 
     def get_model_name(self):
         return self.model_name
@@ -46,7 +78,7 @@ def main():
     model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002")
     prompt = "Write me a joke"
     print(f"Prompt: {prompt}")
-    response = model.generate(prompt)
+    response = model.generate(prompt, schema=Response)
     print(f"Response: {response}")
 
 
@@ -54,10 +86,14 @@ async def main_async():
     model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002")
     prompt = "Write me a joke"
     print(f"Prompt: {prompt}")
-    response = await model.a_generate(prompt)
+    response = await model.a_generate(prompt, Response)
     print(f"Response: {response}")
 
 
 if __name__ == "__main__":
-    main()
-    # asyncio.run(main_async())
+    import asyncio
+    from dotenv import load_dotenv
+
+    load_dotenv()
+    # main()
+    asyncio.run(main_async())
diff --git a/evaluation/pyproject.toml b/evaluation/pyproject.toml
index 6c8e7ebe..013e1111 100644
--- a/evaluation/pyproject.toml
+++ b/evaluation/pyproject.toml
@@ -20,6 +20,9 @@ classifiers = [
 dependencies = { file = ["requirements.txt"] }
 optional-dependencies = { test = { file = ["requirements-test.txt"] } }
 
+[tool.setuptools.packages.find]
+include = ["auto_evaluation", "human_evaluation"]
+
 [tool.mypy]
 python_version = "3.12"
 warn_unused_configs = true
diff --git a/evaluation/requirements.txt b/evaluation/requirements.txt
index 22f269d4..96e6f1ae 100644
--- a/evaluation/requirements.txt
+++ b/evaluation/requirements.txt
@@ -12,3 +12,4 @@ deepeval==1.4.9
 langchain-google-vertexai==2.0.6
 asyncio==3.4.3
 huggingface-hub==0.26.2
+instructor[vertexai]==1.5.2
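
Usage note (editor's sketch, not part of the diff): the commands below mirror the CI job and the new evaluation/Makefile and llm_tests.sh targets introduced above. They assume the backend built by "make docker" is already reachable at http://localhost:8000 and that evaluation/.env carries the same credentials as backend/.env, as the workflow sets up.

    # One-shot run via the new Makefile target: creates .venv, installs the
    # evaluation package in editable mode, runs llm_tests.sh, and tees the
    # report to auto_evaluation/llm_tests_output.txt.
    cd evaluation
    make init
    make llm-tests

    # Or invoke the harness directly for a single retriever, using the same
    # flags as llm_tests.sh.
    cd evaluation
    . .venv/bin/activate
    cd auto_evaluation
    python eval_main.py \
        --base_url http://localhost:8000 \
        --dataset ./dataset/EDA_Corpus_100_Question.csv \
        --retriever agent-retriever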