diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 31ff92d0..d6cf4205 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -11,6 +11,10 @@ jobs:
   build-backend-docker:
     runs-on: self-hosted
     steps:
+      - name: Setup python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Setup prereqs
@@ -24,10 +28,23 @@ jobs:
           cp backend/.env.example backend/.env
           sed -i 's|{{GOOGLE_API_KEY}}|${{ secrets.GOOGLE_API_KEY }}|g' backend/.env
           sed -i 's|{{PATH_TO_GOOGLE_APPLICATION_CREDENTIALS}}|src/secret.json|g' backend/.env
+          cp backend/.env evaluation/.env
+          cp backend/.env frontend/.env
           cp ${{ secrets.PATH_TO_GOOGLE_APPLICATION_CREDENTIALS }} backend/src
+          cp ${{ secrets.PATH_TO_GOOGLE_APPLICATION_CREDENTIALS }} evaluation/auto_evaluation/src
       - name: Build Docker image
         run: |
           make docker
+          sleep 900 # TODO: Remove this once the docker-compose healthcheck timeout is restored/fixed.
+      - name: Run LLM CI
+        working-directory: evaluation
+        run: |
+          make llm-tests
+      - name: Create commit comment
+        uses: peter-evans/commit-comment@v3
+        with:
+          token: ${{ secrets.GH_PAT }}
+          body-path: evaluation/auto_evaluation/llm_tests_output.txt
       - name: Teardown
         if: always()
         run: |
diff --git a/.gitignore b/.gitignore
index 10bfa179..14a3dc29 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@ __pycache__/
 backend/data/*
 backend/src/*.json
 *.pyc
+*.egg-info/
 frontend/*.json
 evaluation/human_evaluation/*.json
 /*.json
@@ -21,7 +22,8 @@ documents.txt
 .venv
 
 # evaluations
-.deepeval_telemtry.txt
+**/.deepeval_telemtry.txt
 *.csv
-*.deepeval-cache.json
+**/.deepeval-cache.json
 temp_test_run_data.json
+**/llm_tests_output.txt
diff --git a/Makefile b/Makefile
index 1ebc3f65..1c6a81fa 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,6 @@
-FOLDERS=backend frontend
+.PHONY: init init-dev format check
+
+FOLDERS=backend frontend evaluation
 
 init:
 	@for folder in $(FOLDERS); do (cd $$folder && make init && cd ../); done
diff --git a/backend/Dockerfile b/backend/Dockerfile
index bc6e29f1..bc6737f8 100644
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@@ -28,4 +28,4 @@ RUN python /ORAssistant-backend/src/post_install.py
 
 EXPOSE 8000
 
-CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000"]
+CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
diff --git a/backend/src/api/routers/graphs.py b/backend/src/api/routers/graphs.py
index 0666ab78..93e8b13f 100644
--- a/backend/src/api/routers/graphs.py
+++ b/backend/src/api/routers/graphs.py
@@ -121,7 +121,7 @@ async def get_agent_response(user_input: UserInput) -> ChatResponse:
         tool_index = 1
         for tool in tools:
             urls.extend(list(output[tool_index].values())[0]["urls"])
-            context.extend(list(set(list(output[tool_index].values())[0]["context"])))
+            context.append(list(output[tool_index].values())[0]["context"])
             tool_index += 1
     else:
         llm_response = "LLM response extraction failed"
diff --git a/backend/src/tools/format_docs.py b/backend/src/tools/format_docs.py
index bcd9fbf8..a2376c41 100644
--- a/backend/src/tools/format_docs.py
+++ b/backend/src/tools/format_docs.py
@@ -5,7 +5,7 @@ def format_docs(docs: list[Document]) -> tuple[str, list[str], list[str]]:
     doc_text = ""
-    doc_texts = ""
+    doc_texts = []
     doc_urls = []
     doc_srcs = []
 
@@ -19,10 +19,11 @@ def format_docs(docs: list[Document]) -> tuple[str, list[str], list[str]]:
             doc_text = f"{gh_discussion_prompt_template}\n\n{doc.page_content}"
         else:
             doc_text = doc.page_content
+
+        doc_texts.append(doc_text)
         if "url" in doc.metadata:
             doc_urls.append(doc.metadata["url"])
+
+    doc_output = "\n\n -------------------------- \n\n".join(doc_texts)
 
-        doc_texts += f"\n\n- - - - - - - - - - - - - - - \n\n{doc_text}"
-
-    return doc_texts, doc_srcs, doc_urls
+    return doc_output, doc_srcs, doc_urls
diff --git a/evaluation/Makefile b/evaluation/Makefile
index 72878508..d0dd015f 100644
--- a/evaluation/Makefile
+++ b/evaluation/Makefile
@@ -1,7 +1,10 @@
+.PHONY: init init-dev format check clean
+
 init:
 	@python3 -m venv .venv && \
 	. .venv/bin/activate && \
-	pip install -r requirements.txt
+	pip install -r requirements.txt && \
+	pip install -e .
 
 init-dev: init
 	@. .venv/bin/activate && \
@@ -15,3 +18,12 @@ format:
 check:
 	@. .venv/bin/activate && \
 	ruff check --fix
+
+clean:
+	@rm -f llm_tests_output.txt
+	@rm -f **/.deepeval-cache.json
+
+llm-tests: clean
+	@. .venv/bin/activate && \
+	cd auto_evaluation && \
+	./llm_tests.sh 2>&1 | tee llm_tests_output.txt
diff --git a/evaluation/auto_evaluation/__init__.py b/evaluation/auto_evaluation/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/evaluation/auto_evaluation/content_metrics.json b/evaluation/auto_evaluation/content_metrics.json
deleted file mode 100644
index 274e99ee..00000000
--- a/evaluation/auto_evaluation/content_metrics.json
+++ /dev/null
@@ -1 +0,0 @@
-{"test_cases_lookup_map": {"{\"actual_output\": \"We offer a 30-day full refund at no extra cost.\", \"context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"], \"expected_output\": \"You are eligible for a 30 day full refund at no extra cost.\", \"hyperparameters\": null, \"input\": \"What if these shoes don't fit?\", \"retrieval_context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"]}": {"cached_metrics_data": [{"metric_data": {"name": "Contextual Precision", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because all relevant information was retrieved and ranked appropriately.
Great job!", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context directly answers the input question about what happens if shoes don't fit by stating 'All customers are eligible for a 30 day full refund at no extra cost.'\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Recall", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the generated output perfectly reflects the information provided in node 1 in the retrieval context, regarding the 30-day full refund policy.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"This sentence is a paraphrase of the 1st node in the retrieval context, which states \\\"All customers are eligible for a 30 day full refund at no extra cost.\\\"\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Relevancy", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the retrieval context directly addresses the user's concern about the shoes not fitting by stating that 'All customers are eligible for a 30 day full refund at no extra cost.'", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdicts\": [\n {\n \"statement\": \"All customers are eligible for a 30 day full refund at no extra cost.\",\n \"verdict\": \"yes\",\n \"reason\": null\n }\n ]\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Faithfulness", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the actual output perfectly aligns with the retrieval context, as evidenced by the absence of any contradictions.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Truths (limit=None):\n[\n \"Customers are eligible for a full refund.\",\n \"The refund period lasts 30 days.\",\n \"There is no extra cost for the refund.\"\n] \n \nClaims:\n[\n \"We offer a 30-day full refund at no extra cost.\"\n] \n \nVerdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": null\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Hallucination", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the actual output is fully supported by the provided context and doesn't introduce any contradictory or unsubstantiated information.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The actual output agrees with the provided context. While the context mentions \\\"all customers\\\", the actual output implies the same by stating \\\"we offer\\\" a 30-day full refund at no extra cost. 
This phrasing suggests a general policy applicable to all customers.\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}]}, "{\"actual_output\": \"The capital of France is Paris.\", \"context\": null, \"expected_output\": \"Paris.\", \"hyperparameters\": null, \"input\": \"What is the capital of France?\", \"retrieval_context\": null}": {"cached_metrics_data": [{"metric_data": {"name": "Answer Relevancy", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the response is perfectly relevant, addressing the input directly and completely with no irrelevant information.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Statements:\n[\n \"The capital of France is Paris.\"\n] \n \nVerdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": null\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Bias", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the output demonstrates no discernible bias.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Opinions:\n[] \n \nVerdicts:\n[]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Toxicity", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the output is entirely harmless and positive.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Opinions:\n[] \n \nVerdicts:\n[]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}]}}}
\ No newline at end of file
diff --git a/evaluation/auto_evaluation/dataset/hf_pull.py b/evaluation/auto_evaluation/dataset/hf_pull.py
index c0541cbf..8df9a062 100644
--- a/evaluation/auto_evaluation/dataset/hf_pull.py
+++ b/evaluation/auto_evaluation/dataset/hf_pull.py
@@ -1,7 +1,8 @@
 from huggingface_hub import snapshot_download
 import os
 
-if __name__ == "__main__":
+
+def main():
     cur_dir = os.path.dirname(os.path.abspath(__file__))
     snapshot_download(
         "The-OpenROAD-Project/ORAssistant_Public_Evals",
@@ -13,3 +14,7 @@
             "README.md",
         ],
     )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/evaluation/auto_evaluation/dataset/preprocess.py b/evaluation/auto_evaluation/dataset/preprocess.py
new file mode 100644
index 00000000..3a7c6f9f
--- /dev/null
+++ b/evaluation/auto_evaluation/dataset/preprocess.py
@@ -0,0 +1,60 @@
+import csv
+import json
+from typing import Any
+
+
+def read_data(csv_file: str) -> list[dict]:
+    questions = []
+    with open(csv_file, "r") as f:
+        reader = csv.reader(f)
+        header = next(reader)  # Skip the header row
+        assert len(header) == 2, "CSV file must have exactly 2 columns"
+        for row in reader:
+            questions.append(
+                {"question": row[0].strip(), "ground_truth": row[1].strip()}
+            )
+    return questions
+
+
+def write_data(results_list: list[dict[str, Any]], results_path: str):
+    keys = results_list[0].keys()
+    with open(results_path, "w") as f:
+        writer = csv.writer(f)
+        writer.writerow(list(keys))
+        for result in results_list:
+            writer.writerow([result[key] for key in keys])
+    print(f"Results written to {results_path}")
+
+
+def read_deepeval_cache():
+    metric_scores = {
+        "Contextual Precision": [],
+        "Contextual Recall": [],
+        "Hallucination": [],
+    }
+    metric_passes = {
+        "Contextual Precision": [],
+        "Contextual Recall": [],
+        "Hallucination": [],
+    }
+    with open(".deepeval-cache.json") as f:
+        results = json.load(f)
+        for _, value in results["test_cases_lookup_map"].items():
+            for metric in value["cached_metrics_data"]:
+                metric_scores[metric["metric_data"]["name"]].append(
+                    metric["metric_data"]["score"]
+                )
+                metric_passes[metric["metric_data"]["name"]].append(
+                    metric["metric_data"]["success"]
+                )
+
+    print("Average Metric Scores: ")
+    for key, value in metric_scores.items():
+        print(key, sum(value) / len(value))
+    print("Metric Passrates: ")
+    for key, value in metric_passes.items():
+        print(key, value.count(True) / len(value))
+
+
+if __name__ == "__main__":
+    read_deepeval_cache()
diff --git a/evaluation/auto_evaluation/demo.py b/evaluation/auto_evaluation/demo.py
deleted file mode 100644
index 7b7b909f..00000000
--- a/evaluation/auto_evaluation/demo.py
+++ /dev/null
@@ -1,64 +0,0 @@
-import os
-
-from dotenv import load_dotenv
-from src.models.vertex_ai import GoogleVertexAILangChain
-
-# from src.metrics.geval import make_correctness_metric
-from src.metrics.content import (
-    make_bias_metric,
-    make_toxicity_metric,
-    make_answer_relevancy_metric,
-)
-from src.metrics.retrieval import (
-    make_contextual_precision_metric,
-    make_contextual_recall_metric,
-    make_contextual_relevancy_metric,
-    make_faithfulness_metric,
-    make_hallucination_metric,
-)
-from deepeval.test_case import LLMTestCase
-from deepeval import evaluate
-
-cur_dir = os.path.dirname(__file__)
-root_dir = os.path.join(cur_dir, "../../")
-load_dotenv(os.path.join(root_dir, ".env"))
-
-if __name__ == "__main__":
-    model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002")
-    print("Retrieval metrics")
-    precision, recall, relevancy, faithfulness, hallucination = (
-        make_contextual_precision_metric(model),
-        make_contextual_recall_metric(model),
-        make_contextual_relevancy_metric(model),
-        make_faithfulness_metric(model),
-        make_hallucination_metric(model),
-    )
-
-    test_case = LLMTestCase(
-        input="What if these shoes don't fit?",
-        actual_output="We offer a 30-day full refund at no extra cost.",
-        expected_output="You are eligible for a 30 day full refund at no extra cost.",
-        context=[
-            "All customers are eligible for a 30 day full refund at no extra cost."
-        ],
-        retrieval_context=[
-            "All customers are eligible for a 30 day full refund at no extra cost."
-        ],
-    )
-    evaluate([test_case], [precision, recall, relevancy, faithfulness, hallucination])
-    os.rename(".deepeval-cache.json", "retrieval_metrics.json")
-
-    print("Content metrics")
-    answer_relevancy, bias, toxicity = (
-        make_answer_relevancy_metric(model),
-        make_bias_metric(model),
-        make_toxicity_metric(model),
-    )
-
-    test_case = LLMTestCase(
-        input="What is the capital of France?",
-        actual_output="The capital of France is Paris.",
-        expected_output="Paris.",
-    )
-    evaluate([test_case], [answer_relevancy, bias, toxicity])
-    os.rename(".deepeval-cache.json", "content_metrics.json")
diff --git a/evaluation/auto_evaluation/eval_main.py b/evaluation/auto_evaluation/eval_main.py
new file mode 100644
index 00000000..aac81503
--- /dev/null
+++ b/evaluation/auto_evaluation/eval_main.py
@@ -0,0 +1,144 @@
+"""
+Evaluation script that takes as arguments a dataset and the endpoint of
+the model to evaluate on that dataset.
+"""
+""" + +import argparse +import time +import requests +import os + +from dotenv import load_dotenv +from deepeval.test_case import LLMTestCase +from deepeval import evaluate + +from auto_evaluation.src.models.vertex_ai import GoogleVertexAILangChain +from auto_evaluation.src.metrics.retrieval import ( + make_contextual_precision_metric, + make_contextual_recall_metric, + make_hallucination_metric, +) +from auto_evaluation.dataset import hf_pull, preprocess +from tqdm import tqdm # type: ignore + +eval_root_path = os.path.join(os.path.dirname(__file__), "..") +load_dotenv(dotenv_path=os.path.join(eval_root_path, ".env")) + +# List of all available retrievers +ALL_RETRIEVERS = { + "agent-retriever": "/graphs/agent-retriever", + "agent-retriever-reranker": "/graphs/agent-retriever", + "hybrid": "/graphs/hybrid", + "sim": "/graphs/sim", + "ensemble": "/graphs/ensemble", +} + + +class EvaluationHarness: + # TODO: Use async for EvaluationHarness. + # TODO: Also requires LLM Engine to be async + def __init__(self, base_url: str, dataset: str, reranker_base_url: str = ""): + self.base_url = base_url + self.dataset = dataset + self.reranker_base_url = reranker_base_url + self.qns = preprocess.read_data(self.dataset) + self.eval_model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002") + self.log_dir = "logs" + os.makedirs(self.log_dir, exist_ok=True) + self.sanity_check() + + def sanity_check(self): + if not requests.get(f"{self.base_url}/healthcheck").status_code == 200: + raise ValueError("Endpoint is not running") + if not os.path.exists(self.dataset): + raise ValueError("Dataset path does not exist") + if ( + self.reranker_base_url + and not requests.get(f"{self.reranker_base_url}/healthcheck").status_code + == 200 + ): + raise ValueError("Reranker endpoint is not running") + + def evaluate(self, retriever: str): + retrieval_tcs = [] + response_times = [] + + # metrics + precision, recall, hallucination = ( + make_contextual_precision_metric(self.eval_model), + make_contextual_recall_metric(self.eval_model), + make_hallucination_metric(self.eval_model), + ) + + # retrieval test cases + for i, qa_pair in enumerate(tqdm(self.qns, desc="Evaluating")): + question, ground_truth = qa_pair["question"], qa_pair["ground_truth"] + response, response_time = self.query(retriever, question) + response_text = response["response"] + context = response["context"] + context_list = context[0].split("--------------------------") + + # works for: precision, recall, hallucination + retrieval_tc = LLMTestCase( + input=question, + actual_output=response_text, + expected_output=ground_truth, + context=context_list, + retrieval_context=context_list, + ) + retrieval_tcs.append(retrieval_tc) + response_times.append(response_time) + + # parallel evaluate + evaluate( + retrieval_tcs, + [precision, recall, hallucination], + print_results=False, + ) + + # parse deepeval results + preprocess.read_deepeval_cache() + + def query(self, retriever: str, query: str) -> tuple[dict, float]: + """ + Returns the response json and the time taken to get the response (ms) + """ + endpoint = ALL_RETRIEVERS[retriever] + url = ( + f"{self.base_url}/{endpoint}" + if retriever != "agent-retriever-reranker" + else f"{self.reranker_base_url}/{endpoint}" + ) + payload = {"query": query, "list_context": True, "list_sources": False} + try: + time.sleep(5) + response = requests.post(url, json=payload) + return response.json(), response.elapsed.total_seconds() * 1000 + except Exception as e: + print(f"Error querying {retriever}: {e}") + return 
+            return {
+                "response": "invalid",
+                "sources": [],
+                "context": [],
+                "tool": "string",
+            }, -999999
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Evaluation script")
+    parser.add_argument(
+        "--base_url", type=str, help="Base URL of the model to evaluate"
+    )
+    parser.add_argument(
+        "--reranker_base_url", type=str, help="Base URL of the reranker", default=""
+    )
+    parser.add_argument("--dataset", type=str, help="Path to dataset to evaluate on")
+    parser.add_argument("--retriever", type=str, help="Retriever to evaluate on")
+    args = parser.parse_args()
+
+    # Pull the dataset from huggingface hub
+    hf_pull.main()
+
+    # Evaluate the model on the dataset
+    harness = EvaluationHarness(args.base_url, args.dataset, args.reranker_base_url)
+    harness.evaluate(args.retriever)
diff --git a/evaluation/auto_evaluation/llm_tests.sh b/evaluation/auto_evaluation/llm_tests.sh
new file mode 100755
index 00000000..d44cec1b
--- /dev/null
+++ b/evaluation/auto_evaluation/llm_tests.sh
@@ -0,0 +1,16 @@
+#!/bin/bash -eu
+
+retrievers=(
+    "agent-retriever" \
+)
+
+echo "==================================="
+echo "==> Dataset: EDA Corpus"
+for retriever in "${retrievers[@]}" ; do
+    echo "==> Running tests for $retriever"
+    python eval_main.py \
+        --base_url http://localhost:8000 \
+        --dataset ./dataset/EDA_Corpus_100_Question.csv \
+        --retriever $retriever
+done
+echo "==================================="
diff --git a/evaluation/auto_evaluation/retrieval_metrics.json b/evaluation/auto_evaluation/retrieval_metrics.json
deleted file mode 100644
index 085c26e7..00000000
--- a/evaluation/auto_evaluation/retrieval_metrics.json
+++ /dev/null
@@ -1 +0,0 @@
-{"test_cases_lookup_map": {"{\"actual_output\": \"We offer a 30-day full refund at no extra cost.\", \"context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"], \"expected_output\": \"You are eligible for a 30 day full refund at no extra cost.\", \"hyperparameters\": null, \"input\": \"What if these shoes don't fit?\", \"retrieval_context\": [\"All customers are eligible for a 30 day full refund at no extra cost.\"]}": {"cached_metrics_data": [{"metric_data": {"name": "Contextual Precision", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because all relevant information was retrieved and ranked appropriately.
Great job!", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context directly answers the input question about what happens if shoes don't fit by stating 'All customers are eligible for a 30 day full refund at no extra cost.'\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Recall", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the generated output perfectly reflects the information provided in node 1 in the retrieval context, regarding the 30-day full refund policy.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"This sentence is a paraphrase of the 1st node in the retrieval context, which states \\\"All customers are eligible for a 30 day full refund at no extra cost.\\\"\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Contextual Relevancy", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the retrieval context directly addresses the user's concern about the shoes not fitting by stating that 'All customers are eligible for a 30 day full refund at no extra cost.'", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdicts\": [\n {\n \"statement\": \"All customers are eligible for a 30 day full refund at no extra cost.\",\n \"verdict\": \"yes\",\n \"reason\": null\n }\n ]\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Faithfulness", "threshold": 0.7, "success": true, "score": 1.0, "reason": "The score is 1.00 because the actual output perfectly aligns with the retrieval context, as evidenced by the absence of any contradictions.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Truths (limit=None):\n[\n \"Customers are eligible for a full refund.\",\n \"The refund period lasts 30 days.\",\n \"There is no extra cost for the refund.\"\n] \n \nClaims:\n[\n \"We offer a 30-day full refund at no extra cost.\"\n] \n \nVerdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": null\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}, {"metric_data": {"name": "Hallucination", "threshold": 0.7, "success": true, "score": 0.0, "reason": "The score is 0.00 because the actual output is fully supported by the provided context and doesn't introduce any contradictory or unsubstantiated information.", "strictMode": false, "evaluationModel": "gemini-1.5-pro-002", "evaluationCost": 0, "verboseLogs": "Verdicts:\n[\n {\n \"verdict\": \"yes\",\n \"reason\": \"The actual output agrees with the provided context. While the context mentions \\\"all customers\\\", the actual output implies the same by stating \\\"we offer\\\" a 30-day full refund at no extra cost. 
This phrasing suggests a general policy applicable to all customers.\"\n }\n]"}, "metric_configuration": {"threshold": 0.7, "evaluation_model": "gemini-1.5-pro-002", "strict_mode": false, "include_reason": true}}]}}}
\ No newline at end of file
diff --git a/evaluation/auto_evaluation/src/metrics/retrieval.py b/evaluation/auto_evaluation/src/metrics/retrieval.py
index cd7d286d..fc6470df 100644
--- a/evaluation/auto_evaluation/src/metrics/retrieval.py
+++ b/evaluation/auto_evaluation/src/metrics/retrieval.py
@@ -35,18 +35,14 @@ def make_contextual_recall_metric(model: DeepEvalBaseLLM) -> ContextualRecallMet
 def make_contextual_relevancy_metric(
     model: DeepEvalBaseLLM,
 ) -> ContextualRelevancyMetric:
-    return ContextualRelevancyMetric(
-        threshold=RELEVANCY_THRESHOLD,
-        model=model,
-        include_reason=True,
+    raise NotImplementedError(
+        "ContextualRelevancyMetric is not implemented due to protobuf incompatibility"
     )
 
 
 def make_faithfulness_metric(model: DeepEvalBaseLLM) -> FaithfulnessMetric:
-    return FaithfulnessMetric(
-        threshold=FAITHFULNESS_THRESHOLD,
-        model=model,
-        include_reason=True,
+    raise NotImplementedError(
+        "FaithfulnessMetric is not implemented due to protobuf incompatibility"
     )
diff --git a/evaluation/auto_evaluation/src/models/vertex_ai.py b/evaluation/auto_evaluation/src/models/vertex_ai.py
index 9d72fcbb..31a64748 100644
--- a/evaluation/auto_evaluation/src/models/vertex_ai.py
+++ b/evaluation/auto_evaluation/src/models/vertex_ai.py
@@ -3,10 +3,16 @@
 Custom DeepEvalLLM wrapper.
 """
 
-from typing import Any
+import instructor
 
-from langchain_google_vertexai import ChatVertexAI, HarmBlockThreshold, HarmCategory
+from typing import Any
+from vertexai.generative_models import GenerativeModel, HarmBlockThreshold, HarmCategory  # type: ignore
 from deepeval.models.base_model import DeepEvalBaseLLM
+from pydantic import BaseModel
+
+
+class Response(BaseModel):
+    content: str
 
 
 class GoogleVertexAILangChain(DeepEvalBaseLLM):
@@ -26,17 +32,43 @@ def load_model(self, *args, **kwargs):
             HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
         }
 
-        return ChatVertexAI(
+        return GenerativeModel(
             model_name=self.model_name,
             safety_settings=safety_settings,
         )
 
-    def generate(self, prompt: str) -> Any:
-        return self.model.invoke(prompt).content
+    def generate(self, prompt: str, schema: BaseModel) -> Any:
+        instructor_client = instructor.from_vertexai(
+            client=self.load_model(),
+            mode=instructor.Mode.VERTEXAI_TOOLS,
+        )
+        resp = instructor_client.messages.create(  # type: ignore
+            messages=[
+                {
+                    "role": "user",
+                    "content": prompt,
+                }
+            ],
+            response_model=schema,
+        )
+        return resp
 
-    async def a_generate(self, prompt: str) -> Any:
-        response = await self.model.ainvoke(prompt)
-        return response.content
+    async def a_generate(self, prompt: str, schema: BaseModel) -> Any:
+        instructor_client = instructor.from_vertexai(
+            client=self.load_model(),
+            mode=instructor.Mode.VERTEXAI_TOOLS,
+            _async=True,
+        )
+        resp = await instructor_client.messages.create(  # type: ignore
+            messages=[
+                {
+                    "role": "user",
+                    "content": prompt,
+                }
+            ],
+            response_model=schema,
+        )
+        return resp
 
     def get_model_name(self):
         return self.model_name
@@ -46,7 +78,7 @@ def main():
     model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002")
     prompt = "Write me a joke"
     print(f"Prompt: {prompt}")
-    response = model.generate(prompt)
+    response = model.generate(prompt, schema=Response)
     print(f"Response: {response}")
 
 
@@ -54,10 +86,14 @@ async def main_async():
     model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002")
     prompt = "Write me a joke"
     print(f"Prompt: {prompt}")
-    response = await model.a_generate(prompt)
+    response = await model.a_generate(prompt, Response)
     print(f"Response: {response}")
 
 
 if __name__ == "__main__":
-    main()
-    # asyncio.run(main_async())
+    import asyncio
+    from dotenv import load_dotenv
+
+    load_dotenv()
+    # main()
+    asyncio.run(main_async())
diff --git a/evaluation/pyproject.toml b/evaluation/pyproject.toml
index 6c8e7ebe..013e1111 100644
--- a/evaluation/pyproject.toml
+++ b/evaluation/pyproject.toml
@@ -20,6 +20,9 @@ classifiers = [
 dependencies = { file = ["requirements.txt"] }
 optional-dependencies = { test = { file = ["requirements-test.txt"] } }
 
+[tool.setuptools.packages.find]
+include = ["auto_evaluation", "human_evaluation"]
+
 [tool.mypy]
 python_version = "3.12"
 warn_unused_configs = true
diff --git a/evaluation/requirements.txt b/evaluation/requirements.txt
index 22f269d4..96e6f1ae 100644
--- a/evaluation/requirements.txt
+++ b/evaluation/requirements.txt
@@ -12,3 +12,4 @@ deepeval==1.4.9
 langchain-google-vertexai==2.0.6
 asyncio==3.4.3
 huggingface-hub==0.26.2
+instructor[vertexai]==1.5.2
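
Usage note (editor's sketch, not part of the diff): the commands below mirror the CI job and the new evaluation/Makefile and llm_tests.sh targets introduced above. They assume the backend built by "make docker" is already reachable at http://localhost:8000 and that evaluation/.env carries the same credentials as backend/.env, as the workflow sets up.

    # One-shot run via the new Makefile target: creates .venv, installs the
    # evaluation package in editable mode, runs llm_tests.sh, and tees the
    # report to auto_evaluation/llm_tests_output.txt.
    cd evaluation
    make init
    make llm-tests

    # Or invoke the harness directly for a single retriever, using the same
    # flags as llm_tests.sh.
    cd evaluation
    . .venv/bin/activate
    cd auto_evaluation
    python eval_main.py \
        --base_url http://localhost:8000 \
        --dataset ./dataset/EDA_Corpus_100_Question.csv \
        --retriever agent-retriever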