DeepEval CI integration (#86)
* initial scaffold for deepeval integration + remove unnecessary files

* add llm_tests target and CI

* fix CI syntax, seed env for evaluation

* use python3.12 for default workflow

* fix CI: GH_PAT and list_contexts issue
* add instructor for enforcing JSON LLM outputs in deepeval (see the sketch below)
* silence deepeval outputs
* add JSON deepeval-cache parser
* mark two deepeval metrics as NotImplemented (protobuf error)

* populate env variables in 1 step

* Copy secret.json to correct path

* add sleep to make sure docker comes up and is alive

* evaluate all questions

* cleanup and use average stats

---------

Signed-off-by: Jack Luar <[email protected]>
luarss authored Nov 10, 2024
1 parent b850af4 commit 260bba5
Showing 19 changed files with 326 additions and 97 deletions.
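One of the commit-message bullets above mentions adding instructor to enforce JSON outputs from the evaluation LLM. The repository pairs DeepEval with Gemini (gemini-1.5-pro-002), so the exact wiring there likely differs; the following is only a minimal sketch of the general instructor pattern, using the OpenAI client and a hypothetical Verdict model as illustrative assumptions.

```python
# Hedged sketch: the generic "instructor" pattern for structured LLM output.
# The Verdict model, model name, and OpenAI client are illustrative assumptions;
# this repository actually evaluates against Gemini via DeepEval.
import instructor
from openai import OpenAI
from pydantic import BaseModel


class Verdict(BaseModel):
    verdict: str  # e.g. "yes" / "no"
    reason: str


# Patch the client so responses are parsed and validated into the pydantic model.
client = instructor.from_openai(OpenAI())

result = client.chat.completions.create(
    model="gpt-4o-mini",  # hypothetical model choice for this sketch
    response_model=Verdict,  # instructor enforces the schema and retries on invalid JSON
    messages=[{"role": "user", "content": "Does the answer follow from the context?"}],
)
print(result.verdict, result.reason)
```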
17 changes: 17 additions & 0 deletions .github/workflows/ci.yaml
@@ -11,6 +11,10 @@ jobs:
build-backend-docker:
runs-on: self-hosted
steps:
- name: Setup python
uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Checkout code
uses: actions/checkout@v4
- name: Setup prereqs
@@ -24,10 +28,23 @@ jobs:
cp backend/.env.example backend/.env
sed -i 's|{{GOOGLE_API_KEY}}|${{ secrets.GOOGLE_API_KEY }}|g' backend/.env
sed -i 's|{{PATH_TO_GOOGLE_APPLICATION_CREDENTIALS}}|src/secret.json|g' backend/.env
cp backend/.env evaluation/.env
cp backend/.env frontend/.env
cp ${{ secrets.PATH_TO_GOOGLE_APPLICATION_CREDENTIALS }} backend/src
cp ${{ secrets.PATH_TO_GOOGLE_APPLICATION_CREDENTIALS }} evaluation/auto_evaluation/src
- name: Build Docker image
run: |
make docker
sleep 900 # TODO: Remove this once the docker-compose healthcheck timeout is fixed.
- name: Run LLM CI
working-directory: evaluation
run: |
make llm-tests
- name: Create commit comment
uses: peter-evans/commit-comment@v3
with:
token: ${{ secrets.GH_PAT }}
body-path: evaluation/auto_evaluation/llm_tests_output.txt
- name: Teardown
if: always()
run: |
6 changes: 4 additions & 2 deletions .gitignore
@@ -4,6 +4,7 @@ __pycache__/
backend/data/*
backend/src/*.json
*.pyc
*.egg-info/
frontend/*.json
evaluation/human_evaluation/*.json
/*.json
@@ -21,7 +22,8 @@ documents.txt
.venv

# evaluations
.deepeval_telemtry.txt
**/.deepeval_telemtry.txt
*.csv
*.deepeval-cache.json
**/.deepeval-cache.json
temp_test_run_data.json
**/llm_tests_output.txt
4 changes: 3 additions & 1 deletion Makefile
@@ -1,4 +1,6 @@
FOLDERS=backend frontend
.PHONY: init init-dev format check

FOLDERS=backend frontend evaluation

init:
@for folder in $(FOLDERS); do (cd $$folder && make init && cd ../); done
2 changes: 1 addition & 1 deletion backend/Dockerfile
@@ -28,4 +28,4 @@ RUN python /ORAssistant-backend/src/post_install.py

EXPOSE 8000

CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000"]
CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
2 changes: 1 addition & 1 deletion backend/src/api/routers/graphs.py
@@ -121,7 +121,7 @@ async def get_agent_response(user_input: UserInput) -> ChatResponse:
tool_index = 1
for tool in tools:
urls.extend(list(output[tool_index].values())[0]["urls"])
context.extend(list(set(list(output[tool_index].values())[0]["context"])))
context.append(list(output[tool_index].values())[0]["context"])
tool_index += 1
else:
llm_response = "LLM response extraction failed"
9 changes: 5 additions & 4 deletions backend/src/tools/format_docs.py
@@ -5,7 +5,7 @@

def format_docs(docs: list[Document]) -> tuple[str, list[str], list[str]]:
doc_text = ""
doc_texts = ""
doc_texts = []
doc_urls = []
doc_srcs = []

@@ -19,10 +19,11 @@ def format_docs(docs: list[Document]) -> tuple[str, list[str], list[str]]:
doc_text = f"{gh_discussion_prompt_template}\n\n{doc.page_content}"
else:
doc_text = doc.page_content
doc_texts.append(doc_text)

if "url" in doc.metadata:
doc_urls.append(doc.metadata["url"])

doc_output = "\n\n -------------------------- \n\n".join(doc_texts)

doc_texts += f"\n\n- - - - - - - - - - - - - - - \n\n{doc_text}"

return doc_texts, doc_srcs, doc_urls
return doc_output, doc_srcs, doc_urls
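The format_docs change above collects each document's text into a list and joins it once at the end, and graphs.py now appends that single joined string per tool instead of extending the context with individual chunks. A minimal sketch of the new behaviour, using made-up documents in place of the real langchain Document class:

```python
# Hedged sketch (simplified dicts, not the repo's Document class) of the new
# format_docs behaviour: collect texts in a list, then join once at the end.
fake_docs = [
    {"page_content": "gpl performs global placement ...", "metadata": {"url": "https://example.com/gpl"}},
    {"page_content": "grt performs global routing ...", "metadata": {"url": "https://example.com/grt"}},
]

doc_texts: list[str] = []
doc_urls: list[str] = []
for doc in fake_docs:
    doc_texts.append(doc["page_content"])
    if "url" in doc["metadata"]:
        doc_urls.append(doc["metadata"]["url"])

# One joined string is returned per call, using the same separator as above.
doc_output = "\n\n -------------------------- \n\n".join(doc_texts)

# graphs.py then appends this single joined string per tool to its context list,
# rather than extending it with (de-duplicated) individual chunks.
context: list[str] = [doc_output]
print(doc_output)
print(doc_urls)
```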
14 changes: 13 additions & 1 deletion evaluation/Makefile
@@ -1,7 +1,10 @@
.PHONY: init init-dev format check clean

init:
@python3 -m venv .venv && \
. .venv/bin/activate && \
pip install -r requirements.txt
pip install -r requirements.txt && \
pip install -e .

init-dev: init
@. .venv/bin/activate && \
@@ -15,3 +18,12 @@ format:
check:
@. .venv/bin/activate && \
ruff check --fix

clean:
@rm -f llm_tests_output.txt
@rm -f **/.deepeval-cache.json

llm-tests: clean
@. .venv/bin/activate && \
cd auto_evaluation && \
./llm_tests.sh 2>&1 | tee llm_tests_output.txt
Empty file.
1 change: 0 additions & 1 deletion evaluation/auto_evaluation/content_metrics.json

This file was deleted.

7 changes: 6 additions & 1 deletion evaluation/auto_evaluation/dataset/hf_pull.py
@@ -1,7 +1,8 @@
from huggingface_hub import snapshot_download
import os

if __name__ == "__main__":

def main():
cur_dir = os.path.dirname(os.path.abspath(__file__))
snapshot_download(
"The-OpenROAD-Project/ORAssistant_Public_Evals",
@@ -13,3 +14,7 @@
"README.md",
],
)


if __name__ == "__main__":
main()
60 changes: 60 additions & 0 deletions evaluation/auto_evaluation/dataset/preprocess.py
@@ -0,0 +1,60 @@
import csv
import json
from typing import Any


def read_data(csv_file: str) -> list[dict]:
questions = []
with open(csv_file, "r") as f:
reader = csv.reader(f)
header = next(reader) # Skip the header row
assert len(header) == 2, "CSV file must have exactly 2 columns"
for row in reader:
questions.append(
{"question": row[0].strip(), "ground_truth": row[1].strip()}
)
return questions


def write_data(results_list: list[dict[str, Any]], results_path: str):
keys = results_list[0].keys()
with open(results_path, "w") as f:
writer = csv.writer(f)
writer.writerow(list(keys))
for result in results_list:
writer.writerow([result[key] for key in keys])
print(f"Results written to {results_path}")


def read_deepeval_cache():
metric_scores = {
"Contextual Precision": [],
"Contextual Recall": [],
"Hallucination": [],
}
metric_passes = {
"Contextual Precision": [],
"Contextual Recall": [],
"Hallucination": [],
}
with open(".deepeval-cache.json") as f:
results = json.load(f)
for _, value in results["test_cases_lookup_map"].items():
for metric in value["cached_metrics_data"]:
metric_scores[metric["metric_data"]["name"]].append(
metric["metric_data"]["score"]
)
metric_passes[metric["metric_data"]["name"]].append(
metric["metric_data"]["success"]
)

print("Average Metric Scores: ")
for key, value in metric_scores.items():
print(key, sum(value) / len(value))
print("Metric Passrates: ")
for key, value in metric_passes.items():
print(key, value.count(True) / len(value))


if __name__ == "__main__":
read_deepeval_cache()
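read_deepeval_cache parses DeepEval's .deepeval-cache.json layout as used in the code above. A hedged sketch of a minimal cache-shaped payload with one test case, and the same averaging logic applied to it (the scores below are fabricated purely for illustration):

```python
# Hedged sketch: a minimal .deepeval-cache.json-shaped dict and the same
# per-metric averaging that read_deepeval_cache performs. All scores are made up.
fake_cache = {
    "test_cases_lookup_map": {
        "case-1": {
            "cached_metrics_data": [
                {"metric_data": {"name": "Contextual Precision", "score": 0.8, "success": True}},
                {"metric_data": {"name": "Contextual Recall", "score": 0.9, "success": True}},
                {"metric_data": {"name": "Hallucination", "score": 0.4, "success": False}},
            ]
        }
    }
}

scores: dict[str, list[float]] = {}
passes: dict[str, list[bool]] = {}
for value in fake_cache["test_cases_lookup_map"].values():
    for metric in value["cached_metrics_data"]:
        data = metric["metric_data"]
        scores.setdefault(data["name"], []).append(data["score"])
        passes.setdefault(data["name"], []).append(data["success"])

for name, vals in scores.items():
    print(name, sum(vals) / len(vals))          # average score per metric
for name, flags in passes.items():
    print(name, flags.count(True) / len(flags))  # pass rate per metric
```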
64 changes: 0 additions & 64 deletions evaluation/auto_evaluation/demo.py

This file was deleted.


1 comment on commit 260bba5


@luarss (collaborator, commit author) commented on 260bba5 on Nov 10, 2024


===================================
==> Dataset: EDA Corpus
==> Running tests for agent-retriever
/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/.venv/lib/python3.12/site-packages/deepeval/__init__.py:49: UserWarning: You are using deepeval version 1.4.9, however version 1.5.0 is available. You should consider upgrading via the "pip install --upgrade deepeval" command.
warnings.warn(

Fetching 2 files: 0%| | 0/2 [00:00<?, ?it/s]
Fetching 2 files: 50%|█████ | 1/2 [00:00<00:00, 4.68it/s]
Fetching 2 files: 100%|██████████| 2/2 [00:00<00:00, 6.53it/s]
Fetching 2 files: 100%|██████████| 2/2 [00:00<00:00, 6.16it/s]

Evaluating: 0%| | 0/100 [00:00<?, ?it/s]
[... per-question progress lines elided ...]
Evaluating: 100%|██████████| 100/100 [19:00<00:00, 11.41s/it]
✨ You're running DeepEval's latest Contextual Precision Metric! (using
gemini-1.5-pro-002, strict=False, async_mode=True)...
✨ You're running DeepEval's latest Contextual Recall Metric! (using
gemini-1.5-pro-002, strict=False, async_mode=True)...
✨ You're running DeepEval's latest Hallucination Metric! (using
gemini-1.5-pro-002, strict=False, async_mode=True)...

Evaluating 100 test case(s) in parallel: | | 0% (0/100) [Time Taken: 00:00, ?test case/s]
[... per-case progress lines elided ...]
Evaluating 100 test case(s) in parallel: |██████████|100% (100/100) [Time Taken: 00:31, 3.20test case/s]
✓ Tests finished 🎉! Run 'deepeval login' to save and analyze evaluation results
on Confident AI.
‼️ Friendly reminder 😇: You can also run evaluations with ALL of deepeval's
metrics directly on Confident AI instead.
Average Metric Scores:
Contextual Precision 0.711875
Contextual Recall 0.8343571428571428
Hallucination 0.5478373015873016
Metric Passrates:
Contextual Precision 0.69
Contextual Recall 0.76
Hallucination 0.58
