From 07f707c5a236a5337356907302e024aa6a804b1b Mon Sep 17 00:00:00 2001
From: aasavari
Date: Fri, 18 Oct 2024 03:16:11 +0000
Subject: [PATCH 01/12] small fix for ragas.py

Signed-off-by: aasavari
---
 evals/metrics/ragas/ragas.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py
index c80ff94e..a2598773 100644
--- a/evals/metrics/ragas/ragas.py
+++ b/evals/metrics/ragas/ragas.py
@@ -123,7 +123,7 @@ def measure(self, test_case: Dict):
                 else:
                     if metric == "AnswerRelevancy" and self.embeddings is None:
                         raise ValueError("AnswerRelevancy metric need provide embeddings model.")
-                    tmp_metrics.append(self.metrics_instance[metric])
+                    tmp_metrics.append(self.metrics_instances[metric])
             self.metrics = tmp_metrics
         else:
             self.metrics = list(self.metric_instances.values())

From 2de5335a78fa2232c6d8016faabb41037f2688b1 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 18 Oct 2024 03:14:51 +0000
Subject: [PATCH 02/12] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 evals/benchmark/benchmark.py | 2 +-
 evals/benchmark/utils.py     | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/evals/benchmark/benchmark.py b/evals/benchmark/benchmark.py
index 8ce76c83..ccb73a3c 100644
--- a/evals/benchmark/benchmark.py
+++ b/evals/benchmark/benchmark.py
@@ -248,7 +248,7 @@ def run_service_test(example, service_type, service, test_suite_config):
         deployment_type,
         test_suite_config.get("service_ip"),
         test_suite_config.get("service_port"),
-        test_suite_config.get("namespace")
+        test_suite_config.get("namespace"),
     )

     base_url = f"http://{svc_ip}:{port}"

diff --git a/evals/benchmark/utils.py b/evals/benchmark/utils.py
index 2256f0c3..d66212a0 100644
--- a/evals/benchmark/utils.py
+++ b/evals/benchmark/utils.py
@@ -29,8 +29,10 @@ def write_json(data, filename):
         logging.error(f"Failed to write {filename}: {e}")
         return False

+
 from kubernetes import client, config

+
 def get_service_cluster_ip(service_name, namespace="default"):
     # Load the Kubernetes configuration
     config.load_kube_config()  # or use config.load_incluster_config() if running inside a Kubernetes pod

From 236a2cacac9d31e302ed72b4d21a6e34dd39c009 Mon Sep 17 00:00:00 2001
From: aasavari
Date: Fri, 25 Oct 2024 01:43:57 +0000
Subject: [PATCH 03/12] fixed error when metrics arg is used

Signed-off-by: aasavari
---
 evals/metrics/ragas/ragas.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py
index a2598773..c31cb632 100644
--- a/evals/metrics/ragas/ragas.py
+++ b/evals/metrics/ragas/ragas.py
@@ -123,7 +123,7 @@ def measure(self, test_case: Dict):
                 else:
                     if metric == "AnswerRelevancy" and self.embeddings is None:
                         raise ValueError("AnswerRelevancy metric need provide embeddings model.")
-                    tmp_metrics.append(self.metrics_instances[metric])
+                    tmp_metrics.append(self.metric_instances[metric])
             self.metrics = tmp_metrics
         else:
             self.metrics = list(self.metric_instances.values())

From a07392ddc1e49782231d8ecf345cbd0610671754 Mon Sep 17 00:00:00 2001
From: aasavari
Date: Fri, 25 Oct 2024 04:06:57 +0000
Subject: [PATCH 04/12] updated README

Signed-off-by: aasavari
---
 evals/metrics/ragaaf/README.md | 101 +++++++++++++++++++--------------
 1 file changed, 59 insertions(+), 42 deletions(-)

diff --git a/evals/metrics/ragaaf/README.md b/evals/metrics/ragaaf/README.md
index ac8d3b85..259e9458 100644
--- a/evals/metrics/ragaaf/README.md
+++ b/evals/metrics/ragaaf/README.md
@@ -1,61 +1,65 @@
 # RAGAAF (RAG assessment - Annotation Free)

-We introduce - RAGAAF, Intel's easy-to-use, flexible, opensource and annotation-free RAG evaluation tool using LLM-as-a-judge while benefitting from Intel's Gaudi2 AI accelator chips.
+We introduce - RAGAAF, Intel's easy-to-use, flexible, opensource and annotation-free RAG evaluation tool employing LLM-as-a-judge technique on Intel's Gaudi2 AI accelator chips. RAGAAF is best suited for Long Form Question Answering (LFQA) datasets where you want to gauge quality and effectiveness of the answer using LLM's intelligence.

-## Overview
-### Data
-RAGAAF is best suited for Long Form Question Answering (LFQA) datasets where you want to gauge quality and factualness of the answer via LLM's intelligence. Here, you can use benchmarking datasets or bring your own custom datasets. Please make sure to set `field_map` to map AutoEval fields such as "question" to your dataset's corresponding field like "query".
-> Note : To use benchmarking datasets, set argument `data_mode=benchmarking`. Similarly, to use custom datasets, set `data_mode=local`.
-### Model
-AutoEval can run in 3 evaluation modes -
-1. `evaluation_mode="endpoint"` uses HuggingFace endpoint.
-- We recommend launching a HuggingFace endpoint on Gaudi AI accelerator machines to ensure maximum usage and performance.
-- To launch HF endpoint on Gaudi2, please follow the 2-step instructions here - [tgi-gaudi](https://github.com/huggingface/tgi-gaudi).
-- Pass your endpoint url as `model_name` argument.
-2. `evaluation_mode="openai"` uses openai backend.
-- Please set your `openai_key` and your choice of model as `model_name` argument.
-3. `evaluation_mode="local"` uses your local hardware.
-- Set `hf_token` argument and set your favourite open-source model in `model_name` argument.
-- GPU usage will be prioritized after checking it's availability. If GPU is unavailable, the model will run on CPU.
-## Metrics
-AutoEval provides 4 metrics - factualness, correctness, relevance and readability. You can also bring your own metrics and grading scales. Don't forget to add your metric to `evaluation_metrics` argument.
-## Generation configuration
-We provide recommended generation parameters after experimenting with different LLMs. If you'd like to edit them to your requirement, please set generation parameters in `GENERATION_CONFIG` in `run_eval.py`.
-## Run using HF endpoint
-```python3
-# step 1 : choose your dataset -- local or benchmarking
-dataset = "explodinggradients/ragas-wikiqa"
-data_mode = "benchmarking"
-field_map = {"question": "question", "answer": "generated_with_rag", "context": "context"}
-# step 2 - choose your favourite LLM and hardware
-# evaluation_mode = "openai"
-# model_name = "gpt-4o"
-# openai_key = ""
+## Run RAGAAF
-# evaluation_mode = "endpoint"
-# model_name = f"http://{host_ip}:{port}"
+### Data
+We provide 3 modes for data loading - `benchmarking`, `unit` and `local` to support benchmarking datasets, unit test cases and your custom datasets.
-evaluation_mode = "local"
-model_name = "meta-llama/Llama-3.2-1B-Instruct"
-hf_token = ""
+Let us see how to load a unit test case.
+```python3
+# load your dataset
+dataset = "unit_data" # name of the dataset
+data_mode = "unit" # mode for data loading
+field_map = {
+    "question": "question",
+    "answer": "actual_output",
+    "context": "contexts"
+    } # map your data field such as "actual_output" to RAGAAF field "answer"
-# step 3 - choose metrics of your choice, you can also add custom metrics
+
+# your desired unit test case
+question = "What if these shoes don't fit?"
+actual_output = "We offer a 30-day full refund at no extra cost."
+contexts = [
+    "All customers are eligible for a 30 day full refund at no extra cost.",
+    "We can only process full refund upto 30 day after the purchase.",
+]
+examples = [{"question": question, "actual_output": actual_output, "contexts": contexts}]
+```
+### Launch endpoint on Gaudi
+Please launch an endpoint on Gaudi2 using the most popular LLMs such as `mistralai/Mixtral-8x7B-Instruct-v0.1` by following the 2 step instructions here - [tgi-gaudi](https://github.com/huggingface/tgi-gaudi).
+### Model
+We provide 3 evaluation modes - `endpoint`, `local` (supports CPU and GPU), `openai`.
+```python3
+# choose your favourite LLM and hardware
+host_ip = os.getenv("host_ip", "localhost")
+port = os.getenv("port", "")
+evaluation_mode = "endpoint"
+model_name = f"http://{host_ip}:{port}"
+```
+> `local` evaluation mode uses your local hardware (GPU usage is prioritized over CPU when available). Don't forget to set `hf_token` argument and your favourite open-source model in `model_name` argument.
+> `openai` evaluation mode uses openai backend. Please set your `openai_key` as argument and your choice of OpenAI model as `model_name` argument.
+### Metrics
+```python3
+# choose metrics of your choice, you can also add custom metrics
 evaluation_metrics = ["factualness", "relevance", "correctness", "readability"]
+```
+### Evaluation
+```python3
+from evals.metrics.ragaaf import AnnotationFreeEvaluate
-# step 4 - run evaluation
 evaluator = AnnotationFreeEvaluate(
     dataset=dataset,
+    examples=examples,
     data_mode=data_mode,
     field_map=field_map,
     evaluation_mode=evaluation_mode,
     model_name=model_name,
     evaluation_metrics=evaluation_metrics,
     # openai_key=openai_key,
-    hf_token=hf_token,
-    debug_mode=True,
+    # hf_token=hf_token,
 )

 responses = evaluator.measure()

 for response in responses:
     print(response)
 ```
-That's it! For troubleshooting, please submit an issue and we will get right on it.
+## Customizations
+1. If you'd like to change generation parameters, please see in `GENERATION_CONFIG` in `run_eval.py`.
+2. If you'd like to add a new metric, please mimic an existing metric, e.g., `./prompt_templates/correctness.py`
+```python3
+class MetricName:
+    name = "metric_name"
+    required_columns = ["answer", "context", "question"] # the fields your metric needs
+    template = """- : measures .
+    - Score 1: .
+    - Score 2: .
+    - Score 3: .
+    - Score 4: .
+    - Score 5: ."""
+```
\ No newline at end of file

From 2c06ea64ee1e2cf401c508dc871ec0b93988e2dd Mon Sep 17 00:00:00 2001
From: aasavari
Date: Fri, 25 Oct 2024 05:06:55 +0000
Subject: [PATCH 05/12] added key features

Signed-off-by: aasavari
---
 evals/metrics/ragaaf/README.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/evals/metrics/ragaaf/README.md b/evals/metrics/ragaaf/README.md
index 259e9458..a913bd5e 100644
--- a/evals/metrics/ragaaf/README.md
+++ b/evals/metrics/ragaaf/README.md
@@ -1,7 +1,13 @@
 # RAGAAF (RAG assessment - Annotation Free)

-We introduce - RAGAAF, Intel's easy-to-use, flexible, opensource and annotation-free RAG evaluation tool employing LLM-as-a-judge technique on Intel's Gaudi2 AI accelator chips. RAGAAF is best suited for Long Form Question Answering (LFQA) datasets where you want to gauge quality and effectiveness of the answer using LLM's intelligence.
+Intel's RAGAAF toolkit employs opensource LLM-as-a-judge technique on Intel's Gaudi2 AI accelator chips to perform annotation-free evaluation of RAG.
+## Key features
+✨ Annotation Free evaluation (ground truth answers are not required).
+🧠 Provides score and reasoning for each metric allowing a deep dive into LLM's throught process.
+🤗 Quick access to latest innovations in opensource Large Language Models.
+⏩ Seamlessly boost performance using Intel's powerful AI accelerator chips - Gaudi.
+✍️ Flexibility to bring your own metrics, grading rubrics and datasets.

 ## Run RAGAAF

From c0937d4f3851ed11fe90a20cf7b2a88e782004fd Mon Sep 17 00:00:00 2001
From: aasavari
Date: Fri, 25 Oct 2024 05:08:09 +0000
Subject: [PATCH 06/12] edited formatting

Signed-off-by: aasavari
---
 evals/metrics/ragaaf/README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/evals/metrics/ragaaf/README.md b/evals/metrics/ragaaf/README.md
index a913bd5e..c748fc5c 100644
--- a/evals/metrics/ragaaf/README.md
+++ b/evals/metrics/ragaaf/README.md
@@ -3,10 +3,10 @@ Intel's RAGAAF toolkit employs opensource LLM-as-a-judge technique on Intel's Gaudi2 AI accelator chips to perform annotation-free evaluation of RAG.
 ## Key features
-✨ Annotation Free evaluation (ground truth answers are not required).
-🧠 Provides score and reasoning for each metric allowing a deep dive into LLM's throught process.
-🤗 Quick access to latest innovations in opensource Large Language Models.
-⏩ Seamlessly boost performance using Intel's powerful AI accelerator chips - Gaudi.
+✨ Annotation Free evaluation (ground truth answers are not required).
+🧠 Provides score and reasoning for each metric allowing a deep dive into LLM's throught process.
+🤗 Quick access to latest innovations in opensource Large Language Models.
+⏩ Seamlessly boost performance using Intel's powerful AI accelerator chips - Gaudi.
 ✍️ Flexibility to bring your own metrics, grading rubrics and datasets.

 ## Run RAGAAF

From c6370029ab9762b2fee95d21e345dd19d71cdaf3 Mon Sep 17 00:00:00 2001
From: aasavari
Date: Fri, 25 Oct 2024 05:10:36 +0000
Subject: [PATCH 07/12] improved readability

Signed-off-by: aasavari
---
 evals/metrics/ragaaf/README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/evals/metrics/ragaaf/README.md b/evals/metrics/ragaaf/README.md
index c748fc5c..4ad0c042 100644
--- a/evals/metrics/ragaaf/README.md
+++ b/evals/metrics/ragaaf/README.md
@@ -11,7 +11,7 @@ Intel's RAGAAF toolkit employs opensource LLM-as-a-judge technique on Intel's Ga
 ## Run RAGAAF

-### Data
+### 1. Data
 We provide 3 modes for data loading - `benchmarking`, `unit` and `local` to support benchmarking datasets, unit test cases and your custom datasets.

 Let us see how to load a unit test case.
@@ -34,9 +34,9 @@ contexts = [
 ]
 examples = [{"question": question, "actual_output": actual_output, "contexts": contexts}]
 ```
-### Launch endpoint on Gaudi
+### 2. Launch endpoint on Gaudi
 Please launch an endpoint on Gaudi2 using the most popular LLMs such as `mistralai/Mixtral-8x7B-Instruct-v0.1` by following the 2 step instructions here - [tgi-gaudi](https://github.com/huggingface/tgi-gaudi).
-### Model
+### 3. Model
 We provide 3 evaluation modes - `endpoint`, `local` (supports CPU and GPU), `openai`.
 ```python3
 # choose your favourite LLM and hardware
@@ -47,12 +47,12 @@ model_name = f"http://{host_ip}:{port}"
 ```
 > `local` evaluation mode uses your local hardware (GPU usage is prioritized over CPU when available). Don't forget to set `hf_token` argument and your favourite open-source model in `model_name` argument.
 > `openai` evaluation mode uses openai backend. Please set your `openai_key` as argument and your choice of OpenAI model as `model_name` argument.
-### Metrics
+### 4. Metrics
 ```python3
 # choose metrics of your choice, you can also add custom metrics
 evaluation_metrics = ["factualness", "relevance", "correctness", "readability"]
 ```
-### Evaluation
+### 5. Evaluation
 ```python3
 from evals.metrics.ragaaf import AnnotationFreeEvaluate

From 739f75a7c0ff45e85d0c660ccf45c47877794fd5 Mon Sep 17 00:00:00 2001
From: aasavari
Date: Fri, 25 Oct 2024 05:13:10 +0000
Subject: [PATCH 08/12] improved note in model section

Signed-off-by: aasavari
---
 evals/metrics/ragaaf/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evals/metrics/ragaaf/README.md b/evals/metrics/ragaaf/README.md
index 4ad0c042..71dab125 100644
--- a/evals/metrics/ragaaf/README.md
+++ b/evals/metrics/ragaaf/README.md
@@ -45,7 +45,7 @@ port = os.getenv("port", "")
 evaluation_mode = "endpoint"
 model_name = f"http://{host_ip}:{port}"
 ```
-> `local` evaluation mode uses your local hardware (GPU usage is prioritized over CPU when available). Don't forget to set `hf_token` argument and your favourite open-source model in `model_name` argument.
+> `local` evaluation mode uses your local hardware (GPU usage is prioritized over CPU when available). Don't forget to set `hf_token` argument and your favourite open-source model in `model_name` argument.
 > `openai` evaluation mode uses openai backend. Please set your `openai_key` as argument and your choice of OpenAI model as `model_name` argument.
 ### 4. Metrics

From a52a39c24ce858c699252d1939b14ac91d806298 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 25 Oct 2024 20:39:36 +0000
Subject: [PATCH 09/12] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 evals/metrics/ragaaf/README.md | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/evals/metrics/ragaaf/README.md b/evals/metrics/ragaaf/README.md
index 71dab125..3aee2b8f 100644
--- a/evals/metrics/ragaaf/README.md
+++ b/evals/metrics/ragaaf/README.md
@@ -17,13 +17,13 @@ We provide 3 modes for data loading - `benchmarking`, `unit` and `local` to supp
 Let us see how to load a unit test case.
 ```python3
 # load your dataset
-dataset = "unit_data" # name of the dataset
-data_mode = "unit" # mode for data loading
+dataset = "unit_data"  # name of the dataset
+data_mode = "unit"  # mode for data loading
 field_map = {
-    "question": "question",
-    "answer": "actual_output",
-    "context": "contexts"
-    } # map your data field such as "actual_output" to RAGAAF field "answer"
+    "question": "question",
+    "answer": "actual_output",
+    "context": "contexts",
+} # map your data field such as "actual_output" to RAGAAF field "answer"

 # your desired unit test case
 question = "What if these shoes don't fit?"
@@ -43,13 +43,13 @@ We provide 3 evaluation modes - `endpoint`, `local` (supports CPU and GPU), `ope
 host_ip = os.getenv("host_ip", "localhost")
 port = os.getenv("port", "")
 evaluation_mode = "endpoint"
-model_name = f"http://{host_ip}:{port}"
+model_name = f"http://{host_ip}:{port}"
 ```
 > `local` evaluation mode uses your local hardware (GPU usage is prioritized over CPU when available). Don't forget to set `hf_token` argument and your favourite open-source model in `model_name` argument.
 > `openai` evaluation mode uses openai backend. Please set your `openai_key` as argument and your choice of OpenAI model as `model_name` argument.
 ### 4. Metrics
 ```python3
-# choose metrics of your choice, you can also add custom metrics
+# choose metrics of your choice, you can also add custom metrics
 evaluation_metrics = ["factualness", "relevance", "correctness", "readability"]
 ```
@@ -79,11 +79,11 @@ for response in responses:
 ```python3
 class MetricName:
     name = "metric_name"
-    required_columns = ["answer", "context", "question"] # the fields your metric needs
+    required_columns = ["answer", "context", "question"]  # the fields your metric needs
     template = """- : measures .
     - Score 1: .
     - Score 2: .
     - Score 3: .
     - Score 4: .
     - Score 5: ."""
-```
\ No newline at end of file
+```

From 05dd8d9553a03272cd5ce53d415da62ac420110d Mon Sep 17 00:00:00 2001
From: aasavari
Date: Fri, 25 Oct 2024 21:09:13 +0000
Subject: [PATCH 10/12] spell check

Signed-off-by: aasavari
---
 evals/metrics/ragaaf/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evals/metrics/ragaaf/README.md b/evals/metrics/ragaaf/README.md
index 3aee2b8f..9c498236 100644
--- a/evals/metrics/ragaaf/README.md
+++ b/evals/metrics/ragaaf/README.md
@@ -4,7 +4,7 @@ Intel's RAGAAF toolkit employs opensource LLM-as-a-judge technique on Intel's Ga
 ## Key features
 ✨ Annotation Free evaluation (ground truth answers are not required).
-🧠 Provides score and reasoning for each metric allowing a deep dive into LLM's throught process.
+🧠 Provides score and reasoning for each metric allowing a deep dive into LLM's thought process.
🤗 Quick access to latest innovations in opensource Large Language Models.
⏩ Seamlessly boost performance using Intel's powerful AI accelerator chips - Gaudi.
 ✍️ Flexibility to bring your own metrics, grading rubrics and datasets.

From 30b3c68119fb0429e8a97b044f10f18c6100a62a Mon Sep 17 00:00:00 2001
From: aasavari
Date: Fri, 25 Oct 2024 22:15:24 +0000
Subject: [PATCH 11/12] adding context relevance metric to RAGAAF

Signed-off-by: aasavari
---
 .../ragaaf/prompt_templates/context_relevance.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 evals/metrics/ragaaf/prompt_templates/context_relevance.py

diff --git a/evals/metrics/ragaaf/prompt_templates/context_relevance.py b/evals/metrics/ragaaf/prompt_templates/context_relevance.py
new file mode 100644
index 00000000..78cec232
--- /dev/null
+++ b/evals/metrics/ragaaf/prompt_templates/context_relevance.py
@@ -0,0 +1,12 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+class ContextRelevance:
+    name = "context_relevance"
+    required_columns = ["question", "context"]
+    template = """- Context Relevance: Context Relevance measures how well the context relates to the question.
+    - Score 1: The context doesn't mention anything about the question or is completely irrelevant to the question.
+    - Score 2: The context only identifies the domain (e.g. cnvrg) mentioned in the question and provides information from the correct domain. But, the context does not address the question itself and the point of the question is completely missed by it.
+    - Score 3: The context correctly identifies the domain and essence of the question but the details in the context are not relevant to the focus of the question.
+    - Score 4: The context correctly identifies domain mentioned the question and essence of the question as well as stays consistent with both of them. But there is some part of the context that is not relevant to the question or it's topic or it's essence. This irrelevant part is damaging the overall relevance of the context.
+    - Score 5: The context is completely relevant to the question and the details do not deviate from the essence of the question. There are no parts of the context that are irrelevant or unnecessary for the given question."""

From efab2c0c07db73d23cf1ef3c0ed624aed660f436 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 31 Oct 2024 21:49:54 +0000
Subject: [PATCH 12/12] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 evals/metrics/ragaaf/prompt_templates/context_relevance.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/evals/metrics/ragaaf/prompt_templates/context_relevance.py b/evals/metrics/ragaaf/prompt_templates/context_relevance.py
index 78cec232..4cb2976b 100644
--- a/evals/metrics/ragaaf/prompt_templates/context_relevance.py
+++ b/evals/metrics/ragaaf/prompt_templates/context_relevance.py
@@ -1,6 +1,7 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0

+
 class ContextRelevance:
     name = "context_relevance"
     required_columns = ["question", "context"]
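Returning to PATCH 02: the hunk in `evals/benchmark/utils.py` ends right after `config.load_kube_config()`, so the rest of `get_service_cluster_ip` is not visible in this series. The sketch below is a hypothetical completion for illustration only, using only standard `kubernetes` client calls; it is not the repository's actual implementation. PATCH 02's `benchmark.py` context builds `base_url = f"http://{svc_ip}:{port}"`, so returning both an IP and a port fits that call site.

```python3
# Hypothetical completion of get_service_cluster_ip, for illustration only;
# the real body in evals/benchmark/utils.py is not shown in this patch series.
from kubernetes import client, config


def get_service_cluster_ip(service_name, namespace="default"):
    # Load kubeconfig (or use config.load_incluster_config() inside a pod)
    config.load_kube_config()

    v1 = client.CoreV1Api()
    service = v1.read_namespaced_service(name=service_name, namespace=namespace)

    # ClusterIP of the service and its first exposed port, if any
    cluster_ip = service.spec.cluster_ip
    port = service.spec.ports[0].port if service.spec.ports else None
    return cluster_ip, port
```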
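Taken together, the README rewritten in PATCH 04 and the `context_relevance` template added in PATCH 11 suggest the following end-to-end run. This is a minimal sketch, not part of the patch series: it assumes the `evals` package is installed, that RAGAAF picks up templates in `prompt_templates/` by their `name` attribute, and that a TGI-Gaudi endpoint is already serving the judge model.

```python3
# Illustrative only: mirrors the unit-mode example from the README in PATCH 04,
# with the metric introduced in PATCH 11 added to the metric list.
import os

from evals.metrics.ragaaf import AnnotationFreeEvaluate

# unit test case, reused from the README
question = "What if these shoes don't fit?"
actual_output = "We offer a 30-day full refund at no extra cost."
contexts = [
    "All customers are eligible for a 30 day full refund at no extra cost.",
    "We can only process full refund upto 30 day after the purchase.",
]
examples = [{"question": question, "actual_output": actual_output, "contexts": contexts}]

# judge-model endpoint, as in the README's endpoint mode
host_ip = os.getenv("host_ip", "localhost")
port = os.getenv("port", "")

evaluator = AnnotationFreeEvaluate(
    dataset="unit_data",
    examples=examples,
    data_mode="unit",
    field_map={"question": "question", "answer": "actual_output", "context": "contexts"},
    evaluation_mode="endpoint",
    model_name=f"http://{host_ip}:{port}",
    # the four built-in metrics plus the new one from PATCH 11
    evaluation_metrics=["factualness", "relevance", "correctness", "readability", "context_relevance"],
)

responses = evaluator.measure()
for response in responses:
    print(response)  # score and reasoning per metric, per example
```

If the loader keys metrics off the template's `name` field, dropping the new file into `prompt_templates/` should be all that is needed; otherwise "context_relevance" may also have to be registered wherever the four built-in metric names are enumerated.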