From 5d7475751a238479cd6559a7d5757034ad13d60e Mon Sep 17 00:00:00 2001 From: Vlad Adumitracesei Date: Fri, 7 Jun 2024 19:45:08 +0300 Subject: [PATCH 1/5] Added evaluation with ragas --- course/module-3/insert_data_mongo.py | 12 ++++++------ course/module-5/evaluation/rag.py | 23 ++++++++++++++++++++++- course/module-5/inference_pipeline.py | 7 ++++++- course/module-5/settings.py | 2 +- 4 files changed, 35 insertions(+), 9 deletions(-) diff --git a/course/module-3/insert_data_mongo.py b/course/module-3/insert_data_mongo.py index 7c5ed18..3bed867 100644 --- a/course/module-3/insert_data_mongo.py +++ b/course/module-3/insert_data_mongo.py @@ -43,12 +43,12 @@ def download_dataset(output_dir: Path = Path("data")) -> list: "type": "post", "author_id": "2", }, - { - "file_name": "repositories_paul_iusztin.json", - "file_id": "1tSWrlj_u85twAqVus-l0mzqgYVV6WHVz", - "type": "repository", - "author_id": "2", - }, + # { + # "file_name": "repositories_paul_iusztin.json", + # "file_id": "1tSWrlj_u85twAqVus-l0mzqgYVV6WHVz", + # "type": "repository", + # "author_id": "2", + # }, ] for file in files: file["file_path"] = str(output_dir / file["file_name"]) diff --git a/course/module-5/evaluation/rag.py b/course/module-5/evaluation/rag.py index 5e53c86..760ec4a 100644 --- a/course/module-5/evaluation/rag.py +++ b/course/module-5/evaluation/rag.py @@ -1,7 +1,14 @@ import llm_components.prompt_templates as templates -from langchain_openai import ChatOpenAI from llm_components.chain import GeneralChain + +from langchain_openai import ChatOpenAI +from pandas import DataFrame + from settings import settings +from datasets import Dataset + +from ragas import evaluate +from ragas.metrics import context_precision, context_relevancy, context_recall def evaluate(query: str, context: list[str], output: str) -> str: @@ -16,3 +23,17 @@ def evaluate(query: str, context: list[str], output: str) -> str: response = chain.invoke({"query": query, "context": context, "output": output}) return response["rag_eval"] + + +def evaluate_with_ragas(query: str, context: list[str], output: str) -> DataFrame: + + data_sample = { + "question": query, + "answer": output, + "context": context + } + + dataset = Dataset.from_dict(data_sample) + score = evaluate(dataset=dataset, metrics=[context_precision, context_relevancy, context_recall]) + + return score.to_pandas() \ No newline at end of file diff --git a/course/module-5/inference_pipeline.py b/course/module-5/inference_pipeline.py index d010744..c1d0b1d 100644 --- a/course/module-5/inference_pipeline.py +++ b/course/module-5/inference_pipeline.py @@ -1,9 +1,11 @@ import pandas as pd from evaluation import evaluate_llm +from evaluation.rag import evaluate_with_ragas from llm_components.prompt_templates import InferenceTemplate from monitoring import PromptMonitoringManager from qwak_inference import RealTimeClient from rag.retriever import VectorRetriever + from settings import settings @@ -45,7 +47,10 @@ def generate( answer = response[0]["content"][0] if enable_evaluation is True: - evaluation_result = evaluate_llm(query=query, output=answer) + evaluation_result = { + 'llm_evaluation': evaluate_llm(query=query, output=answer), + 'rag_evaluation': evaluate_with_ragas(query=query, output=answer, context=context) if enable_rag else None + } else: evaluation_result = None diff --git a/course/module-5/settings.py b/course/module-5/settings.py index de3e5da..7da63dc 100644 --- a/course/module-5/settings.py +++ b/course/module-5/settings.py @@ -20,7 +20,7 @@ class AppSettings(BaseSettings): 
QDRANT_DATABASE_URL: str = "http://localhost:6333" QDRANT_CLOUD_URL: str = "str" - USE_QDRANT_CLOUD: bool = True + USE_QDRANT_CLOUD: bool = False QDRANT_APIKEY: str | None = None # MQ config From db3a36269ad6c3322c001bd5fcface35ae596483 Mon Sep 17 00:00:00 2001 From: razvantalex Date: Sat, 8 Jun 2024 16:15:49 +0300 Subject: [PATCH 2/5] fix: Added ragas@0.1.9 as req --- course/module-5/pyproject.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/course/module-5/pyproject.toml b/course/module-5/pyproject.toml index d1a919d..f455984 100644 --- a/course/module-5/pyproject.toml +++ b/course/module-5/pyproject.toml @@ -7,7 +7,6 @@ authors = [ "Paul Iusztin ", "Alex Vesa ", ] -package-mode = false readme = "README.md" @@ -39,7 +38,7 @@ datasets = "^2.19.1" peft = "^0.11.1" bitsandbytes = "^0.43.1" qwak-inference = "^0.1.17" - +ragas= "^0.1.9" [build-system] requires = ["poetry-core"] From c2d77b9963a26b158349ee292c1ae3056e0e7acf Mon Sep 17 00:00:00 2001 From: razvantalex Date: Sat, 8 Jun 2024 20:56:15 +0300 Subject: [PATCH 3/5] feat: Added ragas eval + comet-llm chain logging w metadata and timings workaround --- course/module-5/evaluation/llm.py | 1 - course/module-5/evaluation/rag.py | 68 +++++++++++++++---- course/module-5/inference_pipeline.py | 47 +++++++++---- course/module-5/main.py | 6 +- .../module-5/monitoring/prompt_monitoring.py | 59 ++++++++++++++-- 5 files changed, 143 insertions(+), 38 deletions(-) diff --git a/course/module-5/evaluation/llm.py b/course/module-5/evaluation/llm.py index d4ca16a..3524cbe 100644 --- a/course/module-5/evaluation/llm.py +++ b/course/module-5/evaluation/llm.py @@ -1,6 +1,5 @@ from langchain_openai import ChatOpenAI from llm_components.chain import GeneralChain -from llm_components.chain import GeneralChain from llm_components.prompt_templates import LLMEvaluationTemplate from settings import settings diff --git a/course/module-5/evaluation/rag.py b/course/module-5/evaluation/rag.py index 760ec4a..4e3846d 100644 --- a/course/module-5/evaluation/rag.py +++ b/course/module-5/evaluation/rag.py @@ -1,21 +1,46 @@ import llm_components.prompt_templates as templates -from llm_components.chain import GeneralChain - +from datasets import Dataset from langchain_openai import ChatOpenAI +from llm_components.chain import GeneralChain from pandas import DataFrame - +from ragas import evaluate +from ragas.embeddings import HuggingfaceEmbeddings +from ragas.metrics import ( + answer_correctness, + answer_similarity, + context_entity_recall, + context_recall, + context_relevancy, + context_utilization, +) from settings import settings -from datasets import Dataset -from ragas import evaluate -from ragas.metrics import context_precision, context_relevancy, context_recall +# Evaluating against the following metrics +# RETRIEVAL BASED +# 1. Context Utilization - How well the context is utilized +# 2. Context Relevancy - (VDB based) measures the relevance of retrieved context +# 3. Context Recall - How well the context is recalled in the answer +# 4. Context Entity Recall - a measure of what fraction of entities are recalled from ground_truths + +# END-TO-END +# 5. Answer Similarity - measures the semantic resemblance between the answer and gt answer +# 6. 
Answer Corectness - measures the correctness of the answer compared to gt + +METRICS = [ + context_utilization, + context_relevancy, + context_recall, + answer_similarity, + context_entity_recall, + answer_correctness, +] -def evaluate(query: str, context: list[str], output: str) -> str: +def evaluate_w_template(query: str, context: list[str], output: str) -> str: evaluation_template = templates.RAGEvaluationTemplate() prompt_template = evaluation_template.create_template() - model = ChatOpenAI(model=settings.OPENAI_MODEL_ID) + model = ChatOpenAI(model=settings.OPENAI_MODEL_ID, api_key=settings.OPENAI_API_KEY) chain = GeneralChain.get_chain( llm=model, output_key="rag_eval", template=prompt_template ) @@ -25,15 +50,28 @@ def evaluate(query: str, context: list[str], output: str) -> str: return response["rag_eval"] -def evaluate_with_ragas(query: str, context: list[str], output: str) -> DataFrame: - +def evaluate_w_ragas(query: str, context: list[str], output: str) -> DataFrame: + """ + Evaluate the RAG (query,context,response) using RAGAS + """ data_sample = { - "question": query, - "answer": output, - "context": context + "question": [query], # Question as Sequence(str) + "answer": [output], # Answer as Sequence(str) + "contexts": [context], # Context as Sequence(str) + "ground_truth": ["".join(context)], # Ground Truth as Sequence(str) } + oai_model = ChatOpenAI( + model=settings.OPENAI_MODEL_ID, + api_key=settings.OPENAI_API_KEY, + ) + embd_model = HuggingfaceEmbeddings(model=settings.EMBEDDING_MODEL_ID) dataset = Dataset.from_dict(data_sample) - score = evaluate(dataset=dataset, metrics=[context_precision, context_relevancy, context_recall]) + score = evaluate( + llm=oai_model, + embeddings=embd_model, + dataset=dataset, + metrics=METRICS, + ) - return score.to_pandas() \ No newline at end of file + return score diff --git a/course/module-5/inference_pipeline.py b/course/module-5/inference_pipeline.py index c1d0b1d..ce4a88a 100644 --- a/course/module-5/inference_pipeline.py +++ b/course/module-5/inference_pipeline.py @@ -1,11 +1,12 @@ +import time + import pandas as pd from evaluation import evaluate_llm -from evaluation.rag import evaluate_with_ragas +from evaluation.rag import evaluate_w_ragas from llm_components.prompt_templates import InferenceTemplate from monitoring import PromptMonitoringManager from qwak_inference import RealTimeClient from rag.retriever import VectorRetriever - from settings import settings @@ -16,6 +17,12 @@ def __init__(self) -> None: ) self.template = InferenceTemplate() self.prompt_monitoring_manager = PromptMonitoringManager() + self._timings = { + "retrieval": 0.0, + "generation": 0.0, + "evaluation_rag": 0.0, + "evaluation_llm": 0.0, + } def generate( self, @@ -30,6 +37,7 @@ def generate( } if enable_rag is True: + st_time = time.time_ns() retriever = VectorRetriever(query=query) hits = retriever.retrieve_top_k( k=settings.TOP_K, to_expand_to_n_queries=settings.EXPAND_N_QUERY @@ -38,37 +46,52 @@ def generate( prompt_template_variables["context"] = context prompt = prompt_template.format(question=query, context=context) + en_time = time.time_ns() + self._timings["retrieval"] = (en_time - st_time) / 1e10 else: prompt = prompt_template.format(question=query) + st_time = time.time_ns() input_ = pd.DataFrame([{"instruction": prompt}]).to_json() response: list[dict] = self.qwak_client.predict(input_) - answer = response[0]["content"][0] + answer = response[0]["content"] + en_time = time.time_ns() + self._timings["generation"] = (en_time - st_time) / 1e10 if 
enable_evaluation is True: + if enable_rag: + st_time = time.time_ns() + rag_eval_scores = evaluate_w_ragas( + query=query, output=answer, context=context + ) + en_time = time.time_ns() + self._timings["evaluation_rag"] = (en_time - st_time) / 1e10 + st_time = time.time_ns() + llm_eval = evaluate_llm(query=query, output=answer) + en_time = time.time_ns() + self._timings["evaluation_llm"] = (en_time - st_time) / 1e10 evaluation_result = { - 'llm_evaluation': evaluate_llm(query=query, output=answer), - 'rag_evaluation': evaluate_with_ragas(query=query, output=answer, context=context) if enable_rag else None + "llm_evaluation": "" if not llm_eval else llm_eval, + "rag_evaluation": {} if not rag_eval_scores else rag_eval_scores, } else: evaluation_result = None if enable_monitoring is True: - if evaluation_result is not None: - metadata = {"llm_evaluation_result": evaluation_result} - else: - metadata = None - self.prompt_monitoring_manager.log( prompt=prompt, prompt_template=prompt_template.template, prompt_template_variables=prompt_template_variables, output=answer, - metadata=metadata, ) self.prompt_monitoring_manager.log_chain( - query=query, response=answer, eval_output=evaluation_result + query=query, + context=context, + llm_gen=answer, + llm_eval_output=evaluation_result["llm_evaluation"], + rag_eval_scores=evaluation_result["rag_evaluation"], + timings=self._timings, ) return {"answer": answer, "llm_evaluation_result": evaluation_result} diff --git a/course/module-5/main.py b/course/module-5/main.py index e20d95d..ead7ca6 100644 --- a/course/module-5/main.py +++ b/course/module-5/main.py @@ -10,14 +10,14 @@ query = """ Hello my author_id is 1. - Could you please draft a LinkedIn post discussing Vector Databases? - I'm particularly interested in how do they work. + Could you please draft a LinkedIn post discussing Feature Stores? + I'm particularly interested in their importance and how they can be used in ML systems. """ response = inference_endpoint.generate( query=query, enable_rag=True, - enable_evaluation=False, + enable_evaluation=True, enable_monitoring=True, ) diff --git a/course/module-5/monitoring/prompt_monitoring.py b/course/module-5/monitoring/prompt_monitoring.py index 83345cb..4995fd8 100644 --- a/course/module-5/monitoring/prompt_monitoring.py +++ b/course/module-5/monitoring/prompt_monitoring.py @@ -1,3 +1,6 @@ +import time +from typing import List + import comet_llm from settings import settings @@ -32,7 +35,19 @@ def log( ) @classmethod - def log_chain(cls, query: str, response: str, eval_output: str): + def log_chain( + cls, + query: str, + context: List[str], + llm_gen: str, + llm_eval_output: str, + rag_eval_scores: dict | None = None, + timings: dict | None = None, + ) -> None: + """Important!! + Workaround to get timings/chain, is to time.sleep(timing) for each step! + To be removed in production code. 
+ """ comet_llm.init(project=f"{settings.COMET_PROJECT}-monitoring") comet_llm.start_chain( inputs={"user_query": query}, @@ -41,14 +56,44 @@ def log_chain(cls, query: str, response: str, eval_output: str): workspace=settings.COMET_WORKSPACE, ) with comet_llm.Span( - category="twin_response", + category="Vector Retrieval", + name="retrieval_step", + inputs={"user_query": query}, + ) as span: + time.sleep(timings.get("retrieval")) + span.set_outputs(outputs={"retrieved_context": context}) + + with comet_llm.Span( + category="LLM Generation", + name="generation_step", inputs={"user_query": query}, ) as span: - span.set_outputs(outputs=response) + time.sleep(timings.get("generation")) + span.set_outputs(outputs={"generation": llm_gen}) + + with comet_llm.Span( + category="Evaluation", + name="llm_eval_step", + inputs={"query": llm_gen, "user_query": query}, + metadata={"model_used": settings.OPENAI_MODEL_ID}, + ) as span: + time.sleep(timings.get("evaluation_llm")) + span.set_outputs(outputs={"llm_eval_result": llm_eval_output}) with comet_llm.Span( - category="gpt3.5-eval", - inputs={"eval_result": eval_output}, + category="Evaluation", + name="rag_eval_step", + inputs={ + "user_query": query, + "retrieved_context": context, + "llm_gen": llm_gen, + }, + metadata={ + "model_used": settings.OPENAI_MODEL_ID, + "embd_model": settings.EMBEDDING_MODEL_ID, + "eval_framework": "RAGAS", + }, ) as span: - span.set_outputs(outputs=response) - comet_llm.end_chain(outputs={"response": response, "eval_output": eval_output}) + time.sleep(timings.get("evaluation_rag")) + span.set_outputs(outputs={"rag_eval_scores": rag_eval_scores}) + comet_llm.end_chain(outputs={"response": llm_gen}) From df040d038ad6073ace6273fc89982ef0eef28db1 Mon Sep 17 00:00:00 2001 From: razvantalex Date: Sun, 9 Jun 2024 11:53:55 +0300 Subject: [PATCH 4/5] fix: Add timings as metadata fields --- course/module-5/monitoring/prompt_monitoring.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/course/module-5/monitoring/prompt_monitoring.py b/course/module-5/monitoring/prompt_monitoring.py index 4995fd8..94b5578 100644 --- a/course/module-5/monitoring/prompt_monitoring.py +++ b/course/module-5/monitoring/prompt_monitoring.py @@ -1,4 +1,3 @@ -import time from typing import List import comet_llm @@ -59,25 +58,30 @@ def log_chain( category="Vector Retrieval", name="retrieval_step", inputs={"user_query": query}, + metadata={"duration": timings.get("retrieval")}, ) as span: - time.sleep(timings.get("retrieval")) span.set_outputs(outputs={"retrieved_context": context}) with comet_llm.Span( category="LLM Generation", name="generation_step", inputs={"user_query": query}, + metadata={ + "model_used": settings.OPENAI_MODEL_ID, + "duration": timings.get("generation"), + }, ) as span: - time.sleep(timings.get("generation")) span.set_outputs(outputs={"generation": llm_gen}) with comet_llm.Span( category="Evaluation", name="llm_eval_step", inputs={"query": llm_gen, "user_query": query}, - metadata={"model_used": settings.OPENAI_MODEL_ID}, + metadata={ + "model_used": settings.OPENAI_MODEL_ID, + "duration": timings.get("evaluation_llm"), + }, ) as span: - time.sleep(timings.get("evaluation_llm")) span.set_outputs(outputs={"llm_eval_result": llm_eval_output}) with comet_llm.Span( @@ -92,8 +96,8 @@ def log_chain( "model_used": settings.OPENAI_MODEL_ID, "embd_model": settings.EMBEDDING_MODEL_ID, "eval_framework": "RAGAS", + "duration": timings.get("evaluation_rag"), }, ) as span: - 
time.sleep(timings.get("evaluation_rag")) span.set_outputs(outputs={"rag_eval_scores": rag_eval_scores}) comet_llm.end_chain(outputs={"response": llm_gen}) From 014a95e58640c159582fd00b41211dfbd871e7ee Mon Sep 17 00:00:00 2001 From: razvantalex Date: Sun, 9 Jun 2024 11:54:55 +0300 Subject: [PATCH 5/5] fix: Changed to correct ns timings 1e9 --- course/module-5/inference_pipeline.py | 8 ++++---- course/module-5/monitoring/prompt_monitoring.py | 4 ---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/course/module-5/inference_pipeline.py b/course/module-5/inference_pipeline.py index ce4a88a..426d910 100644 --- a/course/module-5/inference_pipeline.py +++ b/course/module-5/inference_pipeline.py @@ -47,7 +47,7 @@ def generate( prompt = prompt_template.format(question=query, context=context) en_time = time.time_ns() - self._timings["retrieval"] = (en_time - st_time) / 1e10 + self._timings["retrieval"] = (en_time - st_time) / 1e9 else: prompt = prompt_template.format(question=query) @@ -57,7 +57,7 @@ def generate( response: list[dict] = self.qwak_client.predict(input_) answer = response[0]["content"] en_time = time.time_ns() - self._timings["generation"] = (en_time - st_time) / 1e10 + self._timings["generation"] = (en_time - st_time) / 1e9 if enable_evaluation is True: if enable_rag: @@ -66,11 +66,11 @@ def generate( query=query, output=answer, context=context ) en_time = time.time_ns() - self._timings["evaluation_rag"] = (en_time - st_time) / 1e10 + self._timings["evaluation_rag"] = (en_time - st_time) / 1e9 st_time = time.time_ns() llm_eval = evaluate_llm(query=query, output=answer) en_time = time.time_ns() - self._timings["evaluation_llm"] = (en_time - st_time) / 1e10 + self._timings["evaluation_llm"] = (en_time - st_time) / 1e9 evaluation_result = { "llm_evaluation": "" if not llm_eval else llm_eval, "rag_evaluation": {} if not rag_eval_scores else rag_eval_scores, diff --git a/course/module-5/monitoring/prompt_monitoring.py b/course/module-5/monitoring/prompt_monitoring.py index 94b5578..cb88da3 100644 --- a/course/module-5/monitoring/prompt_monitoring.py +++ b/course/module-5/monitoring/prompt_monitoring.py @@ -43,10 +43,6 @@ def log_chain( rag_eval_scores: dict | None = None, timings: dict | None = None, ) -> None: - """Important!! - Workaround to get timings/chain, is to time.sleep(timing) for each step! - To be removed in production code. - """ comet_llm.init(project=f"{settings.COMET_PROJECT}-monitoring") comet_llm.start_chain( inputs={"user_query": query},
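For reference, a minimal usage sketch of the evaluate_w_ragas helper introduced in PATCH 3/5 (illustrative values only; it assumes the course/module-5 package is importable and that settings provides OPENAI_API_KEY, OPENAI_MODEL_ID and EMBEDDING_MODEL_ID for the ragas LLM and embedding wrappers):

    from evaluation.rag import evaluate_w_ragas

    # Illustrative inputs; in the inference pipeline these come from the user
    # query, the vector retriever and the Qwak LLM generation step.
    query = "Could you draft a LinkedIn post discussing Feature Stores?"
    context = [
        "A feature store centralizes feature computation, storage and serving for ML systems.",
    ]
    output = "Feature stores keep training and serving features consistent across ML pipelines."

    scores = evaluate_w_ragas(query=query, context=context, output=output)
    print(scores)  # ragas scores for the metrics listed in METRICS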