From 169926274b32262bc5500ea201e724ecff57a9ec Mon Sep 17 00:00:00 2001 From: Xinyao Wang Date: Mon, 28 Oct 2024 13:28:48 +0800 Subject: [PATCH 1/3] add longbench Signed-off-by: Xinyao Wang --- evals/evaluation/longbench/README.md | 66 +++++++++++++++++ evals/evaluation/longbench/pred.py | 105 +++++++++++++++++++++++++++ 2 files changed, 171 insertions(+) create mode 100644 evals/evaluation/longbench/README.md create mode 100644 evals/evaluation/longbench/pred.py diff --git a/evals/evaluation/longbench/README.md b/evals/evaluation/longbench/README.md new file mode 100644 index 00000000..3b7442f3 --- /dev/null +++ b/evals/evaluation/longbench/README.md @@ -0,0 +1,66 @@ +[LongBench](https://github.com/THUDM/LongBench) is the benchmark for bilingual, multitask, and comprehensive assessment of long context understanding capabilities of large language models. LongBench includes different languages (Chinese and English) to provide a more comprehensive evaluation of the large models' multilingual capabilities on long contexts. In addition, LongBench is composed of six major categories and twenty one different tasks, covering key long-text application scenarios such as single-document QA, multi-document QA, summarization, few-shot learning, synthetic tasks and code completion. + +In this guideline, we evalute LongBench dataset with OPEA services on Intel hardwares. + +# 🚀 QuickStart + +## Installation + +``` +pip install ../../../requirements.txt +``` + +## Launch a LLM Service + +To setup a LLM model, we can use [tgi-gaudi](https://github.com/huggingface/tgi-gaudi) or [OPEA microservices](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/text-generation) to launch a service. + +### Example 1: TGI +For example, the follow command is to setup the [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) model on Gaudi: + +``` +model=meta-llama/Llama-2-7b-hf +hf_token=YOUR_ACCESS_TOKEN +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run -p 8080:80 -v $volume:/data --runtime=habana -e HABANA_VISIBLE_DEVICES=all \ +-e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN=$hf_token \ +-e ENABLE_HPU_GRAPH=true -e LIMIT_HPU_GRAPH=true -e USE_FLASH_ATTENTION=true \ +-e FLASH_ATTENTION_RECOMPUTE=true --cap-add=sys_nice --ipc=host \ +ghcr.io/huggingface/tgi-gaudi:2.0.5 --model-id $model --max-input-tokens 1024 \ +--max-total-tokens 2048 +``` + +### Example 2: OPEA LLM +You can also set up a service with OPEA microservices. + +For example, you can refer to [native LLM](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/text-generation/native/langchain) for deployment on native Gaudi without any serving framework. + +## Predict +Please set up the environment variables first. +``` +export ENDPOINT="http://{host_ip}:8080/generate" # your LLM serving endpoint +export LLM_MODEL="meta-llama/Llama-2-7b-hf" +export BACKEND="tgi" # "tgi" or "llm" +export DATASET="narrativeqa" # can refer to https://github.com/THUDM/LongBench/blob/main/task.md for full list +export MAX_INPUT_LENGTH=2048 # specify the max input length according to llm services +``` +Then get the prediction on the dataset. +``` +python pred.py \ + --endpoint ${ENDPOINT} \ + --model_name ${LLM_MODEL} \ + --backend ${BACKEND} \ + --dataset ${DATASET} \ + --max_input_length ${MAX_INPUT_LENGTH} +``` +The prediction will be saved to "pred/{LLM_MODEL}/{DATASET.jsonl}". + +## Evalute +Evalute the prediction with LongBench metrics. 
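Each line that pred.py appends to the prediction file is a JSON record carrying the fields LongBench's evaluation script reads: "pred", "answers", "all_classes", and "length". An illustrative record (the values here are made up) looks like:
```
{"pred": "He worked as a sailor.", "answers": ["A sailor"], "all_classes": null, "length": 18409}
```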
+``` +git clone https://github.com/THUDM/LongBench +cd LongBench +pip install -r requirements.txt +python eval.py --model ${LLM_MODEL} +``` +Then evaluted result will be saved to "pred/{LLM_MODEL}/{result.jsonl}". \ No newline at end of file diff --git a/evals/evaluation/longbench/pred.py b/evals/evaluation/longbench/pred.py new file mode 100644 index 00000000..9bee8b11 --- /dev/null +++ b/evals/evaluation/longbench/pred.py @@ -0,0 +1,105 @@ +import os +from datasets import load_dataset +import json +from transformers import AutoTokenizer +from tqdm import tqdm +import numpy as np +import random +import argparse +import time +import requests +from requests.exceptions import RequestException + +def parse_args(args=None): + parser = argparse.ArgumentParser() + parser.add_argument('--endpoint', type=str, required=True) + parser.add_argument('--model_name', type=str, required=True) + parser.add_argument('--backend', type=str, default="tgi", choices=["tgi","llm"]) + parser.add_argument('--dataset', type=str, help="give dataset name, if not given, will evaluate on all datasets", default=None) + parser.add_argument('--e', action='store_true', help="Evaluate on LongBench-E") + parser.add_argument('--max_input_length', type=int, default=2048, help="max input length") + return parser.parse_args(args) + +def get_query(backend, prompt, max_new_length): + header = {"Content-Type": "application/json"} + query = { + "tgi": {"inputs": prompt, "parameters": {"max_new_tokens":max_new_length, "do_sample": False}}, + "llm": {"query": prompt, "max_tokens":max_new_length} + } + return header, query[backend] + +def get_pred(data, dataset_name, backend, endpoint, model_name, max_input_length, max_new_length, prompt_format, out_path): + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + for json_obj in tqdm(data): + prompt = prompt_format.format(**json_obj) + + # truncate to fit max_input_length (we suggest truncate in the middle, since the left and right side may contain crucial instructions) + tokenized_prompt = tokenizer(prompt, truncation=False, return_tensors="pt").input_ids[0] + if len(tokenized_prompt) > max_input_length: + half = int(max_input_length/2) + prompt = tokenizer.decode(tokenized_prompt[:half], skip_special_tokens=True)+tokenizer.decode(tokenized_prompt[-half:], skip_special_tokens=True) + + header, query = get_query(backend, prompt, max_new_length) + print("query: ", query) + try: + start_time = time.perf_counter() + res = requests.post(endpoint, headers=header, json=query) + res.raise_for_status() + res = res.json() + cost = time.perf_counter() - start_time + except RequestException as e: + raise Exception(f"An unexpected error occurred: {str(e)}") + + if backend == "tgi": + result = res["generated_text"] + else: + result = res["text"] + print("result: ", result) + with open(out_path, "a", encoding="utf-8") as f: + json.dump({"pred": result, "answers": json_obj["answers"], "all_classes": json_obj["all_classes"], "length": json_obj["length"]}, f, ensure_ascii=False) + f.write('\n') + +if __name__ == '__main__': + args = parse_args() + endpoint = args.endpoint + model_name = args.model_name + backend = args.backend + dataset = args.dataset + max_input_length=args.max_input_length + + dataset_list = ["narrativeqa", "qasper", "multifieldqa_en", "multifieldqa_zh", "hotpotqa", "2wikimqa", "musique", \ + "dureader", "gov_report", "qmsum", "multi_news", "vcsum", "trec", "triviaqa", "samsum", "lsht", \ + "passage_count", "passage_retrieval_en", "passage_retrieval_zh", 
"lcc", "repobench-p"] + datasets_e_list = ["qasper", "multifieldqa_en", "hotpotqa", "2wikimqa", "gov_report", "multi_news", \ + "trec", "triviaqa", "samsum", "passage_count", "passage_retrieval_en", "lcc", "repobench-p"] + if args.e: + if dataset is not None: + if dataset in datasets_e_list: + datasets = [dataset] + else: + raise NotImplementedError(f"{dataset} are not supported in LongBench-e dataset list: {datasets_e_list}") + else: + datasets = datasets_e_list + if not os.path.exists(f"pred_e/{model_name}"): + os.makedirs(f"pred_e/{model_name}") + else: + datasets = [dataset] if dataset is not None else dataset_list + if not os.path.exists(f"pred/{model_name}"): + os.makedirs(f"pred/{model_name}") + + for dataset in datasets: + if args.e: + out_path = f"pred_e/{model_name}/{dataset}.jsonl" + data = load_dataset('THUDM/LongBench', f"{dataset}_e", split='test') + else: + out_path = f"pred/{model_name}/{dataset}.jsonl" + data = load_dataset('THUDM/LongBench', dataset, split='test') + + # we design specific prompt format and max generation length for each task, feel free to modify them to optimize model output + dataset2prompt = json.load(open("config/dataset2prompt.json", "r")) + dataset2maxlen = json.load(open("config/dataset2maxlen.json", "r")) + prompt_format = dataset2prompt[dataset] + max_new_length = dataset2maxlen[dataset] + + data_all = [data_sample for data_sample in data] + get_pred(data_all, dataset, backend, endpoint, model_name, max_input_length, max_new_length, prompt_format, out_path) \ No newline at end of file From ee4417de3aa547df45fe11933b298912ebeadeea Mon Sep 17 00:00:00 2001 From: Xinyao Wang Date: Mon, 28 Oct 2024 13:31:41 +0800 Subject: [PATCH 2/3] refine readme Signed-off-by: Xinyao Wang --- evals/evaluation/longbench/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evals/evaluation/longbench/README.md b/evals/evaluation/longbench/README.md index 3b7442f3..150f2d37 100644 --- a/evals/evaluation/longbench/README.md +++ b/evals/evaluation/longbench/README.md @@ -55,7 +55,7 @@ python pred.py \ ``` The prediction will be saved to "pred/{LLM_MODEL}/{DATASET.jsonl}". -## Evalute +## Evaluate Evalute the prediction with LongBench metrics. ``` git clone https://github.com/THUDM/LongBench From c7501d9070e5f37b6ee0269bed4363daef4fa4d1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 06:21:43 +0000 Subject: [PATCH 3/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- evals/evaluation/longbench/README.md | 6 +- evals/evaluation/longbench/pred.py | 124 ++++++++++++++++++++------- 2 files changed, 94 insertions(+), 36 deletions(-) diff --git a/evals/evaluation/longbench/README.md b/evals/evaluation/longbench/README.md index 150f2d37..b6765c2c 100644 --- a/evals/evaluation/longbench/README.md +++ b/evals/evaluation/longbench/README.md @@ -1,6 +1,6 @@ [LongBench](https://github.com/THUDM/LongBench) is the benchmark for bilingual, multitask, and comprehensive assessment of long context understanding capabilities of large language models. LongBench includes different languages (Chinese and English) to provide a more comprehensive evaluation of the large models' multilingual capabilities on long contexts. 
In addition, LongBench is composed of six major categories and twenty one different tasks, covering key long-text application scenarios such as single-document QA, multi-document QA, summarization, few-shot learning, synthetic tasks and code completion. -In this guideline, we evalute LongBench dataset with OPEA services on Intel hardwares. +In this guideline, we evaluate LongBench dataset with OPEA services on Intel hardwares. # 🚀 QuickStart @@ -56,11 +56,11 @@ python pred.py \ The prediction will be saved to "pred/{LLM_MODEL}/{DATASET.jsonl}". ## Evaluate -Evalute the prediction with LongBench metrics. +Evaluate the prediction with LongBench metrics. ``` git clone https://github.com/THUDM/LongBench cd LongBench pip install -r requirements.txt python eval.py --model ${LLM_MODEL} ``` -Then evaluted result will be saved to "pred/{LLM_MODEL}/{result.jsonl}". \ No newline at end of file +Then evaluated result will be saved to "pred/{LLM_MODEL}/{result.jsonl}". diff --git a/evals/evaluation/longbench/pred.py b/evals/evaluation/longbench/pred.py index 9bee8b11..b30e079f 100644 --- a/evals/evaluation/longbench/pred.py +++ b/evals/evaluation/longbench/pred.py @@ -1,34 +1,45 @@ -import os -from datasets import load_dataset +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse import json -from transformers import AutoTokenizer -from tqdm import tqdm -import numpy as np +import os import random -import argparse import time + +import numpy as np import requests +from datasets import load_dataset from requests.exceptions import RequestException +from tqdm import tqdm +from transformers import AutoTokenizer + def parse_args(args=None): parser = argparse.ArgumentParser() - parser.add_argument('--endpoint', type=str, required=True) - parser.add_argument('--model_name', type=str, required=True) - parser.add_argument('--backend', type=str, default="tgi", choices=["tgi","llm"]) - parser.add_argument('--dataset', type=str, help="give dataset name, if not given, will evaluate on all datasets", default=None) - parser.add_argument('--e', action='store_true', help="Evaluate on LongBench-E") - parser.add_argument('--max_input_length', type=int, default=2048, help="max input length") + parser.add_argument("--endpoint", type=str, required=True) + parser.add_argument("--model_name", type=str, required=True) + parser.add_argument("--backend", type=str, default="tgi", choices=["tgi", "llm"]) + parser.add_argument( + "--dataset", type=str, help="give dataset name, if not given, will evaluate on all datasets", default=None + ) + parser.add_argument("--e", action="store_true", help="Evaluate on LongBench-E") + parser.add_argument("--max_input_length", type=int, default=2048, help="max input length") return parser.parse_args(args) + def get_query(backend, prompt, max_new_length): header = {"Content-Type": "application/json"} query = { - "tgi": {"inputs": prompt, "parameters": {"max_new_tokens":max_new_length, "do_sample": False}}, - "llm": {"query": prompt, "max_tokens":max_new_length} + "tgi": {"inputs": prompt, "parameters": {"max_new_tokens": max_new_length, "do_sample": False}}, + "llm": {"query": prompt, "max_tokens": max_new_length}, } return header, query[backend] -def get_pred(data, dataset_name, backend, endpoint, model_name, max_input_length, max_new_length, prompt_format, out_path): + +def get_pred( + data, dataset_name, backend, endpoint, model_name, max_input_length, max_new_length, prompt_format, out_path +): tokenizer = AutoTokenizer.from_pretrained(model_name, 
trust_remote_code=True) for json_obj in tqdm(data): prompt = prompt_format.format(**json_obj) @@ -36,9 +47,11 @@ def get_pred(data, dataset_name, backend, endpoint, model_name, max_input_length # truncate to fit max_input_length (we suggest truncate in the middle, since the left and right side may contain crucial instructions) tokenized_prompt = tokenizer(prompt, truncation=False, return_tensors="pt").input_ids[0] if len(tokenized_prompt) > max_input_length: - half = int(max_input_length/2) - prompt = tokenizer.decode(tokenized_prompt[:half], skip_special_tokens=True)+tokenizer.decode(tokenized_prompt[-half:], skip_special_tokens=True) - + half = int(max_input_length / 2) + prompt = tokenizer.decode(tokenized_prompt[:half], skip_special_tokens=True) + tokenizer.decode( + tokenized_prompt[-half:], skip_special_tokens=True + ) + header, query = get_query(backend, prompt, max_new_length) print("query: ", query) try: @@ -51,31 +64,74 @@ def get_pred(data, dataset_name, backend, endpoint, model_name, max_input_length raise Exception(f"An unexpected error occurred: {str(e)}") if backend == "tgi": - result = res["generated_text"] + result = res["generated_text"] else: result = res["text"] print("result: ", result) with open(out_path, "a", encoding="utf-8") as f: - json.dump({"pred": result, "answers": json_obj["answers"], "all_classes": json_obj["all_classes"], "length": json_obj["length"]}, f, ensure_ascii=False) - f.write('\n') + json.dump( + { + "pred": result, + "answers": json_obj["answers"], + "all_classes": json_obj["all_classes"], + "length": json_obj["length"], + }, + f, + ensure_ascii=False, + ) + f.write("\n") -if __name__ == '__main__': + +if __name__ == "__main__": args = parse_args() endpoint = args.endpoint model_name = args.model_name backend = args.backend dataset = args.dataset - max_input_length=args.max_input_length + max_input_length = args.max_input_length - dataset_list = ["narrativeqa", "qasper", "multifieldqa_en", "multifieldqa_zh", "hotpotqa", "2wikimqa", "musique", \ - "dureader", "gov_report", "qmsum", "multi_news", "vcsum", "trec", "triviaqa", "samsum", "lsht", \ - "passage_count", "passage_retrieval_en", "passage_retrieval_zh", "lcc", "repobench-p"] - datasets_e_list = ["qasper", "multifieldqa_en", "hotpotqa", "2wikimqa", "gov_report", "multi_news", \ - "trec", "triviaqa", "samsum", "passage_count", "passage_retrieval_en", "lcc", "repobench-p"] + dataset_list = [ + "narrativeqa", + "qasper", + "multifieldqa_en", + "multifieldqa_zh", + "hotpotqa", + "2wikimqa", + "musique", + "dureader", + "gov_report", + "qmsum", + "multi_news", + "vcsum", + "trec", + "triviaqa", + "samsum", + "lsht", + "passage_count", + "passage_retrieval_en", + "passage_retrieval_zh", + "lcc", + "repobench-p", + ] + datasets_e_list = [ + "qasper", + "multifieldqa_en", + "hotpotqa", + "2wikimqa", + "gov_report", + "multi_news", + "trec", + "triviaqa", + "samsum", + "passage_count", + "passage_retrieval_en", + "lcc", + "repobench-p", + ] if args.e: if dataset is not None: if dataset in datasets_e_list: - datasets = [dataset] + datasets = [dataset] else: raise NotImplementedError(f"{dataset} are not supported in LongBench-e dataset list: {datasets_e_list}") else: @@ -90,11 +146,11 @@ def get_pred(data, dataset_name, backend, endpoint, model_name, max_input_length for dataset in datasets: if args.e: out_path = f"pred_e/{model_name}/{dataset}.jsonl" - data = load_dataset('THUDM/LongBench', f"{dataset}_e", split='test') + data = load_dataset("THUDM/LongBench", f"{dataset}_e", split="test") else: 
out_path = f"pred/{model_name}/{dataset}.jsonl" - data = load_dataset('THUDM/LongBench', dataset, split='test') - + data = load_dataset("THUDM/LongBench", dataset, split="test") + # we design specific prompt format and max generation length for each task, feel free to modify them to optimize model output dataset2prompt = json.load(open("config/dataset2prompt.json", "r")) dataset2maxlen = json.load(open("config/dataset2maxlen.json", "r")) @@ -102,4 +158,6 @@ def get_pred(data, dataset_name, backend, endpoint, model_name, max_input_length max_new_length = dataset2maxlen[dataset] data_all = [data_sample for data_sample in data] - get_pred(data_all, dataset, backend, endpoint, model_name, max_input_length, max_new_length, prompt_format, out_path) \ No newline at end of file + get_pred( + data_all, dataset, backend, endpoint, model_name, max_input_length, max_new_length, prompt_format, out_path + )