From 963c44c431679fc9f36e9355d1cf7908aa23df89 Mon Sep 17 00:00:00 2001
From: XinyaoWa
Date: Tue, 29 Oct 2024 16:45:25 +0800
Subject: [PATCH] Support Longbench (#179)

* add longbench

Signed-off-by: Xinyao Wang

* refine readme

Signed-off-by: Xinyao Wang

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Xinyao Wang
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: Howard Yen
---
 evals/evaluation/longbench/README.md |  66 +++++++++++
 evals/evaluation/longbench/pred.py   | 163 +++++++++++++++++++++++++++
 2 files changed, 229 insertions(+)
 create mode 100644 evals/evaluation/longbench/README.md
 create mode 100644 evals/evaluation/longbench/pred.py

diff --git a/evals/evaluation/longbench/README.md b/evals/evaluation/longbench/README.md
new file mode 100644
index 00000000..b6765c2c
--- /dev/null
+++ b/evals/evaluation/longbench/README.md
@@ -0,0 +1,66 @@
+[LongBench](https://github.com/THUDM/LongBench) is a benchmark for the bilingual, multitask, and comprehensive assessment of the long-context understanding capabilities of large language models. LongBench includes two languages (Chinese and English) to provide a more comprehensive evaluation of large models' multilingual capabilities on long contexts. In addition, LongBench is composed of six major categories and twenty-one different tasks, covering key long-text application scenarios such as single-document QA, multi-document QA, summarization, few-shot learning, synthetic tasks, and code completion.
+
+In this guide, we evaluate the LongBench dataset with OPEA services on Intel hardware.
+
+# 🚀 QuickStart
+
+## Installation
+
+```
+pip install -r ../../../requirements.txt
+```
+
+## Launch an LLM Service
+
+To set up an LLM service, we can use [tgi-gaudi](https://github.com/huggingface/tgi-gaudi) or [OPEA microservices](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/text-generation).
+
+### Example 1: TGI
+For example, the following command sets up the [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) model on Gaudi:
+
+```
+model=meta-llama/Llama-2-7b-hf
+hf_token=YOUR_ACCESS_TOKEN
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 -v $volume:/data --runtime=habana -e HABANA_VISIBLE_DEVICES=all \
+-e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN=$hf_token \
+-e ENABLE_HPU_GRAPH=true -e LIMIT_HPU_GRAPH=true -e USE_FLASH_ATTENTION=true \
+-e FLASH_ATTENTION_RECOMPUTE=true --cap-add=sys_nice --ipc=host \
+ghcr.io/huggingface/tgi-gaudi:2.0.5 --model-id $model --max-input-tokens 1024 \
+--max-total-tokens 2048
+```
+
+### Example 2: OPEA LLM
+You can also set up a service with OPEA microservices.
+
+For example, you can refer to [native LLM](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/text-generation/native/langchain) for deployment on native Gaudi without any serving framework.
+
+## Predict
+Please set up the environment variables first.
+```
+export ENDPOINT="http://{host_ip}:8080/generate" # your LLM serving endpoint
+export LLM_MODEL="meta-llama/Llama-2-7b-hf"
+export BACKEND="tgi" # "tgi" or "llm"
+export DATASET="narrativeqa" # see https://github.com/THUDM/LongBench/blob/main/task.md for the full task list
+export MAX_INPUT_LENGTH=2048 # set the max input length according to your LLM service
+```
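+(Optional) Before running the prediction script, you can send a single request to check that the serving endpoint responds. The snippet below is a minimal sketch that assumes the TGI backend from Example 1; if you use the OPEA "llm" backend, adjust the payload to match the format built by `get_query` in `pred.py`.
+```
+curl ${ENDPOINT} \
+    -X POST \
+    -H 'Content-Type: application/json' \
+    -d '{"inputs": "What is the capital of France?", "parameters": {"max_new_tokens": 32, "do_sample": false}}'
+```
+Then get the prediction on the dataset.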
+
+```
+python pred.py \
+    --endpoint ${ENDPOINT} \
+    --model_name ${LLM_MODEL} \
+    --backend ${BACKEND} \
+    --dataset ${DATASET} \
+    --max_input_length ${MAX_INPUT_LENGTH}
+```
+The prediction will be saved to "pred/{LLM_MODEL}/{DATASET}.jsonl".
+
+## Evaluate
+Evaluate the prediction with LongBench metrics.
+```
+git clone https://github.com/THUDM/LongBench
+cd LongBench
+pip install -r requirements.txt
+python eval.py --model ${LLM_MODEL}
+```
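+Note: `eval.py` looks for the prediction files under the `pred/` (or `pred_e/`) folder of the LongBench repo, so you may need to copy the files generated by `pred.py` into the cloned repo before running it. A sketch, assuming LongBench was cloned inside this directory and `pred.py` was run from here:
+```
+cp -r ../pred .
+```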
+The evaluation result will be saved to "pred/{LLM_MODEL}/result.jsonl".
diff --git a/evals/evaluation/longbench/pred.py b/evals/evaluation/longbench/pred.py
new file mode 100644
index 00000000..b30e079f
--- /dev/null
+++ b/evals/evaluation/longbench/pred.py
@@ -0,0 +1,163 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import argparse
+import json
+import os
+import random
+import time
+
+import numpy as np
+import requests
+from datasets import load_dataset
+from requests.exceptions import RequestException
+from tqdm import tqdm
+from transformers import AutoTokenizer
+
+
+def parse_args(args=None):
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--endpoint", type=str, required=True)
+    parser.add_argument("--model_name", type=str, required=True)
+    parser.add_argument("--backend", type=str, default="tgi", choices=["tgi", "llm"])
+    parser.add_argument(
+        "--dataset", type=str, help="dataset name; if not given, all datasets will be evaluated", default=None
+    )
+    parser.add_argument("--e", action="store_true", help="Evaluate on LongBench-E")
+    parser.add_argument("--max_input_length", type=int, default=2048, help="max input length")
+    return parser.parse_args(args)
+
+
+def get_query(backend, prompt, max_new_length):
+    # build the request header and payload expected by each backend
+    header = {"Content-Type": "application/json"}
+    query = {
+        "tgi": {"inputs": prompt, "parameters": {"max_new_tokens": max_new_length, "do_sample": False}},
+        "llm": {"query": prompt, "max_tokens": max_new_length},
+    }
+    return header, query[backend]
+
+
+def get_pred(
+    data, dataset_name, backend, endpoint, model_name, max_input_length, max_new_length, prompt_format, out_path
+):
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    for json_obj in tqdm(data):
+        prompt = prompt_format.format(**json_obj)
+
+        # truncate to fit max_input_length (we suggest truncating in the middle, since the left and right sides may contain crucial instructions)
+        tokenized_prompt = tokenizer(prompt, truncation=False, return_tensors="pt").input_ids[0]
+        if len(tokenized_prompt) > max_input_length:
+            half = int(max_input_length / 2)
+            prompt = tokenizer.decode(tokenized_prompt[:half], skip_special_tokens=True) + tokenizer.decode(
+                tokenized_prompt[-half:], skip_special_tokens=True
+            )
+
+        header, query = get_query(backend, prompt, max_new_length)
+        print("query: ", query)
+        try:
+            start_time = time.perf_counter()
+            res = requests.post(endpoint, headers=header, json=query)
+            res.raise_for_status()
+            res = res.json()
+            cost = time.perf_counter() - start_time  # request latency (currently not reported)
+        except RequestException as e:
+            raise Exception(f"Request to the LLM endpoint failed: {str(e)}")
+
+        if backend == "tgi":
+            result = res["generated_text"]
+        else:
+            result = res["text"]
+        print("result: ", result)
+        with open(out_path, "a", encoding="utf-8") as f:
+            json.dump(
+                {
+                    "pred": result,
+                    "answers": json_obj["answers"],
+                    "all_classes": json_obj["all_classes"],
+                    "length": json_obj["length"],
+                },
+                f,
+                ensure_ascii=False,
+            )
+            f.write("\n")
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    endpoint = args.endpoint
+    model_name = args.model_name
+    backend = args.backend
+    dataset = args.dataset
+    max_input_length = args.max_input_length
+
+    dataset_list = [
+        "narrativeqa",
+        "qasper",
+        "multifieldqa_en",
+        "multifieldqa_zh",
+        "hotpotqa",
+        "2wikimqa",
+        "musique",
+        "dureader",
+        "gov_report",
+        "qmsum",
+        "multi_news",
+        "vcsum",
+        "trec",
+        "triviaqa",
+        "samsum",
+        "lsht",
+        "passage_count",
+        "passage_retrieval_en",
+        "passage_retrieval_zh",
+        "lcc",
+        "repobench-p",
+    ]
+    datasets_e_list = [
+        "qasper",
+        "multifieldqa_en",
+        "hotpotqa",
+        "2wikimqa",
+        "gov_report",
+        "multi_news",
+        "trec",
+        "triviaqa",
+        "samsum",
+        "passage_count",
+        "passage_retrieval_en",
+        "lcc",
+        "repobench-p",
+    ]
+    if args.e:
+        if dataset is not None:
+            if dataset in datasets_e_list:
+                datasets = [dataset]
+            else:
+                raise NotImplementedError(f"{dataset} is not supported in the LongBench-E dataset list: {datasets_e_list}")
+        else:
+            datasets = datasets_e_list
+        if not os.path.exists(f"pred_e/{model_name}"):
+            os.makedirs(f"pred_e/{model_name}")
+    else:
+        datasets = [dataset] if dataset is not None else dataset_list
+        if not os.path.exists(f"pred/{model_name}"):
+            os.makedirs(f"pred/{model_name}")
+
+    for dataset in datasets:
+        if args.e:
+            out_path = f"pred_e/{model_name}/{dataset}.jsonl"
+            data = load_dataset("THUDM/LongBench", f"{dataset}_e", split="test")
+        else:
+            out_path = f"pred/{model_name}/{dataset}.jsonl"
+            data = load_dataset("THUDM/LongBench", dataset, split="test")
+
+        # we design a specific prompt format and max generation length for each task; feel free to modify them to optimize model output
+        dataset2prompt = json.load(open("config/dataset2prompt.json", "r"))
+        dataset2maxlen = json.load(open("config/dataset2maxlen.json", "r"))
+        prompt_format = dataset2prompt[dataset]
+        max_new_length = dataset2maxlen[dataset]
+
+        data_all = [data_sample for data_sample in data]
+        get_pred(
+            data_all, dataset, backend, endpoint, model_name, max_input_length, max_new_length, prompt_format, out_path
+        )