Commit

[Sync] format (#1214)
Leymore authored May 29, 2024
1 parent d59189b commit a77b8a5
Showing 9 changed files with 561 additions and 9 deletions.
58 changes: 58 additions & 0 deletions configs/datasets/subjective/compassbench/compassbench_compare.py
@@ -0,0 +1,58 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassBenchDataset

subjective_reader_cfg = dict(
input_columns=['question', 'judge_prompt'],
output_column='judge',
)

data_path = 'data/subjective/compassbench'

subjective_datasets = []

versions = ['CompassbenchV1']

for version_abbr in versions:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
)

subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{judge_prompt}'
),
]),
),
),
pred_role='BOT',
)

subjective_datasets.append(
dict(
abbr=version_abbr,
type=CompassBenchDataset,
path=data_path,
name=version_abbr,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg
))
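
This config builds one pairwise-comparison dataset per item in versions, each backed by data/subjective/compassbench/<version>.json, with the judge prompt taken from the dataset's judge_prompt column. As a quick sanity check, the snippet below (a minimal sketch, assuming OpenCompass and mmengine are installed and it is run from the repository root) loads the config and lists the datasets it defines:

from mmengine.config import Config

# Load the dataset config and print the subjective datasets it registers.
cfg = Config.fromfile(
    'configs/datasets/subjective/compassbench/compassbench_compare.py')
for ds in cfg.subjective_datasets:
    print(ds['abbr'], ds['path'], ds['name'])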
137 changes: 137 additions & 0 deletions configs/eval_subjective_compassbench.py
@@ -0,0 +1,137 @@
from os import getenv as gv
from opencompass.models import HuggingFaceCausalLM
from mmengine.config import read_base

with read_base():
from .datasets.subjective.compassbench.compassbench_compare import subjective_datasets

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import CompassBenchSummarizer

api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)

# -------------Inference Stage ----------------------------------------

from opencompass.models import HuggingFacewithChatTemplate

models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='internlm2-chat-7b-hf',
path='internlm/internlm2-chat-7b',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
stop_words=['</s>', '<|im_end|>'],
generation_kwargs=dict(
do_sample=True,
),
)
]

datasets = [*subjective_datasets]

infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(
type=SlurmSequentialRunner,
partition='llmeval',
quotatype='reserved',
max_num_workers=256,
task=dict(type=OpenICLInferTask),
),
)

gpt4 = dict(
abbr='gpt4-turbo',
type=OpenAI,
path='gpt-4-1106-preview',
key='',  # The key is read from $OPENAI_API_KEY; you can also write your key here directly
meta_template=api_meta_template,
query_per_second=1,
max_out_len=2048,
max_seq_len=4096,
batch_size=4,
retry=20,
temperature=1,
)  # Re-run inference to obtain gpt4-turbo's predictions, or use the pre-committed gpt4 predictions instead

# -------------Evaluation Stage ----------------------------------------

## ------------- JudgeLLM Configuration
judge_models = [dict(
abbr='GPT4-Turbo',
type=OpenAI,
path='gpt-4-1106-preview',
key='',  # The key is read from $OPENAI_API_KEY; you can also write your key here directly
meta_template=api_meta_template,
query_per_second=1,
max_out_len=1024,
max_seq_len=4096,
batch_size=2,
retry=20,
temperature=0,
)]

judge_models = [  # this reassignment overrides the GPT4-Turbo judge defined above
dict(
type=HuggingFacewithChatTemplate,
abbr='internlm102b',
path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=4),
stop_words=['</s>', '<|im_end|>'],
),
dict(
type=HuggingFacewithChatTemplate,
abbr='internlm102b2',
path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=4),
stop_words=['</s>', '<|im_end|>'],
),
dict(
type=HuggingFacewithChatTemplate,
abbr='internlm102b3',
path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=4),
stop_words=['</s>', '<|im_end|>'],
)
]

## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(
type=SubjectiveSizePartitioner,
strategy='split',
max_task_size=10000000,
mode='m2n',
infer_order='double',
base_models=[gpt4],
compare_models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask)),
#given_pred = [{'abbr':'gpt4-turbo', 'path':''}]
)

work_dir = 'outputs/compassbench/'

summarizer = dict(type=CompassBenchSummarizer, summary_type='half_add')
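
Taken together, this eval config pits each entry in compare_models against each entry in base_models: SubjectiveSizePartitioner with mode='m2n' builds the cross product, infer_order='double' evaluates every pairing in both answer orders to reduce position bias, and each model in judge_models scores every comparison before CompassBenchSummarizer aggregates the verdicts. A run would typically be launched with OpenCompass's standard entry point, e.g. python run.py configs/eval_subjective_compassbench.py. The snippet below is only an illustrative sketch of the m2n pairing, not the partitioner's actual implementation:

# Illustrative sketch of the matchups an m2n partition with
# infer_order='double' produces (model names copied from the config above).
base_abbrs = ['gpt4-turbo']
compare_abbrs = ['internlm2-chat-7b-hf']
for base in base_abbrs:
    for cand in compare_abbrs:
        for order in ('base answers first', 'compare answers first'):
            print(f'{base} vs {cand} ({order})')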
8 changes: 4 additions & 4 deletions configs/summarizers/groups/charm_reason.py
@@ -20,16 +20,16 @@
]


charm_reaso_summary_groups = []
charm_reason_summary_groups = []
for prompt in prompts:
for region in regions:
subsets = ['charm-reason-' + region + '_' + task + '_' + prompt for task in charm_tasks]
charm_reaso_summary_groups.append({'name': 'charm-reason-' + region + '_' + prompt, 'subsets': subsets})
charm_reason_summary_groups.append({'name': 'charm-reason-' + region + '_' + prompt, 'subsets': subsets})

for prompt in prompts:
subsets = ['charm-reason-' + region + '_' + prompt for region in regions]
charm_reaso_summary_groups.append({'name': 'charm-reason-' + prompt, 'subsets': subsets})
charm_reason_summary_groups.append({'name': 'charm-reason-' + prompt, 'subsets': subsets})

charm_reaso_summary_groups.append(
charm_reason_summary_groups.append(
{'name': 'charm-reason-CoT', 'subsets': ['charm-reason-ZH-CoT', 'charm-reason-EN-CoT']}
)
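
This hunk only renames the misspelled charm_reaso_summary_groups variable to charm_reason_summary_groups; the grouping logic itself is unchanged. For reference, the loops produce entries shaped like the sketch below, where the region, prompt, and task names are placeholders rather than the actual values defined earlier in charm_reason.py:

# Hypothetical summary-group entry (placeholder region/prompt/task names).
example_group = {
    'name': 'charm-reason-ZH_CoT',
    'subsets': ['charm-reason-ZH_' + task + '_CoT'
                for task in ['task_a', 'task_b']],
}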
1 change: 1 addition & 0 deletions opencompass/datasets/subjective/__init__.py
@@ -1,6 +1,7 @@
from .alignbench import AlignmentBenchDataset # noqa: F401, F403
from .arena_hard import ArenaHardDataset # noqa: F401, F403
from .compass_arena import CompassArenaDataset # noqa: F401, F403
from .compassbench import CompassBenchDataset # noqa: F401, F403
from .corev2 import Corev2Dataset # noqa: F401, F403
from .creationbench import CreationBenchDataset # noqa: F401, F403
from .information_retrival import IRDataset # noqa: F401, F403
101 changes: 101 additions & 0 deletions opencompass/datasets/subjective/compassbench.py
@@ -0,0 +1,101 @@
# flake8: noqa
import json
import os.path as osp

from datasets import Dataset

from opencompass.registry import LOAD_DATASET

from ..base import BaseDataset

base_prompt_zh = """请根据 用户问题 以及 相应的两个回答,判断哪一个回答更好。
[用户问题]
{question}
[回答1开始]
{prediction}
[回答1结束]
[回答2开始]
{prediction2}
[回答2结束]
根据评分要求,请先对两个回答进行评价,最后在以下 3 个选项中做出选择:
A. 回答1更好
B. 回答2更好
C. 回答1、2平局
如果你认为回答1更好,你的输出应形如:
评价1:回答1 xxx
评价2:回答2 xxx
选择:[[A]]
如果你认为回答2更好,你的输出应形如:
评价1:回答1 xxx
评价2:回答2 xxx
选择:[[B]]
如果你认为回答1、2打成平手,你的输出应形如:
评价1:回答1 xxx
评价2:回答2 xxx
选择:[[C]]
"""

base_prompt_en = """Please evaluate the two responses based on the user's question and then choose from the following three options:
A. Response 1 is better
B. Response 2 is better
C. Both responses are equal
[user's question]
{question}
[Response 1 Start]
{prediction}
[Response 1 End]
[Response 2 Start]
{prediction2}
[Response 2 End]
If you believe that Response 1 is better, your output should be formatted as follows:
Evaluation 1: Response 1 xxx
Evaluation 2: Response 2 xxx
Choice: [[A]]
If you believe that Response 2 is better, your output should be formatted as follows:
Evaluation 1: Response 1 xxx
Evaluation 2: Response 2 xxx
Choice: [[B]]
If you believe that both responses are equally good, your output should be formatted as follows:
Evaluation 1: Response 1 xxx
Evaluation 2: Response 2 xxx
Choice: [[C]]
"""


@LOAD_DATASET.register_module()
class CompassBenchDataset(BaseDataset):

def load(self, path: str, name: str):
filename = osp.join(path, f'{name}.json')
raw_data = []
with open(filename, 'r', encoding='utf-8') as f:
json_data = json.load(f)
for problem in json_data:
question = problem['question']
lan = problem['language']
others = problem['others']
judge_prompt = base_prompt_zh if lan == 'zh' else base_prompt_en
raw_data.append({
'question': question,
'judge_prompt': judge_prompt,
'judge': {
'lan': lan,
'level': others['level'],
'category': problem['category'],
'question': question
}
})
dataset = Dataset.from_list(raw_data)
return dataset
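
CompassBenchDataset.load reads <name>.json from the given path, picks the Chinese or English judge template according to each record's language field, and stores the metadata the summarizer needs in the judge column; the template's {question}, {prediction} and {prediction2} placeholders are left for the evaluation stage to fill. A minimal sketch of the record shape it expects, with illustrative values rather than real CompassBench data:

# Illustrative record only; the actual data files under
# data/subjective/compassbench are not part of this commit.
example_record = {
    'question': 'Which sorting algorithm is stable?',  # hypothetical question
    'language': 'en',        # 'zh' selects base_prompt_zh, anything else base_prompt_en
    'others': {'level': 1},  # copied into the judge metadata as 'level'
    'category': 'knowledge', # hypothetical category label
}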
1 change: 1 addition & 0 deletions opencompass/summarizers/subjective/__init__.py
@@ -4,6 +4,7 @@
from .alpacaeval import AlpacaSummarizer
from .arenahard import ArenaHardSummarizer
from .compass_arena import CompassArenaSummarizer
from .compassbench import CompassBenchSummarizer
from .corev2 import Corev2Summarizer
from .creationbench import CreationBenchSummarizer
from .flames import FlamesSummarizer
