From f9151d7d811a22b10d5f090b18115fffbc121c0e Mon Sep 17 00:00:00 2001 From: qbc Date: Tue, 4 Jul 2023 10:26:02 +0800 Subject: [PATCH 01/23] update readme for docker (#654) --- .../llm/eval/eval_for_helm/README.md | 23 ++++++++++++------- .../federatedscope-torch2.0-helm.Dockerfile | 1 + 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/federatedscope/llm/eval/eval_for_helm/README.md b/federatedscope/llm/eval/eval_for_helm/README.md index 1129e0c01..eb25c34d5 100644 --- a/federatedscope/llm/eval/eval_for_helm/README.md +++ b/federatedscope/llm/eval/eval_for_helm/README.md @@ -3,8 +3,8 @@ ## Docker * Build images: - * Build from Dockerfile: `docker build -f federatedscope-torch2.0-helm.Dockerfile -t alibaba/federatedscope:helm .` - * Pull from docker hub: `TBD` + * Build from Dockerfile: `docker build -f federatedscope-torch2.0-helm.Dockerfile -t alibaba/federatedscope:fs_helm .` + * Pull from docker hub: `docker pull fsteam/federatedscope:fs_helm` * Download Helm evaluation dataset @@ -18,27 +18,27 @@ * Launch and mapping dataset and FS ```bash - docker run -u root: --gpus device=all -it --rm \ + docker run -p ${PORT}:${DOCKER_PORT} -u root: --gpus device=all -it --rm \ -v "${PATH_TO_HELM_DATA}/helm_data/benchmark_output:/root/src/helm/benchmark_output" \ -v "${PATH_TO_HELM_DATA}/helm_data/nltk_data:/root/nltk_data" \ -v "${PATH_TO_HELM_DATA}/helm_data/prompt_construction_settings.json:/tmp/prompt_construction_settings.json" \ -v "${PATH_TO_FS}:/root/FederatedScope" \ -v "${PATH_TO_CACHE}:/root/.cache" \ -w '/root/FederatedScope' \ - --name "helm_fs" alibaba/federatedscope:helm /bin/bash + --name "helm_fs" alibaba/federatedscope:fs_helm /bin/bash ``` Example for a root user: ```bash - docker run -u root: --gpus device=all -it --rm \ + docker run -p 8000:8000 -u root: --gpus device=all -it --rm \ -v "/root/helm_fs/helm_data/benchmark_output:/root/src/helm/benchmark_output" \ -v "/root/helm_fs/helm_data/nltk_data:/root/nltk_data" \ -v "/root/helm_fs/helm_data/prompt_construction_settings.json:/tmp/prompt_construction_settings.json" \ -v "/root/helm_fs/FederatedScope:/root/FederatedScope" \ -v "/root/.cache:/root/.cache" \ -w '/root/FederatedScope' \ - --name "helm_fs" alibaba/federatedscope:helm /bin/bash + --name "helm_fs" alibaba/federatedscope:fs_helm /bin/bash ``` * Install FS in container @@ -47,11 +47,18 @@ * Move to helm - * `cd /root/helm` + * `cd /root/src/crfm-helm` * Start to evaluate - * `helm-run --conf-paths federatedscope/llm/eval/eval_for_helm/run_specs.conf --enable-local-huggingface-model decapoda-research/llama-7b-hf --suite test -m 100 --local -n 1 --yaml federatedscope/llm/baseline/llama.yaml` + * `helm-run --conf-paths federatedscope/llm/eval/eval_for_helm/run_specs.conf --enable-local-huggingface-model decapoda-research/llama-7b-hf --suite ${SUITE_NAME} -m 100 --local -n 1` + + The above code will evaluate the model `decapoda-research/llama-7b-hf` and save the results in `${SUITE_NAME}`. + * If you want to test your own trained `ckpt` for `decapoda-research/llama-7b-hf`, please add parameters `--yaml /path/to/xxx.yaml` and `--ckpt_dir /dir/of/saved/ckpt` + * `bash evaluaton/setup_server.sh -n ${SUITE_NAME} -p ${PORT}` + + Run the above code and view the results on port `${PORT}`. + * Remark: Actually, it will always show the results of the last task. If you want to see the results of another task, say, the suite name is `result_of_exp1`, add `?suite=result_of_exp1` after the port address. 
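Since getting the bind mounts right is the most error-prone step, a quick host-side sanity check can help before running `docker run`. The sketch below is not part of the repository; the base path and suite name are placeholders for your own setup, and the expected entries simply mirror the `-v` mounts shown above.

```python
# Minimal sanity-check sketch (not part of the repo): the base path and suite
# name below are placeholders for your own setup.
import os

PATH_TO_HELM_DATA = "/root/helm_fs"   # host directory where helm_data.zip was unzipped
SUITE_NAME = "test"                   # the value later passed to `helm-run --suite`

# These three entries are bind-mounted into the container by the command above.
expected = [
    "helm_data/benchmark_output",
    "helm_data/nltk_data",
    "helm_data/prompt_construction_settings.json",
]
for rel in expected:
    path = os.path.join(PATH_TO_HELM_DATA, rel)
    status = "ok" if os.path.exists(path) else "missing"
    print(f"[{status}] {path}")

# After `helm-run` finishes, per-suite results appear on the host under:
print(os.path.join(PATH_TO_HELM_DATA, "helm_data", "benchmark_output",
                   "runs", SUITE_NAME))
```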
## Conda diff --git a/federatedscope/llm/eval/eval_for_helm/federatedscope-torch2.0-helm.Dockerfile b/federatedscope/llm/eval/eval_for_helm/federatedscope-torch2.0-helm.Dockerfile index 85e89c8e8..f835bfb5e 100644 --- a/federatedscope/llm/eval/eval_for_helm/federatedscope-torch2.0-helm.Dockerfile +++ b/federatedscope/llm/eval/eval_for_helm/federatedscope-torch2.0-helm.Dockerfile @@ -34,4 +34,5 @@ RUN conda update -y conda \ && conda config --add channels conda-forge # Install helm +cd /root/helm_fs RUN pip install -e git+https://github.com/qbc2016/helm.git@helm_for_fs#egg=crfm-helm From 3632d6fb530910ce3b32c999578b7548fa740a7e Mon Sep 17 00:00:00 2001 From: qbc Date: Tue, 4 Jul 2023 17:32:39 +0800 Subject: [PATCH 02/23] Update docker readme (#655) --- .../eval/eval_for_helm/federatedscope-torch2.0-helm.Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/federatedscope/llm/eval/eval_for_helm/federatedscope-torch2.0-helm.Dockerfile b/federatedscope/llm/eval/eval_for_helm/federatedscope-torch2.0-helm.Dockerfile index f835bfb5e..54d47b11c 100644 --- a/federatedscope/llm/eval/eval_for_helm/federatedscope-torch2.0-helm.Dockerfile +++ b/federatedscope/llm/eval/eval_for_helm/federatedscope-torch2.0-helm.Dockerfile @@ -34,5 +34,6 @@ RUN conda update -y conda \ && conda config --add channels conda-forge # Install helm -cd /root/helm_fs +RUN mkdir /root/helm_fs \ + && cd /root/helm_fs RUN pip install -e git+https://github.com/qbc2016/helm.git@helm_for_fs#egg=crfm-helm From eddd176814df5c274770c095835cbf02044f5c8d Mon Sep 17 00:00:00 2001 From: Weirui Kuang <39145382+rayrayraykk@users.noreply.github.com> Date: Wed, 5 Jul 2023 20:11:11 +0800 Subject: [PATCH 03/23] LLM readme & Dockerfile (#657) --- federatedscope/core/configs/cfg_llm.py | 9 +++ federatedscope/llm/dataloader/dataloader.py | 7 ++- .../misc/federatedscope-torch2.0.Dockerfile | 59 +++++++++++++++++++ federatedscope/llm/model/model_builder.py | 6 +- 4 files changed, 77 insertions(+), 4 deletions(-) create mode 100644 federatedscope/llm/misc/federatedscope-torch2.0.Dockerfile diff --git a/federatedscope/core/configs/cfg_llm.py b/federatedscope/core/configs/cfg_llm.py index 453d98676..98aff1eba 100644 --- a/federatedscope/core/configs/cfg_llm.py +++ b/federatedscope/core/configs/cfg_llm.py @@ -13,6 +13,15 @@ def extend_llm_cfg(cfg): cfg.llm = CN() cfg.llm.tok_len = 128 + # ---------------------------------------------------------------------- # + # Cache for LLM + # ---------------------------------------------------------------------- # + cfg.llm.cache = CN() + cfg.llm.cache.model = '' + + # ---------------------------------------------------------------------- # + # Chat tools for LLM + # ---------------------------------------------------------------------- # cfg.llm.chat = CN() cfg.llm.chat.max_history_len = 10 cfg.llm.chat.max_len = 100 diff --git a/federatedscope/llm/dataloader/dataloader.py b/federatedscope/llm/dataloader/dataloader.py index 412af8dca..f6bd94fc3 100644 --- a/federatedscope/llm/dataloader/dataloader.py +++ b/federatedscope/llm/dataloader/dataloader.py @@ -213,9 +213,10 @@ def load_llm_dataset(config=None, **kwargs): elif dataset_name.lower() == 'rosetta_alpaca': fp = os.path.join(config.data.root, 'rosetta_alpaca.json') download_url( - 'https://github.com/sahil280114/' - 'codealpaca/raw/master/data/' - 'rosetta_alpaca.json', config.data.root) + 'https://raw.githubusercontent.com/' + 'sahil280114/codealpaca/' + 'd269da106a579a623a654529b3cb91b5dfa9c72f/' + 'data/rosetta_alpaca.json', 
config.data.root) list_data_dict = load_json(fp, instruction='instruction', input='input', diff --git a/federatedscope/llm/misc/federatedscope-torch2.0.Dockerfile b/federatedscope/llm/misc/federatedscope-torch2.0.Dockerfile new file mode 100644 index 000000000..a93da5a94 --- /dev/null +++ b/federatedscope/llm/misc/federatedscope-torch2.0.Dockerfile @@ -0,0 +1,59 @@ +# The federatedscope image includes all runtime stuffs of federatedscope, +# with customized miniconda and required packages installed. + +# based on the nvidia-docker +# NOTE: please pre-install the NVIDIA drivers and `nvidia-docker2` in the host machine, +# see details in https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html +ARG ROOT_CONTAINER=nvidia/cuda:11.7.0-runtime-ubuntu20.04 + +FROM $ROOT_CONTAINER + +# Fix: https://github.com/hadolint/hadolint/wiki/DL4006 +# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014 +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +# shanghai zoneinfo +ENV TZ=Asia/Shanghai +RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone + +# install basic tools +RUN apt-get -y update \ + && apt-get -y install curl git gcc g++ make openssl libssl-dev libbz2-dev libreadline-dev libsqlite3-dev python-dev libmysqlclient-dev + +# install miniconda, in batch (silent) mode, does not edit PATH or .bashrc or .bash_profile +RUN apt-get update -y \ + && apt-get install -y wget +RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py39_23.1.0-1-Linux-x86_64.sh \ + && bash Miniconda3-py39_23.1.0-1-Linux-x86_64.sh -b \ + && rm Miniconda3-py39_23.1.0-1-Linux-x86_64.sh + +ENV PATH=/root/miniconda3/bin:${PATH} +RUN source activate + +RUN conda update -y conda \ + && conda config --add channels conda-forge + +# Install torch +RUN conda install -y pytorch==2.0.0 torchvision==0.15.0 torchaudio==2.0.0 pytorch-cuda=11.7 -c pytorch -c nvidia \ + && conda clean -a -y + +# Install FS-LLM +RUN cd /root \ + && git clone -b dev/llm https://github.com/alibaba/FederatedScope.git \ + && cd /root/FederatedScope \ + && pip install -e .[llm] \ + && pip cache purge + +# Prepare datas +RUN mkdir /root/FederatedScope/data \ + && cd /root/FederatedScope/data \ + && wget https://raw.githubusercontent.com/databrickslabs/dolly/d000e3030970379aabbf6d291f50ffdd3b715b64/data/databricks-dolly-15k.jsonl \ + && wget https://raw.githubusercontent.com/openai/grade-school-math/3101c7d5072418e28b9008a6636bde82a006892c/grade_school_math/data/train.jsonl -O gsm8k_train.jsonl \ + && wget https://raw.githubusercontent.com/openai/grade-school-math/2909d34ef28520753df82a2234c357259d254aa8/grade_school_math/data/test.jsonl -O gsm8k_test.jsonl \ + && wget https://raw.githubusercontent.com/sahil280114/codealpaca/d269da106a579a623a654529b3cb91b5dfa9c72f/data/rosetta_alpaca.json + +# Prepare Evaluation +RUN cd /root/FederatedScope \ + && git clone https://github.com/openai/human-eval \ + && pip install -e human-eval \ + && pip cache purge \ No newline at end of file diff --git a/federatedscope/llm/model/model_builder.py b/federatedscope/llm/model/model_builder.py index 4aa305d0e..49c8f0f53 100644 --- a/federatedscope/llm/model/model_builder.py +++ b/federatedscope/llm/model/model_builder.py @@ -4,7 +4,11 @@ def get_model_from_huggingface(model_name, config): from transformers import AutoModelForCausalLM - return AutoModelForCausalLM.from_pretrained(model_name) + kwargs = {} + if len(config.llm.cache.model): + kwargs['cache_dir'] = config.llm.cache.model + + return 
AutoModelForCausalLM.from_pretrained(model_name, **kwargs) def get_model_from_modelscope(model_name, config): From 3314d761716fab3a50c6aed2f452bbc4831ecf55 Mon Sep 17 00:00:00 2001 From: qbc Date: Wed, 5 Jul 2023 22:47:27 +0800 Subject: [PATCH 04/23] add prefix tuning, prompt tuning and p-tuning (#658) --- .../dolly_meta/dolly_meta_global.yaml | 43 +++++++++++++++++++ .../rosetta_9_clients/rosetta_global.yaml | 41 ++++++++++++++++++ federatedscope/llm/model/adapter_builder.py | 24 ++++++++--- 3 files changed, 101 insertions(+), 7 deletions(-) create mode 100644 federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_global.yaml create mode 100644 federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_global.yaml diff --git a/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_global.yaml b/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_global.yaml new file mode 100644 index 000000000..77bcf854e --- /dev/null +++ b/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_global.yaml @@ -0,0 +1,43 @@ +use_gpu: True +device: 0 +early_stop: + patience: 0 +federate: + mode: standalone + client_num: 1 + total_round_num: 500 + save_to: "llama_dolly_meta_global_30*500_0.0005_64_0.1.ckpt" + save_freq: 100 + share_local_model: True + online_aggr: False +data: + root: data/ + type: 'dolly-15k@llm' + splits: [0.99, 0.0, 0.01] + splitter: 'meta' +llm: + tok_len: 650 + chat: + max_len: 1000 + adapter: + use: True + args: [ { 'adapter_package': 'peft', 'adapter_method': 'lora', 'r': 8, 'lora_alpha': 64, 'lora_dropout': 0.1 } ] +dataloader: + batch_size: 1 +model: + type: 'decapoda-research/llama-7b-hf@huggingface_llm' +train: + local_update_steps: 30 + batch_or_epoch: batch + optimizer: + lr: 0.005 + is_enable_half: True +criterion: + type: CrossEntropyLoss +trainer: + type: llmtrainer +eval: + freq: 50 + metrics: ['loss'] + split: ['test'] + best_res_update_round_wise_key: test_loss \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_global.yaml b/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_global.yaml new file mode 100644 index 000000000..fc253106a --- /dev/null +++ b/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_global.yaml @@ -0,0 +1,41 @@ +use_gpu: True +device: 0 +early_stop: + patience: 0 +federate: + mode: standalone + client_num: 1 + total_round_num: 500 + save_to: "llama_rosetta_global_30*500_0.001_32_0.1.ckpt" + save_freq: 100 +data: + root: data/ + type: 'rosetta_alpaca@llm' + splits: [0.89,0.1,0.01] + splitter: 'meta' +llm: + tok_len: 512 + chat: + max_len: 1000 + adapter: + use: True + args: [ { 'adapter_package': 'peft', 'adapter_method': 'lora', 'r': 8, 'lora_alpha': 32, 'lora_dropout': 0.1 } ] +dataloader: + batch_size: 1 +model: + type: 'decapoda-research/llama-7b-hf@huggingface_llm' +train: + local_update_steps: 30 + batch_or_epoch: batch + optimizer: + lr: 0.001 + weight_decay: 0.0 + is_enable_half: True +criterion: + type: CrossEntropyLoss +trainer: + type: llmtrainer +eval: + freq: 50 + metrics: ['loss'] + best_res_update_round_wise_key: val_loss \ No newline at end of file diff --git a/federatedscope/llm/model/adapter_builder.py b/federatedscope/llm/model/adapter_builder.py index 52157dc34..f25002cdf 100644 --- a/federatedscope/llm/model/adapter_builder.py +++ b/federatedscope/llm/model/adapter_builder.py @@ -14,15 +14,25 @@ def enable_adapter(model, package, adapter, **kwargs): Prompt Tuning AdaLoRA """ - from peft import get_peft_model + from peft import 
get_peft_model, TaskType if adapter == 'lora': from peft import LoraConfig - r = kwargs.get('lora_r', 8) - lora_alpha = kwargs.get('lora_alpha', 32) - lora_dropout = kwargs.get('lora_dropout', 0.1) - peft_config = LoraConfig(r=r, - lora_alpha=lora_alpha, - lora_dropout=lora_dropout) + peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, **kwargs) + model = get_peft_model(model, peft_config) + elif adapter == 'prefix': + from peft import PrefixTuningConfig + peft_config = PrefixTuningConfig(task_type=TaskType.CAUSAL_LM, + **kwargs) + model = get_peft_model(model, peft_config) + elif adapter == 'prompt': + from peft import PromptTuningConfig + peft_config = PromptTuningConfig(task_type=TaskType.CAUSAL_LM, + **kwargs) + model = get_peft_model(model, peft_config) + elif adapter == 'p-tuning': + from peft import PromptEncoderConfig + peft_config = PromptEncoderConfig(task_type=TaskType.CAUSAL_LM, + **kwargs) model = get_peft_model(model, peft_config) else: raise NotImplementedError From 9cb009f40b97bbc31ec718bcd82e38dc3ab280e0 Mon Sep 17 00:00:00 2001 From: qbc Date: Thu, 6 Jul 2023 09:46:26 +0800 Subject: [PATCH 05/23] Update docker readme (#656) --- federatedscope/llm/eval/eval_for_helm/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/federatedscope/llm/eval/eval_for_helm/README.md b/federatedscope/llm/eval/eval_for_helm/README.md index eb25c34d5..0eb827093 100644 --- a/federatedscope/llm/eval/eval_for_helm/README.md +++ b/federatedscope/llm/eval/eval_for_helm/README.md @@ -3,7 +3,7 @@ ## Docker * Build images: - * Build from Dockerfile: `docker build -f federatedscope-torch2.0-helm.Dockerfile -t alibaba/federatedscope:fs_helm .` + * Build from Dockerfile: `docker build -f federatedscope-torch2.0-helm.Dockerfile -t fsteam/federatedscope:fs_helm .` * Pull from docker hub: `docker pull fsteam/federatedscope:fs_helm` * Download Helm evaluation dataset @@ -25,7 +25,7 @@ -v "${PATH_TO_FS}:/root/FederatedScope" \ -v "${PATH_TO_CACHE}:/root/.cache" \ -w '/root/FederatedScope' \ - --name "helm_fs" alibaba/federatedscope:fs_helm /bin/bash + --name "helm_fs" fsteam/federatedscope:fs_helm /bin/bash ``` Example for a root user: @@ -38,7 +38,7 @@ -v "/root/helm_fs/FederatedScope:/root/FederatedScope" \ -v "/root/.cache:/root/.cache" \ -w '/root/FederatedScope' \ - --name "helm_fs" alibaba/federatedscope:fs_helm /bin/bash + --name "helm_fs" fsteam/federatedscope:fs_helm /bin/bash ``` * Install FS in container @@ -94,4 +94,4 @@ Remark: For the second run of decapoda-research/llama-7b-hf, it not work, in ~/helm_fs/src/crfm-helm/data/decapoda-research--llama-7b-hf/snapshots/xxxx/tokenizer_config.json, change -"tokenizer_class": "LLaMATokenizer" -> "tokenizer_class": "LlamaTokenizer" \ No newline at end of file +"tokenizer_class": "LLaMATokenizer" -> "tokenizer_class": "LlamaTokenizer" From 6c74fb994c4ca68588c3b7f19b344d0a9732acc6 Mon Sep 17 00:00:00 2001 From: rayrayraykk <18007356109@163.com> Date: Thu, 6 Jul 2023 10:25:41 +0800 Subject: [PATCH 06/23] fix save bug --- federatedscope/core/auxiliaries/utils.py | 5 +++++ federatedscope/core/workers/server.py | 12 +++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/federatedscope/core/auxiliaries/utils.py b/federatedscope/core/auxiliaries/utils.py index e1014e8c8..06cfd24d1 100644 --- a/federatedscope/core/auxiliaries/utils.py +++ b/federatedscope/core/auxiliaries/utils.py @@ -177,3 +177,8 @@ def get_resource_info(filename): with open(filename, 'br') as f: device_info = pickle.load(f) return 
device_info + + +def add_prefix_to_path(path, prefix): + directory, file = os.path.split(path) + return os.path.join(dir, prefix + file) diff --git a/federatedscope/core/workers/server.py b/federatedscope/core/workers/server.py index 372a141d1..131d5ebbc 100644 --- a/federatedscope/core/workers/server.py +++ b/federatedscope/core/workers/server.py @@ -13,7 +13,7 @@ from federatedscope.core.auxiliaries.aggregator_builder import get_aggregator from federatedscope.core.auxiliaries.sampler_builder import get_sampler from federatedscope.core.auxiliaries.utils import merge_dict_of_results, \ - Timeout, merge_param_dict + Timeout, merge_param_dict, add_prefix_to_path from federatedscope.core.auxiliaries.trainer_builder import get_trainer from federatedscope.core.secret_sharing import AdditiveSecretSharing from federatedscope.core.workers.base_server import BaseServer @@ -405,7 +405,8 @@ def check_and_save(self): if self.state != self.total_round_num and \ self.state % self._cfg.federate.save_freq == 0 and \ self._cfg.federate.save_freq > 0: - path = f'{self.state}_' + self._cfg.federate.save_to + path = add_prefix_to_path(f'{self.state}_', + self._cfg.federate.save_to) self.aggregator.save_model(path, self.state) if should_stop or self.state == self.total_round_num: @@ -526,10 +527,11 @@ def save_best_results(self): """ To Save the best evaluation results. """ - + # Save final round model if self._cfg.federate.save_to != '': - self.aggregator.save_model(f'final_{self._cfg.federate.save_to}', - self.state) + self.aggregator.save_model( + add_prefix_to_path('final_', self._cfg.federate.save_to), + self.state) formatted_best_res = self._monitor.format_eval_res( results=self.best_results, rnd="Final", From 44782091babc327359d2ca450f3a6aaf6f699055 Mon Sep 17 00:00:00 2001 From: rayrayraykk <18007356109@163.com> Date: Thu, 6 Jul 2023 10:31:36 +0800 Subject: [PATCH 07/23] fix --- federatedscope/core/auxiliaries/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/federatedscope/core/auxiliaries/utils.py b/federatedscope/core/auxiliaries/utils.py index 06cfd24d1..f9b4f239e 100644 --- a/federatedscope/core/auxiliaries/utils.py +++ b/federatedscope/core/auxiliaries/utils.py @@ -181,4 +181,4 @@ def get_resource_info(filename): def add_prefix_to_path(path, prefix): directory, file = os.path.split(path) - return os.path.join(dir, prefix + file) + return os.path.join(directory, prefix + file) From 880d602b8b26e1c9d12331c260274a304a4a9b9e Mon Sep 17 00:00:00 2001 From: rayrayraykk <18007356109@163.com> Date: Thu, 6 Jul 2023 10:35:53 +0800 Subject: [PATCH 08/23] fix minor bugs --- federatedscope/core/auxiliaries/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/federatedscope/core/auxiliaries/utils.py b/federatedscope/core/auxiliaries/utils.py index f9b4f239e..4126dc710 100644 --- a/federatedscope/core/auxiliaries/utils.py +++ b/federatedscope/core/auxiliaries/utils.py @@ -179,6 +179,6 @@ def get_resource_info(filename): return device_info -def add_prefix_to_path(path, prefix): +def add_prefix_to_path(prefix, path): directory, file = os.path.split(path) return os.path.join(directory, prefix + file) From cdc17bba3b280296ff533a85a450b5f311e3d182 Mon Sep 17 00:00:00 2001 From: Weirui Kuang <39145382+rayrayraykk@users.noreply.github.com> Date: Thu, 6 Jul 2023 12:04:48 +0800 Subject: [PATCH 09/23] Fix share_local_model compatibility with model.half() (#660) --- federatedscope/core/workers/server.py | 2 ++ federatedscope/llm/trainer/trainer.py | 5 +++++ 2 files changed, 7 
insertions(+) diff --git a/federatedscope/core/workers/server.py b/federatedscope/core/workers/server.py index 131d5ebbc..4478e41e0 100644 --- a/federatedscope/core/workers/server.py +++ b/federatedscope/core/workers/server.py @@ -91,6 +91,8 @@ def __init__(self, if self._cfg.federate.share_local_model \ and not self._cfg.federate.process_num > 1: + if self._cfg.train.is_enable_half: + model = model.half() # put the model to the specified device model.to(device) # Build aggregator diff --git a/federatedscope/llm/trainer/trainer.py b/federatedscope/llm/trainer/trainer.py index 6914c66cb..3763d0ddc 100644 --- a/federatedscope/llm/trainer/trainer.py +++ b/federatedscope/llm/trainer/trainer.py @@ -85,6 +85,11 @@ def _hook_on_batch_forward_flop_count(self, ctx): ``ctx.monitor`` Track average flops ================================== =========================== """ + + # The process may occupy a large amount of video memory + # if the garbage collection is not triggered in time + # when there is plenty of video memory left. Set + # `eval.count_flops = False` to avoid this. if not isinstance(ctx.monitor, Monitor): logger.warning( f"The trainer {type(self)} does contain a valid monitor, " From 5b689189572fc4a68efebc671cc5f80bd1b32de5 Mon Sep 17 00:00:00 2001 From: qbc Date: Mon, 10 Jul 2023 11:18:40 +0800 Subject: [PATCH 10/23] Update readme for fshelm (#662) --- .../llm/eval/eval_for_helm/README.md | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/federatedscope/llm/eval/eval_for_helm/README.md b/federatedscope/llm/eval/eval_for_helm/README.md index 0eb827093..cb4960a51 100644 --- a/federatedscope/llm/eval/eval_for_helm/README.md +++ b/federatedscope/llm/eval/eval_for_helm/README.md @@ -8,7 +8,7 @@ * Download Helm evaluation dataset - * `wget https://${NOT_AVAILABLE_NOW}/helm_data.zip -O ${PATH_TO_HELM_DATA}/helm_data.zip` + * `wget https://federatedscope.oss-cn-beijing.aliyuncs.com/helm_data.zip -O ${PATH_TO_HELM_DATA}/helm_data.zip` * `unzip ${PATH_TO_HELM_DATA}/helm_data.zip` * Prepare FS and related `ckpt` and `yaml` @@ -51,10 +51,13 @@ * Start to evaluate - * `helm-run --conf-paths federatedscope/llm/eval/eval_for_helm/run_specs.conf --enable-local-huggingface-model decapoda-research/llama-7b-hf --suite ${SUITE_NAME} -m 100 --local -n 1` - - The above code will evaluate the model `decapoda-research/llama-7b-hf` and save the results in `${SUITE_NAME}`. - * If you want to test your own trained `ckpt` for `decapoda-research/llama-7b-hf`, please add parameters `--yaml /path/to/xxx.yaml` and `--ckpt_dir /dir/of/saved/ckpt` + * `helm-run --conf-paths federatedscope/llm/eval/eval_for_helm/run_specs.conf --enable-local-huggingface-model decapoda-research/llama-7b-hf --suite ${SUITE_NAME} -m 100 --local -n 1 --skip-completed-runs --local-path xxx` + * The above code will evaluate the model `decapoda-research/llama-7b-hf` and save the results in `/benchmark_output/runs/${SUITE_NAME}`. + * `-m 100` means that there will be 100 items in each task. + * `--skip-completed-runs` means that when restarted, it will skip the completed test sets. It is recommended to add this if you no dot want to waste your time for the completed tasks. + * `--local-path xxx` means the directory to put cache files, default value is `prod_env`. It will always use it when you run a new task. It is recommended that before running a new task, delete it or assign a new name to it. 
+ * If you want to test your own trained `ckpt` for `decapoda-research/llama-7b-hf`, please add parameters `--yaml /path/to/xxx.yaml`. If you want to modify the configurations in `yaml`, just add parameters similar to the behaviors in FS. For example, add `federate.save_to xxxx.ckpt` to change the ckpt. +* Launch webserver to view results * `bash evaluaton/setup_server.sh -n ${SUITE_NAME} -p ${PORT}` Run the above code and view the results on port `${PORT}`. @@ -73,25 +76,25 @@ * `git clone -b dev/llm https://github.com/alibaba/FederatedScope.git` * `cd FederatedScope` * `pip install -e .[llm]` -* Unzip `helm_data.zip` and move data +* Download and unzip Helm evaluation dataset + * `wget https://federatedscope.oss-cn-beijing.aliyuncs.com/helm_data.zip -O ${PATH_TO_HELM_DATA}/helm_data.zip` + * `unzip ${PATH_TO_HELM_DATA}/helm_data.zip` +* Move files * `benchmark_output` -> `~/helm_fs/src/crfm-helm/benchmark_output` * `nltk_data` -> `~/nltk_data` * `prompt_construction_settings.json` - > `/tmp/prompt_construction_settings.json` - * In `~/helm_fs/src/crfm-helm/benchmark_output`, do `mkdir runs` * Move ckpt and yaml * Start to evaluate - * `helm-run --conf-paths federatedscope/llm/eval/eval_for_helm/run_specs.conf --enable-local-huggingface-model decapoda-research/llama-7b-hf --suite test -m 100 --local -n 1 --yaml federatedscope/llm/baseline/llama.yaml --ckpt_dir xxxx --skip-completed-runs --local-path xxx` - * If the program terminated due to network issues, --skip-completed-runs means that when restart, it will skip the completed test sets. It is recommended to add this all the time. - * --local-path xxx means the directory to put cache files, default value is prod_env. It will always use it when you run a new task. It is recommended that before running a new task, delete it or assign a new name to it. + * `helm-run --conf-paths federatedscope/llm/eval/eval_for_helm/run_specs.conf --enable-local-huggingface-model decapoda-research/llama-7b-hf --suite ${SUITE_NAME} -m 100 --local -n 1 --skip-completed-runs --local-path xxx` * Launch webserver to view results - * In ~/helm_fs/src/crfm-helm/evaluation/setup_server.sh, set - * `SUITE_NAME=${suite}` + * In `~/helm_fs/src/crfm-helm/evaluation/setup_server.sh`, set + * `SUITE_NAME=${SUITE_NAME}` * `PATH_HELM=~/helm_fs/src/crfm-helm` * `PATH_HELM=~/helm_fs/src/crfm-helm` * `root/miniconda3/bin/python -> ${which python}` - * `bash evaluation/setup_server.sh` + * `bash evaluation/setup_server.sh -n ${SUITE_NAME} -p ${PORT}` * Remark: Actually, it will show the result of the last task. If you want to see the result of another task, say, the suite name is result_of_exp1, add `?suite=result_of_exp1`after the port address. 
-Remark: For the second run of decapoda-research/llama-7b-hf, it not work, in ~/helm_fs/src/crfm-helm/data/decapoda-research--llama-7b-hf/snapshots/xxxx/tokenizer_config.json, change +Remark: For the second run of `decapoda-research/llama-7b-hf`, if not work, in ~/helm_fs/src/crfm-helm/data/decapoda-research--llama-7b-hf/snapshots/xxxx/tokenizer_config.json, change "tokenizer_class": "LLaMATokenizer" -> "tokenizer_class": "LlamaTokenizer" From f0c4e42cbb8751a3b5045a789b7c2d2dc0e55d62 Mon Sep 17 00:00:00 2001 From: Weirui Kuang <39145382+rayrayraykk@users.noreply.github.com> Date: Tue, 11 Jul 2023 12:06:21 +0800 Subject: [PATCH 11/23] README for LLM (#661) --- federatedscope/llm/README.md | 219 +++++++++++++++++++- federatedscope/llm/model/adapter_builder.py | 1 + 2 files changed, 219 insertions(+), 1 deletion(-) diff --git a/federatedscope/llm/README.md b/federatedscope/llm/README.md index 6b81d9cd0..70dc472df 100644 --- a/federatedscope/llm/README.md +++ b/federatedscope/llm/README.md @@ -1 +1,218 @@ -# TBD \ No newline at end of file +# FederatedScope-LLM + +FederatedScope-LLM (FS-LLM) is a unified, comprehensive and efficient package for federated large language model. We provide a hands-on tutorial here, while for more detailed tutorial, please refer to [TO-BE-RELEASED](). + +## Quick Start + +Let’s start with finetuning GPT-2 on [Alpaca](https://github.com/tatsu-lab/stanford_alpaca) to familiarize you with FS-LLM. + +### Step 1. Installation + +The installation of FS-LLM is similar to minimal FS, except that it requires **Pytorch>=1.13.0** (we recommend version 2.0.X) because of the [PEFT](https://github.com/huggingface/peft) dependency: + +```bash +# Create virtual environments with conda +conda create -n fs-llm python=3.9 +conda activate fs-llm + +# Install Pytorch>=1.13.0 (e.g., Pytorch==2.0.0) +conda install pytorch==2.0.0 torchvision==0.15.0 torchaudio==2.0.0 pytorch-cuda=11.7 -c pytorch -c nvidia + +# Install FS-LLM with editable mode +pip install -e .[llm] +``` + +Now, you have successfully installed the FS-LLM. + +### Step 2. Run with exmaple config + +Now, we can fine-tune a GPT2 on Alpaca with FedAvg. + +```bash +python federatedscope/main.py --cfg federatedscope/llm/baseline/testcase.yaml +``` + +For more details about customized configurations, see **Advanced**. + +## Advanced + +### Start with built-in functions + +You can easily run through a customized `yaml` file. Here we only introduce the configuration related to FS-LLM, other configurations please refer to [Configurations](https://github.com/alibaba/FederatedScope/blob/master/federatedscope/core/configs/README.md). For more examples, please refer to `federatedscope/llm/baseline`. + +```yaml +# For this configuration, you might need a GPU with at least 32GB of video memory to run. 
+ +# Whether to use GPU +use_gpu: True + +# Deciding which GPU to use +device: 0 + +# Early stop steps, set `0` to disable +early_stop: + patience: 0 + +# Federate learning related options +federate: + # `standalone` or `distributed` + mode: standalone + # Number of communication round + total_round_num: 500 + # Saving path for ckpt + save_to: "llama_rosetta_9_fed.ckpt" + # Number of dataset being split + client_num: 9 + # Enable for saving memory, all workers share the same model instance + share_local_model: True + +# Dataset related options +data: + # Root directory where the data stored + root: data/ + # Dataset name + type: 'rosetta_alpaca@llm' + # Train/val/test splits + splits: [0.89,0.1,0.01] + # Use meta inforamtion to split `rosetta_alpaca` + splitter: 'meta' + +# LLM related options +llm: + # Max token length for model input (training) + tok_len: 650 + # ChatBot related options + chat: + # Max token length for model input (inference) + max_len: 1000 + # Max number of history texts + max_history_len: 10 + # Path for store model cache, default in `~/.cache/` + cache: + model: '' + # PEFT related options + adapter: + # Set ture to enable PEFT finetuning + use: True + # Args for PEFT finetuning + args: [ { 'adapter_package': 'peft', 'adapter_method': 'lora', 'r': 8, 'lora_alpha': 32, 'lora_dropout': 0.1 } ] + +# DataLoader related options +dataloader: + # Batch size for iter loader + batch_size: 1 + +# Model related options +model: + # Model type (format: {MODEL_REPO}@huggingface_llm) + type: 'decapoda-research/llama-7b-hf@huggingface_llm' + +# Train related options +train: + # Number of local update steps + local_update_steps: 30 + # `batch` or `epoch` for local_update_steps + batch_or_epoch: batch + # Optimizer related options + optimizer: + # Learning rate + lr: 0.003 + # Weight decay + weight_decay: 0.0 + # Set ture to enable `model.half()` + is_enable_half: True + +# Trainer related options +trainer: + # Trainer type + type: llmtrainer + +# Evaluation related options +eval: + # Frequency of evaluation + freq: 50 + # Evaluation metrics + metrics: ['loss'] + # Set key to track best model + best_res_update_round_wise_key: val_loss +``` + +### DataZoo + +In general, we use instruction SFT following [Alpaca](https://github.com/tatsu-lab/stanford_alpaca) team. And in standalone mode, all dataset can be split into several clients with spesific `splitter` (i.e., `lda`, `meta`, `iid`) and `federate.num_client`. + +#### Built-in Data + +| data.type | Source | Note | +| --------------------- | ----------------------------------------------------- | --------------------------------------------------- | +| `alpaca@llm` | [Link](https://github.com/tatsu-lab/stanford_alpaca) | `IIDSplitter` | +| `alpaca_cleaned@llm` | [Link](https://github.com/gururise/AlpacaDataCleaned) | `IIDSplitter` | +| `dolly-15k@llm` | [Link](https://github.com/databrickslabs/dolly) | `LDASplitter` or `MetaSplitter` split to 8 clients. | +| `gsm8k@llm` | [Link](https://github.com/openai/grade-school-math) | `IIDSplitter` | +| `rosetta_alpaca@llm` | [Link](https://github.com/sahil280114/codealpaca) | `LDASplitter` or `MetaSplitter` split to 9 clients. | +| `code_search_net@llm` | [Link](https://github.com/github/CodeSearchNet) | `LDASplitter` or `MetaSplitter` split to 6 clients. 
| + +#### Self-maintained Data + +| data.type | Note | +| ------------------------- | ------------------------------------------------------------ | +| `YOU_DATA_NAME.json@llm` | Format: `[{'instruction': ..., 'input': ..., 'output':...}]`, default key: `instruction`, `input`, `output`, `category` | +| `YOU_DATA_NAME.jsonl@llm` | Format of each line: `{'instruction': ..., 'input': ..., 'output':...}`, default key: `instruction`, `input`, `output`, `category` | + +#### Evaluation tools + +We evaluate model domain capability of fine-tuned models with easy-to-use evaluation tools. + +```bash +FederatedScope +├── federatedscope +│ ├── llm +│ │ ├── eval +│ │ │ ├── eval_for_code +│ │ │ ├── eval_for_gsm8k +│ │ │ ├── eval_for_helm +│ │ │ ├── eval_for_mmlu +... +``` + +How to use: + +For example, to evaluate the model fine-tuned with `python federatedscope/main.py --cfg sft_gsm8k.yaml`, you can run `python federatedscope/llm/eval/eval_for_gsm8k/eval.py --cfg sft_gsm8k.yaml` in the `eval_for_gsm8k` directory. For other usages, please refer to the `README.md` file in each subdirectory. + +### AlgoZoo + +#### Parameter-Efficient Fine-Tuning + +With the help of parameter-efficient fine-tuning methods, federally fine-tuning a large model requires passing only a very small percentage of model parameters (adapters), making it possible for the client enable efficient adaptation of pre-trained language models to various downstream applications. We adopt [PEFT](https://github.com/huggingface/peft) for fine-tuning LLMs, and more methods are coming soon! + +| Methods | Source | Example for `llm.adapter.args` | +| ------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | +| LoRA | [Link](https://arxiv.org/abs/2106.09685) | `[ { 'adapter_package': 'peft', 'adapter_method': 'lora', 'r': 8, 'lora_alpha': 32, 'lora_dropout': 0.1 } ]` | +| Prefix Tuning | [Link](https://aclanthology.org/2021.acl-long.353/), [Link](https://arxiv.org/pdf/2110.07602.pdf) | `[{'adapter_package': 'peft', 'adapter_method': 'prefix', 'prefix_projection': False, 'num_virtual_tokens': 20}]` | +| P-Tuning | [Link](https://arxiv.org/abs/2103.10385) | `[{'adapter_package': 'peft', 'adapter_method': 'p-tuning', 'encoder_reparameterization_type': 'MLP', 'encoder_dropout': 0.1, 'num_virtual_tokens': 20}]` | +| Prompt Tuning | [Link](https://arxiv.org/abs/2104.08691) | `[{'adapter_package': 'peft', 'adapter_method': 'prompt', 'prompt_tuning_init': 'RANDOM', 'num_virtual_tokens': 20}]` | + +#### Federate fine-tune closed-source LLMs + +We support federated fine-tuning not only for open-source LLMs, but also for closed-source LLMs. In this scenario, clients can fine-tune LLMs without fully accessing the model, where models and data are both considered as privacy. + +| Methods | Source | How to enable | +| -------------- | ---------------------------------------- | ----------------------------- | +| Offsite-Tuning | [Link](https://arxiv.org/abs/2302.04870) | `llm.offsute_tuning.use=True` | + +#### Federate fine-tune with multi-card + +To make the federate fine-tuning efficient, we adopt a series of multi-card acceleration operators. 
+ +| Methods | Source | How to use | Note | +| --------------------- | ------------------------------------------------------------ | -------------------------------- | --------------------------------------------- | +| torch.nn.DataParallel | [Link](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) | `cfg.train.data_para_dids=[0,1]` | - | +| DeepSpeed | [Link](https://github.com/microsoft/DeepSpeed) | Coming soon | Use `nvcc - V` to make sure `CUDA` installed. | + +## FAQ + +- `WARNING: Skip the batch due to the loss is NaN, it may be caused by exceeding the precision or invalid labels.` + - Possible reason 1: This is because `llm.tok_len` limits the input length, causing the label to be empty, which automatically skips that data. Setting a larger `llm.tok_len` can avoid this. + - Possible reason 2: Due to the enabling of `train.is_enable_half`, numerical overflow may occur. This usually happens when setting the `optimizer.type` to `Adam`, since the default `eps` is `1e-8` but `fp16` requires at least `1e-5`. +- `ValueError: Tokenizer class LLaMATokenizer does not exist or is not currently imported. ` + - This is a problem with `transformers`, you can fix it in your local file. Replace `LLaMATokenizer` with `LlamaTokenizer` in `PATH_TO_DATA_ROOT/MODEL_REPO/snapshots/..../tokenizer_config.json` diff --git a/federatedscope/llm/model/adapter_builder.py b/federatedscope/llm/model/adapter_builder.py index f25002cdf..43a9c2dc2 100644 --- a/federatedscope/llm/model/adapter_builder.py +++ b/federatedscope/llm/model/adapter_builder.py @@ -36,6 +36,7 @@ def enable_adapter(model, package, adapter, **kwargs): model = get_peft_model(model, peft_config) else: raise NotImplementedError + model.print_trainable_parameters() elif package == 'adapterhub': """ From bed8da91805ba2f6fbbf4b08563df65f7e9012c3 Mon Sep 17 00:00:00 2001 From: Weirui Kuang <39145382+rayrayraykk@users.noreply.github.com> Date: Wed, 12 Jul 2023 18:05:17 +0800 Subject: [PATCH 12/23] fix minor bugs in fschat(#663) --- federatedscope/llm/misc/fschat.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/federatedscope/llm/misc/fschat.py b/federatedscope/llm/misc/fschat.py index 54411aaf0..c6578d6fe 100644 --- a/federatedscope/llm/misc/fschat.py +++ b/federatedscope/llm/misc/fschat.py @@ -52,9 +52,9 @@ def __init__(self, config): print(f"{error}, will use raw model.") if config.train.is_enable_half: - self.model.half().to(self.device) - else: - self.model.to(self.device) + self.model.half() + + self.model = self.model.to(self.device) self.model = self.model.eval() if torch.__version__ >= "2" and sys.platform != "win32": self.model = torch.compile(self.model) @@ -80,7 +80,7 @@ def predict(self, input_text, use_history=True, use_prompt=True): input_ids.extend(text_ids) input_ids = torch.tensor(input_ids).long() input_ids = input_ids.unsqueeze(0).to(self.device) - response = self.model.generate(input_ids, + response = self.model.generate(input_ids=input_ids, max_new_tokens=self.max_len, num_beams=4, no_repeat_ngram_size=2, From 31e707f87d725955b7d8e7a710ef12bd68673af7 Mon Sep 17 00:00:00 2001 From: Weirui Kuang <39145382+rayrayraykk@users.noreply.github.com> Date: Thu, 13 Jul 2023 11:08:45 +0800 Subject: [PATCH 13/23] fix share_local_model (#665) --- federatedscope/core/workers/server.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/federatedscope/core/workers/server.py b/federatedscope/core/workers/server.py index 4478e41e0..80230448a 100644 --- 
a/federatedscope/core/workers/server.py +++ b/federatedscope/core/workers/server.py @@ -678,11 +678,23 @@ def broadcast_model_para(self, self.models[model_idx_i]) skip_broadcast = self._cfg.federate.method in ["local", "global"] - if self.model_num > 1: - model_para = [{} if skip_broadcast else model.state_dict() - for model in self.models] + if self._cfg.federate.share_local_model and not \ + self._cfg.federate.online_aggr: + if self.model_num > 1: + model_para = [ + {} if skip_broadcast else copy.deepcopy(model.state_dict()) + for model in self.models + ] + else: + model_para = {} if skip_broadcast else copy.deepcopy( + self.models[0].state_dict()) else: - model_para = {} if skip_broadcast else self.models[0].state_dict() + if self.model_num > 1: + model_para = [{} if skip_broadcast else model.state_dict() + for model in self.models] + else: + model_para = {} if skip_broadcast else self.models[ + 0].state_dict() # quantization if msg_type == 'model_para' and not skip_broadcast and \ From 3a6a84425de60effade89af814bf088f642a82da Mon Sep 17 00:00:00 2001 From: Weirui Kuang <39145382+rayrayraykk@users.noreply.github.com> Date: Fri, 14 Jul 2023 11:57:42 +0800 Subject: [PATCH 14/23] Fix yaml and add warnings for count flops (#666) --- federatedscope/llm/README.md | 8 +++++--- .../llm/baseline/exp_yaml/alpaca/alpaca_federate.yaml | 3 ++- .../llm/baseline/exp_yaml/alpaca/alpaca_global.yaml | 3 ++- .../baseline/exp_yaml/alpaca/alpaca_local_client_1.yaml | 3 ++- .../baseline/exp_yaml/alpaca/alpaca_local_client_2.yaml | 3 ++- .../baseline/exp_yaml/alpaca/alpaca_local_client_3.yaml | 3 ++- .../llm/baseline/exp_yaml/csn/csn_federate.yaml | 3 ++- .../llm/baseline/exp_yaml/csn/csn_local_client_1.yaml | 3 ++- .../llm/baseline/exp_yaml/csn/csn_local_client_2.yaml | 3 ++- .../llm/baseline/exp_yaml/csn/csn_local_client_3.yaml | 3 ++- .../llm/baseline/exp_yaml/csn/csn_local_client_4.yaml | 3 ++- .../llm/baseline/exp_yaml/csn/csn_local_client_5.yaml | 3 ++- .../llm/baseline/exp_yaml/csn/csn_local_client_6.yaml | 3 ++- .../llm/baseline/exp_yaml/dolly_lda/dolly_federate.yaml | 3 ++- .../llm/baseline/exp_yaml/dolly_lda/dolly_global.yaml | 3 ++- .../baseline/exp_yaml/dolly_lda/dolly_local_client_1.yaml | 3 ++- .../baseline/exp_yaml/dolly_lda/dolly_local_client_2.yaml | 3 ++- .../baseline/exp_yaml/dolly_lda/dolly_local_client_3.yaml | 3 ++- .../baseline/exp_yaml/dolly_meta/dolly_meta_federate.yaml | 3 ++- .../baseline/exp_yaml/dolly_meta/dolly_meta_global.yaml | 3 ++- .../exp_yaml/dolly_meta/dolly_meta_local_client_1.yaml | 3 ++- .../exp_yaml/dolly_meta/dolly_meta_local_client_2.yaml | 3 ++- .../exp_yaml/dolly_meta/dolly_meta_local_client_3.yaml | 3 ++- .../exp_yaml/dolly_meta/dolly_meta_local_client_4.yaml | 3 ++- .../exp_yaml/dolly_meta/dolly_meta_local_client_5.yaml | 3 ++- .../exp_yaml/dolly_meta/dolly_meta_local_client_6.yaml | 3 ++- .../exp_yaml/dolly_meta/dolly_meta_local_client_7.yaml | 3 ++- .../exp_yaml/dolly_meta/dolly_meta_local_client_8.yaml | 3 ++- .../llm/baseline/exp_yaml/gsm/gsm_federate.yaml | 3 ++- federatedscope/llm/baseline/exp_yaml/gsm/gsm_global.yaml | 3 ++- .../llm/baseline/exp_yaml/gsm/gsm_local_client_1.yaml | 3 ++- .../llm/baseline/exp_yaml/gsm/gsm_local_client_2.yaml | 3 ++- .../llm/baseline/exp_yaml/gsm/gsm_local_client_3.yaml | 3 ++- .../exp_yaml/rosetta_3_clients/rosetta_federate.yaml | 3 ++- .../rosetta_3_clients/rosetta_local_client_1.yaml | 3 ++- .../rosetta_3_clients/rosetta_local_client_2.yaml | 3 ++- .../rosetta_3_clients/rosetta_local_client_3.yaml | 3 ++- 
.../exp_yaml/rosetta_9_clients/rosetta_federate.yaml | 3 ++- .../exp_yaml/rosetta_9_clients/rosetta_global.yaml | 3 ++- .../rosetta_9_clients/rosetta_local_client_1.yaml | 3 ++- .../rosetta_9_clients/rosetta_local_client_2.yaml | 3 ++- .../rosetta_9_clients/rosetta_local_client_3.yaml | 3 ++- .../rosetta_9_clients/rosetta_local_client_4.yaml | 3 ++- .../rosetta_9_clients/rosetta_local_client_5.yaml | 3 ++- .../rosetta_9_clients/rosetta_local_client_6.yaml | 3 ++- .../rosetta_9_clients/rosetta_local_client_7.yaml | 3 ++- .../rosetta_9_clients/rosetta_local_client_8.yaml | 3 ++- .../rosetta_9_clients/rosetta_local_client_9.yaml | 3 ++- federatedscope/llm/baseline/llama.yaml | 3 ++- federatedscope/llm/trainer/trainer.py | 7 ++++++- 50 files changed, 107 insertions(+), 52 deletions(-) diff --git a/federatedscope/llm/README.md b/federatedscope/llm/README.md index 70dc472df..347d26c89 100644 --- a/federatedscope/llm/README.md +++ b/federatedscope/llm/README.md @@ -1,6 +1,6 @@ # FederatedScope-LLM -FederatedScope-LLM (FS-LLM) is a unified, comprehensive and efficient package for federated large language model. We provide a hands-on tutorial here, while for more detailed tutorial, please refer to [TO-BE-RELEASED](). +FederatedScope-LLM (FS-LLM) is an efficient package for federated large language model. We provide a hands-on tutorial here, while for more detailed tutorial, please refer to [TO-BE-RELEASED](). ## Quick Start @@ -137,7 +137,7 @@ eval: best_res_update_round_wise_key: val_loss ``` -### DataZoo +### Fine-tuning Datasets In general, we use instruction SFT following [Alpaca](https://github.com/tatsu-lab/stanford_alpaca) team. And in standalone mode, all dataset can be split into several clients with spesific `splitter` (i.e., `lda`, `meta`, `iid`) and `federate.num_client`. @@ -179,7 +179,7 @@ How to use: For example, to evaluate the model fine-tuned with `python federatedscope/main.py --cfg sft_gsm8k.yaml`, you can run `python federatedscope/llm/eval/eval_for_gsm8k/eval.py --cfg sft_gsm8k.yaml` in the `eval_for_gsm8k` directory. For other usages, please refer to the `README.md` file in each subdirectory. -### AlgoZoo +### Agorithms #### Parameter-Efficient Fine-Tuning @@ -216,3 +216,5 @@ To make the federate fine-tuning efficient, we adopt a series of multi-card acce - Possible reason 2: Due to the enabling of `train.is_enable_half`, numerical overflow may occur. This usually happens when setting the `optimizer.type` to `Adam`, since the default `eps` is `1e-8` but `fp16` requires at least `1e-5`. - `ValueError: Tokenizer class LLaMATokenizer does not exist or is not currently imported. ` - This is a problem with `transformers`, you can fix it in your local file. Replace `LLaMATokenizer` with `LlamaTokenizer` in `PATH_TO_DATA_ROOT/MODEL_REPO/snapshots/..../tokenizer_config.json` +- `OutOfMemoryError: CUDA out of memory.` + - Torch's garbage collection mechanism may not be timely resulting in OOM, please set `cfg.eval.count_flops` to `False`. 
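The YAML diffs that follow simply append `count_flops: False` to the `eval` section of each baseline config. The same override can also be applied without editing the files, either on the command line (e.g. `python federatedscope/main.py --cfg xxx.yaml eval.count_flops False`) or via the config API; the sketch below is a rough illustration and the yaml path is only an example.

```python
# Rough sketch (assumed yaml path): disable FLOPs counting programmatically,
# equivalent to adding `count_flops: False` under `eval:` in the yaml.
from federatedscope.core.configs.config import global_cfg

cfg = global_cfg.clone()
cfg.merge_from_file("federatedscope/llm/baseline/llama.yaml")  # your own config
cfg.merge_from_list(["eval.count_flops", "False"])
print(cfg.eval.count_flops)  # -> False
```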
diff --git a/federatedscope/llm/baseline/exp_yaml/alpaca/alpaca_federate.yaml b/federatedscope/llm/baseline/exp_yaml/alpaca/alpaca_federate.yaml index 6dd301f8e..02c80918d 100644 --- a/federatedscope/llm/baseline/exp_yaml/alpaca/alpaca_federate.yaml +++ b/federatedscope/llm/baseline/exp_yaml/alpaca/alpaca_federate.yaml @@ -40,4 +40,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/alpaca/alpaca_global.yaml b/federatedscope/llm/baseline/exp_yaml/alpaca/alpaca_global.yaml index a321991fa..e462d9421 100644 --- a/federatedscope/llm/baseline/exp_yaml/alpaca/alpaca_global.yaml +++ b/federatedscope/llm/baseline/exp_yaml/alpaca/alpaca_global.yaml @@ -38,4 +38,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/alpaca/alpaca_local_client_1.yaml b/federatedscope/llm/baseline/exp_yaml/alpaca/alpaca_local_client_1.yaml index f1dc7c5c8..00245c06c 100644 --- a/federatedscope/llm/baseline/exp_yaml/alpaca/alpaca_local_client_1.yaml +++ b/federatedscope/llm/baseline/exp_yaml/alpaca/alpaca_local_client_1.yaml @@ -40,4 +40,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/alpaca/alpaca_local_client_2.yaml b/federatedscope/llm/baseline/exp_yaml/alpaca/alpaca_local_client_2.yaml index ca53e5ca9..3da3e25cc 100644 --- a/federatedscope/llm/baseline/exp_yaml/alpaca/alpaca_local_client_2.yaml +++ b/federatedscope/llm/baseline/exp_yaml/alpaca/alpaca_local_client_2.yaml @@ -40,4 +40,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/alpaca/alpaca_local_client_3.yaml b/federatedscope/llm/baseline/exp_yaml/alpaca/alpaca_local_client_3.yaml index 6eca5a44f..ee0ed0ebc 100644 --- a/federatedscope/llm/baseline/exp_yaml/alpaca/alpaca_local_client_3.yaml +++ b/federatedscope/llm/baseline/exp_yaml/alpaca/alpaca_local_client_3.yaml @@ -40,4 +40,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/csn/csn_federate.yaml b/federatedscope/llm/baseline/exp_yaml/csn/csn_federate.yaml index f5ddcf938..f635da5f7 100644 --- a/federatedscope/llm/baseline/exp_yaml/csn/csn_federate.yaml +++ b/federatedscope/llm/baseline/exp_yaml/csn/csn_federate.yaml @@ -41,4 +41,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/csn/csn_local_client_1.yaml b/federatedscope/llm/baseline/exp_yaml/csn/csn_local_client_1.yaml index e1bb3fcd6..a10e79a52 100644 --- a/federatedscope/llm/baseline/exp_yaml/csn/csn_local_client_1.yaml 
+++ b/federatedscope/llm/baseline/exp_yaml/csn/csn_local_client_1.yaml @@ -40,4 +40,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/csn/csn_local_client_2.yaml b/federatedscope/llm/baseline/exp_yaml/csn/csn_local_client_2.yaml index c5bf32c3f..099958b9b 100644 --- a/federatedscope/llm/baseline/exp_yaml/csn/csn_local_client_2.yaml +++ b/federatedscope/llm/baseline/exp_yaml/csn/csn_local_client_2.yaml @@ -40,4 +40,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/csn/csn_local_client_3.yaml b/federatedscope/llm/baseline/exp_yaml/csn/csn_local_client_3.yaml index e2fcc4ee7..87f9488ac 100644 --- a/federatedscope/llm/baseline/exp_yaml/csn/csn_local_client_3.yaml +++ b/federatedscope/llm/baseline/exp_yaml/csn/csn_local_client_3.yaml @@ -40,4 +40,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/csn/csn_local_client_4.yaml b/federatedscope/llm/baseline/exp_yaml/csn/csn_local_client_4.yaml index d1dbca74f..ed40db98c 100644 --- a/federatedscope/llm/baseline/exp_yaml/csn/csn_local_client_4.yaml +++ b/federatedscope/llm/baseline/exp_yaml/csn/csn_local_client_4.yaml @@ -40,4 +40,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/csn/csn_local_client_5.yaml b/federatedscope/llm/baseline/exp_yaml/csn/csn_local_client_5.yaml index 0e9157555..6fc48a0f1 100644 --- a/federatedscope/llm/baseline/exp_yaml/csn/csn_local_client_5.yaml +++ b/federatedscope/llm/baseline/exp_yaml/csn/csn_local_client_5.yaml @@ -40,4 +40,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/csn/csn_local_client_6.yaml b/federatedscope/llm/baseline/exp_yaml/csn/csn_local_client_6.yaml index e0df9c151..d0d453760 100644 --- a/federatedscope/llm/baseline/exp_yaml/csn/csn_local_client_6.yaml +++ b/federatedscope/llm/baseline/exp_yaml/csn/csn_local_client_6.yaml @@ -40,4 +40,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/dolly_lda/dolly_federate.yaml b/federatedscope/llm/baseline/exp_yaml/dolly_lda/dolly_federate.yaml index 083153159..65d4c6b35 100644 --- a/federatedscope/llm/baseline/exp_yaml/dolly_lda/dolly_federate.yaml +++ b/federatedscope/llm/baseline/exp_yaml/dolly_lda/dolly_federate.yaml @@ -40,4 +40,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at 
end of file diff --git a/federatedscope/llm/baseline/exp_yaml/dolly_lda/dolly_global.yaml b/federatedscope/llm/baseline/exp_yaml/dolly_lda/dolly_global.yaml index 6c82036c3..456c73722 100644 --- a/federatedscope/llm/baseline/exp_yaml/dolly_lda/dolly_global.yaml +++ b/federatedscope/llm/baseline/exp_yaml/dolly_lda/dolly_global.yaml @@ -40,4 +40,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/dolly_lda/dolly_local_client_1.yaml b/federatedscope/llm/baseline/exp_yaml/dolly_lda/dolly_local_client_1.yaml index 23c340802..5f45664e8 100644 --- a/federatedscope/llm/baseline/exp_yaml/dolly_lda/dolly_local_client_1.yaml +++ b/federatedscope/llm/baseline/exp_yaml/dolly_lda/dolly_local_client_1.yaml @@ -41,4 +41,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/dolly_lda/dolly_local_client_2.yaml b/federatedscope/llm/baseline/exp_yaml/dolly_lda/dolly_local_client_2.yaml index 481b5a645..563c92793 100644 --- a/federatedscope/llm/baseline/exp_yaml/dolly_lda/dolly_local_client_2.yaml +++ b/federatedscope/llm/baseline/exp_yaml/dolly_lda/dolly_local_client_2.yaml @@ -41,4 +41,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/dolly_lda/dolly_local_client_3.yaml b/federatedscope/llm/baseline/exp_yaml/dolly_lda/dolly_local_client_3.yaml index e6e782299..4b28c1b50 100644 --- a/federatedscope/llm/baseline/exp_yaml/dolly_lda/dolly_local_client_3.yaml +++ b/federatedscope/llm/baseline/exp_yaml/dolly_lda/dolly_local_client_3.yaml @@ -41,4 +41,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_federate.yaml b/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_federate.yaml index 98934d2d2..4270925b8 100644 --- a/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_federate.yaml +++ b/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_federate.yaml @@ -40,4 +40,5 @@ eval: freq: 50 metrics: ['loss'] split: ['test'] - best_res_update_round_wise_key: test_loss \ No newline at end of file + best_res_update_round_wise_key: test_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_global.yaml b/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_global.yaml index 77bcf854e..b111c8425 100644 --- a/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_global.yaml +++ b/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_global.yaml @@ -40,4 +40,5 @@ eval: freq: 50 metrics: ['loss'] split: ['test'] - best_res_update_round_wise_key: test_loss \ No newline at end of file + best_res_update_round_wise_key: test_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_1.yaml 
b/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_1.yaml index d51ecd856..e692cd6b8 100644 --- a/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_1.yaml +++ b/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_1.yaml @@ -41,4 +41,5 @@ eval: freq: 50 metrics: ['loss'] split: ['test'] - best_res_update_round_wise_key: test_loss \ No newline at end of file + best_res_update_round_wise_key: test_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_2.yaml b/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_2.yaml index 16a4d7609..c92ceefbf 100644 --- a/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_2.yaml +++ b/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_2.yaml @@ -41,4 +41,5 @@ eval: freq: 50 metrics: ['loss'] split: ['test'] - best_res_update_round_wise_key: test_loss \ No newline at end of file + best_res_update_round_wise_key: test_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_3.yaml b/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_3.yaml index 0b90107dd..0032fd645 100644 --- a/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_3.yaml +++ b/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_3.yaml @@ -41,4 +41,5 @@ eval: freq: 50 metrics: ['loss'] split: ['test'] - best_res_update_round_wise_key: test_loss \ No newline at end of file + best_res_update_round_wise_key: test_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_4.yaml b/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_4.yaml index 1611c747d..0eeec992a 100644 --- a/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_4.yaml +++ b/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_4.yaml @@ -41,4 +41,5 @@ eval: freq: 50 metrics: ['loss'] split: ['test'] - best_res_update_round_wise_key: test_loss \ No newline at end of file + best_res_update_round_wise_key: test_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_5.yaml b/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_5.yaml index a85a5aa1e..c4bb920e9 100644 --- a/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_5.yaml +++ b/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_5.yaml @@ -41,4 +41,5 @@ eval: freq: 50 metrics: ['loss'] split: ['test'] - best_res_update_round_wise_key: test_loss \ No newline at end of file + best_res_update_round_wise_key: test_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_6.yaml b/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_6.yaml index 5f91916e7..9bf5c1d3d 100644 --- a/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_6.yaml +++ b/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_6.yaml @@ -41,4 +41,5 @@ eval: freq: 50 metrics: ['loss'] split: ['test'] - best_res_update_round_wise_key: test_loss \ No newline at end of file + best_res_update_round_wise_key: test_loss + count_flops: False \ No newline at end 
of file diff --git a/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_7.yaml b/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_7.yaml index ef2c41883..ffa63349b 100644 --- a/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_7.yaml +++ b/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_7.yaml @@ -41,4 +41,5 @@ eval: freq: 50 metrics: ['loss'] split: ['test'] - best_res_update_round_wise_key: test_loss \ No newline at end of file + best_res_update_round_wise_key: test_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_8.yaml b/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_8.yaml index db3553572..b05a14c46 100644 --- a/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_8.yaml +++ b/federatedscope/llm/baseline/exp_yaml/dolly_meta/dolly_meta_local_client_8.yaml @@ -41,4 +41,5 @@ eval: freq: 50 metrics: ['loss'] split: ['test'] - best_res_update_round_wise_key: test_loss \ No newline at end of file + best_res_update_round_wise_key: test_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/gsm/gsm_federate.yaml b/federatedscope/llm/baseline/exp_yaml/gsm/gsm_federate.yaml index 2bc6c0f67..bd3107867 100644 --- a/federatedscope/llm/baseline/exp_yaml/gsm/gsm_federate.yaml +++ b/federatedscope/llm/baseline/exp_yaml/gsm/gsm_federate.yaml @@ -39,4 +39,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/gsm/gsm_global.yaml b/federatedscope/llm/baseline/exp_yaml/gsm/gsm_global.yaml index cd563f777..06f95533c 100644 --- a/federatedscope/llm/baseline/exp_yaml/gsm/gsm_global.yaml +++ b/federatedscope/llm/baseline/exp_yaml/gsm/gsm_global.yaml @@ -39,4 +39,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/gsm/gsm_local_client_1.yaml b/federatedscope/llm/baseline/exp_yaml/gsm/gsm_local_client_1.yaml index 1e487e275..2f79851ad 100644 --- a/federatedscope/llm/baseline/exp_yaml/gsm/gsm_local_client_1.yaml +++ b/federatedscope/llm/baseline/exp_yaml/gsm/gsm_local_client_1.yaml @@ -40,4 +40,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/gsm/gsm_local_client_2.yaml b/federatedscope/llm/baseline/exp_yaml/gsm/gsm_local_client_2.yaml index f29722ae1..3b05b3c3d 100644 --- a/federatedscope/llm/baseline/exp_yaml/gsm/gsm_local_client_2.yaml +++ b/federatedscope/llm/baseline/exp_yaml/gsm/gsm_local_client_2.yaml @@ -40,4 +40,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/gsm/gsm_local_client_3.yaml b/federatedscope/llm/baseline/exp_yaml/gsm/gsm_local_client_3.yaml index 215cea648..326177312 100644 --- 
a/federatedscope/llm/baseline/exp_yaml/gsm/gsm_local_client_3.yaml +++ b/federatedscope/llm/baseline/exp_yaml/gsm/gsm_local_client_3.yaml @@ -40,4 +40,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/rosetta_3_clients/rosetta_federate.yaml b/federatedscope/llm/baseline/exp_yaml/rosetta_3_clients/rosetta_federate.yaml index a5554a09d..f591ff02a 100644 --- a/federatedscope/llm/baseline/exp_yaml/rosetta_3_clients/rosetta_federate.yaml +++ b/federatedscope/llm/baseline/exp_yaml/rosetta_3_clients/rosetta_federate.yaml @@ -40,4 +40,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/rosetta_3_clients/rosetta_local_client_1.yaml b/federatedscope/llm/baseline/exp_yaml/rosetta_3_clients/rosetta_local_client_1.yaml index 60e640d48..3147af704 100644 --- a/federatedscope/llm/baseline/exp_yaml/rosetta_3_clients/rosetta_local_client_1.yaml +++ b/federatedscope/llm/baseline/exp_yaml/rosetta_3_clients/rosetta_local_client_1.yaml @@ -39,4 +39,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/rosetta_3_clients/rosetta_local_client_2.yaml b/federatedscope/llm/baseline/exp_yaml/rosetta_3_clients/rosetta_local_client_2.yaml index 00879ed8c..cc41f102a 100644 --- a/federatedscope/llm/baseline/exp_yaml/rosetta_3_clients/rosetta_local_client_2.yaml +++ b/federatedscope/llm/baseline/exp_yaml/rosetta_3_clients/rosetta_local_client_2.yaml @@ -39,4 +39,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/rosetta_3_clients/rosetta_local_client_3.yaml b/federatedscope/llm/baseline/exp_yaml/rosetta_3_clients/rosetta_local_client_3.yaml index 9cff1dff4..2ab223841 100644 --- a/federatedscope/llm/baseline/exp_yaml/rosetta_3_clients/rosetta_local_client_3.yaml +++ b/federatedscope/llm/baseline/exp_yaml/rosetta_3_clients/rosetta_local_client_3.yaml @@ -39,4 +39,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_federate.yaml b/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_federate.yaml index 856c5d7fb..8ee587ff0 100644 --- a/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_federate.yaml +++ b/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_federate.yaml @@ -40,4 +40,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_global.yaml 
b/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_global.yaml index fc253106a..66efca16d 100644 --- a/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_global.yaml +++ b/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_global.yaml @@ -38,4 +38,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_1.yaml b/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_1.yaml index 489283aa7..8bf9e27ae 100644 --- a/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_1.yaml +++ b/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_1.yaml @@ -39,4 +39,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_2.yaml b/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_2.yaml index e7cf82f86..a901f902b 100644 --- a/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_2.yaml +++ b/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_2.yaml @@ -39,4 +39,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_3.yaml b/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_3.yaml index ef18f83d5..bf5de2bb8 100644 --- a/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_3.yaml +++ b/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_3.yaml @@ -39,4 +39,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_4.yaml b/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_4.yaml index 0b68acd59..a85243737 100644 --- a/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_4.yaml +++ b/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_4.yaml @@ -39,4 +39,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_5.yaml b/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_5.yaml index 2ca7128b8..13c3a110b 100644 --- a/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_5.yaml +++ b/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_5.yaml @@ -39,4 +39,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No 
newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_6.yaml b/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_6.yaml index 7c76c8505..d72211b55 100644 --- a/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_6.yaml +++ b/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_6.yaml @@ -39,4 +39,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_7.yaml b/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_7.yaml index f2893cd97..123741648 100644 --- a/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_7.yaml +++ b/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_7.yaml @@ -39,4 +39,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_8.yaml b/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_8.yaml index 5425d5712..9b32a891c 100644 --- a/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_8.yaml +++ b/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_8.yaml @@ -39,4 +39,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_9.yaml b/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_9.yaml index fe9d573b8..dd6176a9e 100644 --- a/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_9.yaml +++ b/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_local_client_9.yaml @@ -39,4 +39,5 @@ trainer: eval: freq: 50 metrics: ['loss'] - best_res_update_round_wise_key: val_loss \ No newline at end of file + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/llama.yaml b/federatedscope/llm/baseline/llama.yaml index 8421a2574..a918522df 100644 --- a/federatedscope/llm/baseline/llama.yaml +++ b/federatedscope/llm/baseline/llama.yaml @@ -36,4 +36,5 @@ trainer: type: llmtrainer eval: freq: 50 - metrics: ['loss'] \ No newline at end of file + metrics: ['loss'] + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/trainer/trainer.py b/federatedscope/llm/trainer/trainer.py index 3763d0ddc..83d0b660c 100644 --- a/federatedscope/llm/trainer/trainer.py +++ b/federatedscope/llm/trainer/trainer.py @@ -115,7 +115,12 @@ def _hook_on_batch_forward_flop_count(self, ctx): ctx.model, inputs=(input_ids, attention_mask)).total() ctx.monitor.track_avg_flops(flops_one_batch, ctx.batch_size) except Exception as e: - logger.info(e) + logger.warning("When the flops-counting function is enabled, " + "torch's garbage collection may not free " + "memory in time, which can cause OOM; " + "please set `cfg.eval.count_flops` to " + "`False` to avoid errors like this.") + 
logger.error(e) # Raise warning at the first failure logger.warning( "current flop count implementation is for general LLM " From 281d9d255e31d559a6a1bcbc76176d9bf990eced Mon Sep 17 00:00:00 2001 From: Harli WU Date: Tue, 18 Jul 2023 18:33:06 -0700 Subject: [PATCH 15/23] Fix bugs for HumanEval (#667) --- federatedscope/llm/eval/eval_for_code/humaneval.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/federatedscope/llm/eval/eval_for_code/humaneval.py b/federatedscope/llm/eval/eval_for_code/humaneval.py index 8f1cebb0d..e6968ff4c 100644 --- a/federatedscope/llm/eval/eval_for_code/humaneval.py +++ b/federatedscope/llm/eval/eval_for_code/humaneval.py @@ -31,11 +31,15 @@ def pad_spaces(s, num=4): s = " " * num + s[n:] return s - # 1. remove everything after "\n\n" - code = code.split("\n\n")[0] - # 2. remove everything after the "def " - code = code.split("def ")[0] - # 3. pad to four space to avoid `unindent` error + # 1. remove the special char \u00a0 + code = code.replace('\u00a0', '') + # # 2. remove everything after "\n\n" + # code = code.split("\n\n")[0] + # 3. remove everything after the following stop sequences + # Reference: https://github.com/openai/human-eval + for stop_seq in ['\nclass', '\ndef', '\n#', '\nif', '\nprint', '\nassert']: + code = code.split(stop_seq)[0] + # 4. pad to four space to avoid `unindent` error code = pad_spaces(code, 4) return code From 1073864a0b5b8173e22ce9156ea0ce869d034259 Mon Sep 17 00:00:00 2001 From: Weirui Kuang <39145382+rayrayraykk@users.noreply.github.com> Date: Fri, 21 Jul 2023 14:09:18 +0800 Subject: [PATCH 16/23] reimplement pFedme (#669) --- federatedscope/core/configs/cfg_fl_setting.py | 1 + federatedscope/core/trainers/base_trainer.py | 3 + .../core/trainers/trainer_pFedMe.py | 55 +++++++++++-------- federatedscope/core/workers/client.py | 16 ++++-- 4 files changed, 49 insertions(+), 26 deletions(-) diff --git a/federatedscope/core/configs/cfg_fl_setting.py b/federatedscope/core/configs/cfg_fl_setting.py index ec88462c2..676082247 100644 --- a/federatedscope/core/configs/cfg_fl_setting.py +++ b/federatedscope/core/configs/cfg_fl_setting.py @@ -40,6 +40,7 @@ def extend_fl_setting_cfg(cfg): cfg.federate.restore_from = '' cfg.federate.save_to = '' cfg.federate.save_freq = -1 + cfg.federate.save_client_model = False cfg.federate.join_in_info = [ ] # The information requirements (from server) for join_in cfg.federate.sampler = 'uniform' # the strategy for sampling client diff --git a/federatedscope/core/trainers/base_trainer.py b/federatedscope/core/trainers/base_trainer.py index 1d0637d42..9a7bb0a4a 100644 --- a/federatedscope/core/trainers/base_trainer.py +++ b/federatedscope/core/trainers/base_trainer.py @@ -33,3 +33,6 @@ def print_trainer_meta_info(self): meta_info = tuple([(val.name, getattr(self, val.name)) for val in sign]) return f'{self.__class__.__name__}{meta_info}' + + def save_model(self, path, cur_round): + raise NotImplementedError diff --git a/federatedscope/core/trainers/trainer_pFedMe.py b/federatedscope/core/trainers/trainer_pFedMe.py index dac1e81f0..cf069324f 100644 --- a/federatedscope/core/trainers/trainer_pFedMe.py +++ b/federatedscope/core/trainers/trainer_pFedMe.py @@ -1,10 +1,24 @@ import copy +try: + import torch +except ImportError: + torch = None from federatedscope.core.trainers.torch_trainer import GeneralTorchTrainer from federatedscope.core.optimizer import wrap_regularized_optimizer from typing import Type +def get_trainable_parameter_list(model): + copied_param = [] + for param 
in model.parameters(): + if param.requires_grad: + copied_param.append(copy.deepcopy(param)) + else: + copied_param.append(None) + return copied_param + + def wrap_pFedMeTrainer( base_trainer: Type[GeneralTorchTrainer]) -> Type[GeneralTorchTrainer]: """ @@ -81,7 +95,7 @@ def init_pFedMe_ctx(base_trainer): # the local_model_tmp is used to be the referenced parameter when # finding the approximate \theta in paper # will be copied from model every run_routine - ctx.pFedMe_local_model_tmp = None + ctx.pFedMe_local_model_param_tmp = None def _hook_on_fit_start_set_local_para_tmp(ctx): @@ -95,7 +109,7 @@ def _hook_on_fit_start_set_local_para_tmp(ctx): ``wrap_regularized_optimizer`` and set compared parameter group ``ctx.pFedMe_outer_lr`` Initialize to \ ``ctx.cfg.train.optimizer.lr`` - ``ctx.pFedMe_local_model_tmp`` Copy from ``ctx.model`` + ``ctx.pFedMe_local_model_param_tmp`` Copy from ``ctx.model`` ================================== =========================== """ # the optimizer used in pFedMe is based on Moreau Envelopes regularization @@ -106,13 +120,10 @@ def _hook_on_fit_start_set_local_para_tmp(ctx): for g in ctx.optimizer.param_groups: g['lr'] = ctx.cfg.personalization.lr ctx.pFedMe_outer_lr = ctx.cfg.train.optimizer.lr - - ctx.pFedMe_local_model_tmp = copy.deepcopy(ctx.model) + ctx.pFedMe_local_model_param_tmp = get_trainable_parameter_list(ctx.model) # set the compared model data, then the optimizer will find approximate # model using trainer.cfg.personalization.lr - compared_global_model_para = [{ - "params": list(ctx.pFedMe_local_model_tmp.parameters()) - }] + compared_global_model_para = [{"params": ctx.pFedMe_local_model_param_tmp}] ctx.optimizer.set_compared_para_group(compared_global_model_para) @@ -181,23 +192,22 @@ def _hook_on_epoch_end_update_local(ctx): Attribute Operation ================================== =========================== ``ctx.model`` Update parameters by \ - ``ctx.pFedMe_local_model_tmp`` + ``ctx.pFedMe_local_model_param_tmp`` ``ctx.optimizer`` Set compared parameter group ================================== =========================== """ # update local weight after finding approximate theta - for client_param, local_para_tmp in zip( - ctx.model.parameters(), ctx.pFedMe_local_model_tmp.parameters()): - local_para_tmp.data = local_para_tmp.data - \ - ctx.optimizer.regular_weight * \ - ctx.pFedMe_outer_lr * (local_para_tmp.data - - client_param.data) + for client_param, local_para_tmp in zip(ctx.model.parameters(), + ctx.pFedMe_local_model_param_tmp): + if client_param.requires_grad: + local_para_tmp.data = local_para_tmp.data - \ + ctx.optimizer.regular_weight * \ + ctx.pFedMe_outer_lr * (local_para_tmp.data - + client_param.data) # set the compared model data, then the optimizer will find approximate # model using trainer.cfg.personalization.lr - compared_global_model_para = [{ - "params": list(ctx.pFedMe_local_model_tmp.parameters()) - }] + compared_global_model_para = [{"params": ctx.pFedMe_local_model_param_tmp}] ctx.optimizer.set_compared_para_group(compared_global_model_para) @@ -209,12 +219,13 @@ def _hook_on_fit_end_update_local(ctx): Attribute Operation ================================== =========================== ``ctx.model`` Update parameters by - ``ctx.pFedMe_local_model_tmp`` - ``ctx.pFedMe_local_model_tmp`` Delete + ``ctx.pFedMe_local_model_param_tmp`` + ``ctx.pFedMe_local_model_param_tmp`` Delete ================================== =========================== """ for param, local_para_tmp in zip(ctx.model.parameters(), - 
ctx.pFedMe_local_model_tmp.parameters()): - param.data = local_para_tmp.data + ctx.pFedMe_local_model_param_tmp): + if param.requires_grad: + param.data = local_para_tmp.data - del ctx.pFedMe_local_model_tmp + del ctx.pFedMe_local_model_param_tmp diff --git a/federatedscope/core/workers/client.py b/federatedscope/core/workers/client.py index a40f3f10d..8afc5a26f 100644 --- a/federatedscope/core/workers/client.py +++ b/federatedscope/core/workers/client.py @@ -10,7 +10,7 @@ from federatedscope.core.auxiliaries.trainer_builder import get_trainer from federatedscope.core.secret_sharing import AdditiveSecretSharing from federatedscope.core.auxiliaries.utils import merge_dict_of_results, \ - calculate_time_cost + calculate_time_cost, add_prefix_to_path from federatedscope.core.workers.base_client import BaseClient logger = logging.getLogger(__name__) @@ -551,9 +551,17 @@ def callback_funcs_for_evaluate(self, message: Message): forms=['raw'], return_raw=True) logger.info(formatted_eval_res) - self._monitor.update_best_result(self.best_results, - formatted_eval_res['Results_raw'], - results_type=f"client #{self.ID}") + update_best_this_round = self._monitor.update_best_result( + self.best_results, + formatted_eval_res['Results_raw'], + results_type=f"client #{self.ID}", + ) + + if update_best_this_round and self._cfg.federate.save_client_model: + path = add_prefix_to_path(f'client_{self.ID}_', + self._cfg.federate.save_to) + self.trainer.save_model(path, self.state) + self.history_results = merge_dict_of_results( self.history_results, formatted_eval_res['Results_raw']) self.early_stopper.track_and_check(self.history_results[ From 0aad31ebaf18680cc820efd719630a906dfdf7fd Mon Sep 17 00:00:00 2001 From: Weirui Kuang <39145382+rayrayraykk@users.noreply.github.com> Date: Mon, 24 Jul 2023 16:50:31 +0900 Subject: [PATCH 17/23] Kd alignment for Offsite-tuning (#668) --- federatedscope/core/configs/cfg_llm.py | 34 ++- federatedscope/core/workers/server.py | 3 +- .../offsite_tuning/dolly/dolly_fed.yaml | 60 +++++ .../exp_yaml/offsite_tuning/gsm/gsm_fed.yaml | 59 +++++ .../offsite_tuning/rosetta/rosetta_fed.yaml | 60 +++++ .../llm/baseline/llama_offsite_align.yaml | 54 +++++ federatedscope/llm/misc/fschat.py | 18 +- federatedscope/llm/model/adapter_builder.py | 5 + .../llm/offsite_tuning/kd_trainer.py | 94 ++++++++ federatedscope/llm/offsite_tuning/server.py | 52 +++-- federatedscope/llm/offsite_tuning/utils.py | 208 ++++++++++++++++++ federatedscope/llm/trainer/trainer.py | 9 + 12 files changed, 623 insertions(+), 33 deletions(-) create mode 100644 federatedscope/llm/baseline/exp_yaml/offsite_tuning/dolly/dolly_fed.yaml create mode 100644 federatedscope/llm/baseline/exp_yaml/offsite_tuning/gsm/gsm_fed.yaml create mode 100644 federatedscope/llm/baseline/exp_yaml/offsite_tuning/rosetta/rosetta_fed.yaml create mode 100644 federatedscope/llm/baseline/llama_offsite_align.yaml create mode 100644 federatedscope/llm/offsite_tuning/kd_trainer.py diff --git a/federatedscope/core/configs/cfg_llm.py b/federatedscope/core/configs/cfg_llm.py index 98aff1eba..39a20920e 100644 --- a/federatedscope/core/configs/cfg_llm.py +++ b/federatedscope/core/configs/cfg_llm.py @@ -32,6 +32,9 @@ def extend_llm_cfg(cfg): cfg.llm.adapter = CN() cfg.llm.adapter.use = False cfg.llm.adapter.args = [{}] + # Move adapter to `cpu` after training, which can save memory but cost + # more time. 
+ cfg.llm.adapter.mv_to_cpu = False # ---------------------------------------------------------------------- # # Offsite-tuning related options @@ -43,9 +46,38 @@ def extend_llm_cfg(cfg): cfg.llm.offsite_tuning.emu_l = 1 # Index of emulator layer left cfg.llm.offsite_tuning.emu_r = 10 # Index of emulator layer right + # Used in `eval` + cfg.llm.offsite_tuning.eval_type = 'emu' # Choose one of `[emu, full]` + + # Emulator alignment will use dataset in Server + cfg.llm.offsite_tuning.emu_align = CN() + cfg.llm.offsite_tuning.emu_align.use = False + cfg.llm.offsite_tuning.emu_align.restore_from = '' + cfg.llm.offsite_tuning.emu_align.save_to = '' + + # Server held-out data + cfg.llm.offsite_tuning.emu_align.data = CN() + cfg.llm.offsite_tuning.emu_align.data.root = 'data' + cfg.llm.offsite_tuning.emu_align.data.type = 'alpaca@llm' + cfg.llm.offsite_tuning.emu_align.data.splits = [0.8, 0.1, 0.1] + + cfg.llm.offsite_tuning.emu_align.train = CN() + cfg.llm.offsite_tuning.emu_align.train.local_update_steps = 10 + cfg.llm.offsite_tuning.emu_align.train.batch_or_epoch = 'batch' + cfg.llm.offsite_tuning.emu_align.train.lm_loss_weight = 0.1 + cfg.llm.offsite_tuning.emu_align.train.kd_loss_weight = 0.9 + + cfg.llm.offsite_tuning.emu_align.train.optimizer = CN(new_allowed=True) + cfg.llm.offsite_tuning.emu_align.train.optimizer.type = 'SGD' + cfg.llm.offsite_tuning.emu_align.train.optimizer.lr = 0.01 + def assert_llm_cfg(cfg): - pass + if cfg.llm.offsite_tuning.emu_align.use: + if cfg.llm.offsite_tuning.emu_align.restore_from != '': + logger.warning( + 'Enabling `restore_from` in offsite_tuning emulator ' + 'alignment will skip training the emulator.') register_config("llm", extend_llm_cfg) diff --git a/federatedscope/core/workers/server.py b/federatedscope/core/workers/server.py index 80230448a..afa43a730 100644 --- a/federatedscope/core/workers/server.py +++ b/federatedscope/core/workers/server.py @@ -107,7 +107,8 @@ def __init__(self, f' {self._cfg.federate.restore_from}.') else: _ = self.aggregator.load_model(self._cfg.federate.restore_from) - logger.info("Restored the model from {}-th round's ckpt") + logger.info(f"Restored the model from " + f"{self._cfg.federate.restore_from}") if int(config.model.model_num_per_trainer) != \ config.model.model_num_per_trainer or \ diff --git a/federatedscope/llm/baseline/exp_yaml/offsite_tuning/dolly/dolly_fed.yaml b/federatedscope/llm/baseline/exp_yaml/offsite_tuning/dolly/dolly_fed.yaml new file mode 100644 index 000000000..d5424e2c9 --- /dev/null +++ b/federatedscope/llm/baseline/exp_yaml/offsite_tuning/dolly/dolly_fed.yaml @@ -0,0 +1,60 @@ +use_gpu: True +device: 0 +early_stop: + patience: 0 +federate: + mode: standalone + client_num: 8 + total_round_num: 500 + save_to: "llama_dolly_fed_ot.ckpt" + save_freq: -1 + share_local_model: True + online_aggr: False +data: + root: data/ + type: 'dolly-15k@llm' + splits: [0.99, 0.0, 0.01] + splitter: 'meta' +llm: + tok_len: 650 + chat: + max_len: 1000 + adapter: + mv_to_cpu: True + offsite_tuning: + use: True + eval_type: 'emu' + kwargs: [ { "drop_ratio": 0.2 } ] + emu_l: 2 + emu_r: 30 + emu_align: + use: True + restore_from: 'aligned_llama_dolly_fed_ot.ckpt' + save_to: 'aligned_llama_dolly_fed_ot.ckpt' + train: + local_update_steps: 500 + batch_or_epoch: 'batch' + lm_loss_weight: 0.1 + kd_loss_weight: 0.9 + optimizer: + lr: 0.001 +dataloader: + batch_size: 1 +model: + type: 'decapoda-research/llama-7b-hf@huggingface_llm' +train: + local_update_steps: 30 + batch_or_epoch: batch + optimizer: + lr: 0.005 + 
is_enable_half: True +criterion: + type: CrossEntropyLoss +trainer: + type: llmtrainer +eval: + freq: 50 + metrics: ['loss'] + split: ['test'] + best_res_update_round_wise_key: test_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/offsite_tuning/gsm/gsm_fed.yaml b/federatedscope/llm/baseline/exp_yaml/offsite_tuning/gsm/gsm_fed.yaml new file mode 100644 index 000000000..012a454e3 --- /dev/null +++ b/federatedscope/llm/baseline/exp_yaml/offsite_tuning/gsm/gsm_fed.yaml @@ -0,0 +1,59 @@ +use_gpu: True +device: 0 +early_stop: + patience: 0 +federate: + mode: standalone + client_num: 3 + total_round_num: 500 + save_to: "llama_gsm_fed_ot.ckpt" + save_freq: -1 + share_local_model: True + online_aggr: False +data: + root: data/ + type: 'gsm8k@llm' + splits: [0.998,0.001,0.001] + splitter: 'iid' +llm: + tok_len: 1000 + adapter: + mv_to_cpu: True + chat: + max_len: 1000 + offsite_tuning: + use: True + eval_type: 'emu' + kwargs: [{"drop_ratio": 0.2}] + emu_l: 2 + emu_r: 30 + emu_align: + use: True + restore_from: 'aligned_llama_gsm_fed_ot.ckpt' + save_to: 'aligned_llama_gsm_fed_ot.ckpt' + train: + local_update_steps: 500 + batch_or_epoch: 'batch' + lm_loss_weight: 0.1 + kd_loss_weight: 0.9 + optimizer: + lr: 0.001 +dataloader: + batch_size: 1 +model: + type: 'decapoda-research/llama-7b-hf@huggingface_llm' +train: + local_update_steps: 30 + batch_or_epoch: batch + optimizer: + lr: 0.005 + is_enable_half: True +criterion: + type: CrossEntropyLoss +trainer: + type: llmtrainer +eval: + freq: 50 + metrics: ['loss'] + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/exp_yaml/offsite_tuning/rosetta/rosetta_fed.yaml b/federatedscope/llm/baseline/exp_yaml/offsite_tuning/rosetta/rosetta_fed.yaml new file mode 100644 index 000000000..d10cdf132 --- /dev/null +++ b/federatedscope/llm/baseline/exp_yaml/offsite_tuning/rosetta/rosetta_fed.yaml @@ -0,0 +1,60 @@ +use_gpu: True +device: 0 +early_stop: + patience: 0 +federate: + mode: standalone + client_num: 9 + total_round_num: 500 + save_to: "llama_rosetta_fed_ot.ckpt" + save_freq: -1 + share_local_model: True + online_aggr: False +data: + root: data/ + type: 'rosetta_alpaca@llm' + splits: [0.89,0.1,0.01] + splitter: 'meta' +llm: + tok_len: 650 + chat: + max_len: 1000 + adapter: + mv_to_cpu: True + offsite_tuning: + use: True + eval_type: 'emu' + kwargs: [ { "drop_ratio": 0.2 } ] + emu_l: 2 + emu_r: 30 + emu_align: + use: True + restore_from: 'aligned_llama_rosetta_fed_ot.ckpt' + save_to: 'aligned_llama_rosetta_fed_ot.ckpt' + train: + local_update_steps: 500 + batch_or_epoch: 'batch' + lm_loss_weight: 0.1 + kd_loss_weight: 0.9 + optimizer: + lr: 0.001 +dataloader: + batch_size: 1 +model: + type: 'decapoda-research/llama-7b-hf@huggingface_llm' +train: + local_update_steps: 30 + batch_or_epoch: batch + optimizer: + lr: 0.003 + weight_decay: 0.0 + is_enable_half: True +criterion: + type: CrossEntropyLoss +trainer: + type: llmtrainer +eval: + freq: 50 + metrics: ['loss'] + best_res_update_round_wise_key: val_loss + count_flops: False \ No newline at end of file diff --git a/federatedscope/llm/baseline/llama_offsite_align.yaml b/federatedscope/llm/baseline/llama_offsite_align.yaml new file mode 100644 index 000000000..f84596cf5 --- /dev/null +++ b/federatedscope/llm/baseline/llama_offsite_align.yaml @@ -0,0 +1,54 @@ +use_gpu: True +device: 1 +early_stop: + patience: 10 +federate: + mode: standalone + client_num: 1 + total_round_num: 20 + 
save_to: "llama.offsite_tuning.ckpt" + share_local_model: True + online_aggr: False +data: + root: data/ + type: 'alpaca@llm' + splits: [0.98,0.01,0.01] + splitter: 'iid' +llm: + tok_len: 1000 + chat: + max_len: 1000 + offsite_tuning: + use: True + emu_l: 2 + emu_r: 30 + emu_align: + use: True + restore_from: 'aligned_emulator.ckpt' + save_to: 'aligned_emulator.ckpt' + train: + local_update_steps: 10 + batch_or_epoch: 'batch' + lm_loss_weight: 0.1 + kd_loss_weight: 0.9 + optimizer: + lr: 0.01 +dataloader: + batch_size: 1 +model: + type: 'decapoda-research/llama-7b-hf@huggingface_llm' +train: + local_update_steps: 10 + batch_or_epoch: batch + optimizer: + lr: 0.0001 + weight_decay: 0.0 +# is_enable_half: True +criterion: + type: CrossEntropyLoss +trainer: + type: llmtrainer +eval: + freq: 10 + metrics: ['loss'] + best_res_update_round_wise_key: 'val_loss' \ No newline at end of file diff --git a/federatedscope/llm/misc/fschat.py b/federatedscope/llm/misc/fschat.py index c6578d6fe..90c6f98d3 100644 --- a/federatedscope/llm/misc/fschat.py +++ b/federatedscope/llm/misc/fschat.py @@ -10,8 +10,6 @@ from federatedscope.llm.dataloader.dataloader import get_tokenizer from federatedscope.llm.model.model_builder import get_llm from federatedscope.llm.dataset.llm_dataset import PROMPT_DICT -from federatedscope.llm.offsite_tuning.utils import \ - generate_emulator_and_adapter from federatedscope.core.auxiliaries.utils import setup_seed from federatedscope.core.auxiliaries.logging import update_logger @@ -25,19 +23,9 @@ def __init__(self, config): config.llm.tok_len) self.model = get_llm(config) if config.llm.offsite_tuning.use: - logger.info('===============use offsite tuning===============') - # We use offsite-tuning in this experiment - # Use adapter model instead - compress_strategy = config.llm.offsite_tuning.strategy - emulator_l = config.llm.offsite_tuning.emu_l - emulator_r = config.llm.offsite_tuning.emu_r - offsite_tuning_kwargs = config.llm.offsite_tuning.kwargs[0] - self.model = \ - generate_emulator_and_adapter(self.model, - strategy=compress_strategy, - emulator_l=emulator_l, - emulator_r=emulator_r, - **offsite_tuning_kwargs) + from federatedscope.llm.offsite_tuning.utils import \ + wrap_offsite_tuning_for_eval + self.model = wrap_offsite_tuning_for_eval(self.model, config) self.device = f'cuda:{config.device}' self.add_special_tokens = True diff --git a/federatedscope/llm/model/adapter_builder.py b/federatedscope/llm/model/adapter_builder.py index 43a9c2dc2..d1621c85c 100644 --- a/federatedscope/llm/model/adapter_builder.py +++ b/federatedscope/llm/model/adapter_builder.py @@ -1,3 +1,4 @@ +import torch import torch.nn as nn from collections import OrderedDict @@ -166,6 +167,10 @@ def get_trainable_state_dict(self): new_state_dict[k] = v return new_state_dict + def save_model(self, path, state=0): + ckpt = {'cur_round': state, 'model': self.model.state_dict()} + torch.save(ckpt, path) + # TODO: Fix `__getattr__` # def __getattr__(self, item): # return getattr(self.model, item) diff --git a/federatedscope/llm/offsite_tuning/kd_trainer.py b/federatedscope/llm/offsite_tuning/kd_trainer.py new file mode 100644 index 000000000..b5575a84b --- /dev/null +++ b/federatedscope/llm/offsite_tuning/kd_trainer.py @@ -0,0 +1,94 @@ +import torch +import logging +from federatedscope.llm.trainer.trainer import LLMTrainer +from federatedscope.core.trainers.context import CtxVar +from federatedscope.core.trainers.enums import LIFECYCLE + +logger = logging.getLogger(__name__) + + +def get_kd_loss(raw_model, 
adap_model): + """ + This function is borrowed from offsite-tuning: + https://github.com/mit-han-lab/offsite-tuning/blob/main/offsite_tuning + /utils.py + """ + kwargs = adap_model.student_l.input_kwargs + args = adap_model.student_l.input_args + output_teacher = args[0] + args = list(args[1:]) + args = tuple(args) + + with torch.no_grad(): + raw_model.teacher.eval() + for teacher_layer in raw_model.teacher: + output_teacher = teacher_layer(output_teacher, *args, **kwargs) + if isinstance(output_teacher, tuple): + output_teacher = output_teacher[0] + + output_student = adap_model.student_r.cached_output.float() + output_teacher = output_teacher.float() + + std = output_teacher.pow(2).mean().sqrt() + kd_loss = (output_teacher - output_student).div(std).pow(2).mean() + return kd_loss + + +class KDTrainer(LLMTrainer): + def __init__(self, + raw_model, + adapter_model, + data, + device, + config, + only_for_eval=False, + monitor=None): + super(KDTrainer, self).__init__(adapter_model, data, device, config, + only_for_eval, monitor) + self.ctx.raw_model = raw_model.to(device) + self.lm_loss_weight = \ + config.llm.offsite_tuning.emu_align.train.lm_loss_weight + self.kd_loss_weight = \ + config.llm.offsite_tuning.emu_align.train.kd_loss_weight + + def _hook_on_fit_start_numerical_precision(self, ctx): + super(KDTrainer, self)._hook_on_fit_start_numerical_precision(ctx) + if self.cfg.train.is_enable_half: + ctx.raw_model = ctx.raw_model.half() + + def train(self, target_data_split_name="train", hooks_set=None): + num_samples, model_para_all, eval_metrics = \ + super(KDTrainer, self).train(target_data_split_name, hooks_set) + logger.info("Finish alignment, move raw model to cpu.") + self.ctx.raw_model.cpu() + return num_samples, model_para_all, eval_metrics + + def _hook_on_batch_forward(self, ctx): + input_ids = ctx.data_batch['input_ids'].to(ctx.device) + labels = ctx.data_batch['labels'].to(ctx.device) + attention_mask = ctx.data_batch['attention_mask'].to(ctx.device) + + outputs = ctx.model(input_ids=input_ids, + labels=labels, + attention_mask=attention_mask) + + logits = outputs.logits + kd_loss = self.kd_loss_weight * get_kd_loss(ctx.raw_model, ctx.model) + lm_loss = self.lm_loss_weight * outputs.loss + loss = kd_loss + lm_loss + + if torch.isnan(loss): + ctx.skip_this_batch = CtxVar(True, LIFECYCLE.BATCH) + logger.warning('Skip the batch due to the loss is NaN, ' + 'it may be caused by exceeding the precision or ' + 'invalid labels.') + else: + ctx.skip_this_batch = CtxVar(False, LIFECYCLE.BATCH) + + ctx.y_true = CtxVar(labels, LIFECYCLE.BATCH) + ctx.y_prob = CtxVar(logits, LIFECYCLE.BATCH) + + ctx.loss_batch = CtxVar(loss, LIFECYCLE.BATCH) + ctx.batch_size = CtxVar(len(labels), LIFECYCLE.BATCH) + + logger.info(f'lm_loss: {lm_loss.item()}, kd loss: {kd_loss.item()}') diff --git a/federatedscope/llm/offsite_tuning/server.py b/federatedscope/llm/offsite_tuning/server.py index ddd918a1a..f0f13b41a 100644 --- a/federatedscope/llm/offsite_tuning/server.py +++ b/federatedscope/llm/offsite_tuning/server.py @@ -3,11 +3,12 @@ from federatedscope.core.message import Message from federatedscope.core.auxiliaries.utils import b64serializer, \ merge_dict_of_results +from federatedscope.core.monitors.monitor import Monitor from federatedscope.core.auxiliaries.trainer_builder import get_trainer from federatedscope.core.workers.server import Server from federatedscope.llm.offsite_tuning.utils import \ - generate_emulator_and_adapter + generate_emulator_and_adapter, align_student_with_teacher logger = 
logging.getLogger(__name__) @@ -39,6 +40,16 @@ def __init__(self, emulator_l=emulator_l, emulator_r=emulator_r, **offsite_tuning_kwargs) + # Emulator alignment + if config.llm.offsite_tuning.emu_align.use: + adap_model = align_student_with_teacher(raw_model=model, + adap_model=adap_model, + cfg=config, + device=device, + monitor=Monitor( + config, + monitored_object=self)) + self.raw_model = model super(OffsiteTuningServer, self).__init__(ID, state, config, data, adap_model, client_num, @@ -48,7 +59,9 @@ def __init__(self, device=self.device, config=self._cfg, only_for_eval=True, - monitor=self._monitor) + monitor=Monitor( + self._cfg, + monitored_object=self)) def trigger_for_feat_engr(self, trigger_train_func, @@ -80,18 +93,23 @@ def trigger_for_feat_engr(self, def eval(self): # Update the raw model with the new adapters - new_raw_model_state_dict = self.raw_model.state_dict() - for key, value in zip(self.raw_model.state_dict().keys(), - self.model.state_dict().values()): - new_raw_model_state_dict[key] = value - self.raw_model_trainer.update(new_raw_model_state_dict, strict=False) - # make the evaluation on raw model at the server first - raw_metrics = {} - for split in self._cfg.eval.split: - metrics = self.raw_model_trainer.evaluate( - target_data_split_name=split) - for key, value in metrics.items(): - raw_metrics['plugin.' + key] = value + if self._cfg.llm.offsite_tuning.eval_type == 'full': + self.model.to('cpu') + new_raw_model_state_dict = self.raw_model.state_dict() + for key, value in zip(self.raw_model.state_dict().keys(), + self.model.state_dict().values()): + new_raw_model_state_dict[key] = value + self.raw_model_trainer.update(new_raw_model_state_dict, + strict=False) + # make the evaluation on raw model at the server first + raw_metrics = {} + for split in self._cfg.eval.split: + metrics = self.raw_model_trainer.evaluate( + target_data_split_name=split) + for key, value in metrics.items(): + raw_metrics['plugin.' + key] = value + # Move to cpu + self.raw_model.to('cpu') if self._cfg.federate.make_global_eval: # By default, the evaluation is conducted one-by-one for all @@ -124,7 +142,8 @@ def eval(self): self.check_and_save() else: super().eval() - self.raw_metrics = raw_metrics + if self._cfg.llm.offsite_tuning.eval_type == 'full': + self.raw_metrics = raw_metrics def callback_funcs_for_metrics(self, message: Message): """ @@ -148,6 +167,7 @@ def callback_funcs_for_metrics(self, message: Message): 'emulator.' 
+ key: value for key, value in content.items() } - self.msg_buffer['eval'][rnd][sender].update(**self.raw_metrics) + if self._cfg.llm.offsite_tuning.eval_type == 'full': + self.msg_buffer['eval'][rnd][sender].update(**self.raw_metrics) return self.check_and_move_on(check_eval_result=True) diff --git a/federatedscope/llm/offsite_tuning/utils.py b/federatedscope/llm/offsite_tuning/utils.py index f3d9bb9f2..19d72d7ab 100644 --- a/federatedscope/llm/offsite_tuning/utils.py +++ b/federatedscope/llm/offsite_tuning/utils.py @@ -1,4 +1,5 @@ import gc +import os import copy import logging import torch @@ -7,10 +8,72 @@ from transformers import (OPTForCausalLM, GPT2LMHeadModel, BloomForCausalLM, LlamaForCausalLM) from federatedscope.llm.model.adapter_builder import AdapterModel +from federatedscope.llm.offsite_tuning.kd_trainer import KDTrainer +from federatedscope.core.auxiliaries.data_builder import get_data logger = logging.getLogger(__name__) +def add_prologue(module, prologue): + """ + This function is borrowed from offsite-tuning: + https://github.com/mit-han-lab/offsite-tuning/blob/main/offsite_tuning + /utils.py + """ + module.old_forward = module.forward + module.prologue = prologue + + def new_forward(self): + def lambda_forward(*args, **kwargs): + self.input_args = args + self.input_kwargs = kwargs + if self.prologue is not None: + x = self.prologue(args[0]) + else: + x = args[0] + args = (x, ) + args[1:] + return self.old_forward(*args, **kwargs) + + return lambda_forward + + module.forward = new_forward(module) + return module + + +def add_epilogue(module, epilogue): + """ + This function is borrowed from offsite-tuning: + https://github.com/mit-han-lab/offsite-tuning/blob/main/offsite_tuning + /utils.py + """ + module.old_forward = module.forward + module.epilogue = epilogue + + def new_forward(self): + def lambda_forward(*args, **kwargs): + output = self.old_forward(*args, **kwargs) + if isinstance(output, tuple): + x = output[0] + else: + x = output + + if self.epilogue is not None: + x = self.epilogue(x) + + if isinstance(output, tuple): + output = (x, ) + output[1:] + else: + output = x + + self.cached_output = x + return output + + return lambda_forward + + module.forward = new_forward(module) + return module + + def get_layers(adapter_model): """ Modified from the official implementation: @@ -46,6 +109,11 @@ def set_layers(adapter_model, layers): logger.warning(f'Model {type(adapter_model.model)} not support, ' f'use default setting.') adapter_model.model.transformer.h = layers + adapter_model.student = layers + add_prologue(adapter_model.student[0], None) + add_epilogue(adapter_model.student[-1], None) + adapter_model.student_l = adapter_model.student[0] + adapter_model.student_r = adapter_model.student[-1] return adapter_model @@ -96,6 +164,8 @@ def generate_emulator_and_adapter(model: AdapterModel, for param in layer.parameters(): param.data = param.data.float() param.requires_grad = False + # Set teacher model + model.teacher = layers[l:r] emulator = COMP_FUNC_MAPPING[strategy](layers[l:r], **kwargs) @@ -114,9 +184,147 @@ def generate_emulator_and_adapter(model: AdapterModel, emulator_and_adapter.append(layers[idx]) new_model = copy.deepcopy(model) + # Set student model new_model = set_layers(new_model, emulator_and_adapter) gc.collect() torch.cuda.empty_cache() return new_model + + +def align_student_with_teacher(raw_model, adap_model, cfg, device, monitor): + def build_cfg_for_alignment(config): + new_cfg = copy.deepcopy(config) + new_cfg.defrost() + + # Overwrite 
`config.train` with + # `config.llm.offsite_tuning.emu_align.train` + for key, value in \ + new_cfg.llm.offsite_tuning.emu_align.train.optimizer.items(): + if key.startswith('__'): + continue + setattr(new_cfg, f'train.optimizer.{key}', value) + new_cfg.train.local_update_steps = \ + config.llm.offsite_tuning.emu_align.train.local_update_steps + new_cfg.train.batch_or_epoch = \ + config.llm.offsite_tuning.emu_align.train.batch_or_epoch + + # Overwrite `config.data` with + # `config.llm.offsite_tuning.emu_align.data` + for key, value in \ + new_cfg.llm.offsite_tuning.emu_align.data.items(): + if key.startswith('__'): + continue + setattr(new_cfg, f'data.{key}', value) + # Used for data translator + new_cfg.federate.client_num = 1 + + # TODO: might generate extra cfg file, delete + new_cfg.freeze() + return new_cfg + + does_train_emulator = True + if cfg.llm.offsite_tuning.emu_align.restore_from != '': + try: + if not os.path.exists( + cfg.llm.offsite_tuning.emu_align.restore_from): + logger.warning( + f'Invalid `emu_align.restore_from`:' + f' {cfg.llm.offsite_tuning.emu_align.restore_from}.') + else: + assert adap_model is not None + ckpt = torch.load( + cfg.llm.offsite_tuning.emu_align.restore_from, + map_location='cpu') + adap_model.load_state_dict(ckpt['model'], strict=False) + logger.info("Restored the adapter and emulator from ckpt") + logger.warning( + "Please make sure the dtype of model keep the same.") + # Make student un-trainable + for layer in adap_model.student: + for param in layer.parameters(): + param.requires_grad = False + does_train_emulator = False + except Exception as error: + logger.error(error) + + if not does_train_emulator: + return adap_model + + new_cfg = build_cfg_for_alignment(cfg) + + # Make student trainable + for layer in adap_model.student: + for param in layer.parameters(): + param.requires_grad = True + + # Loading held-out data + logger.info('Loading held-out dataset for alignment...') + data, modified_cfg = get_data(new_cfg.clone()) + new_cfg.merge_from_other_cfg(modified_cfg) + + # Create `KDTrainer` and train + kd_trainer = KDTrainer(raw_model, + adap_model, + data[1], + device, + new_cfg, + only_for_eval=False, + monitor=monitor) + logger.info('Start to align student model with teacher model...') + kd_trainer.train() + logger.info('Alignment finished!') + + # Save aligned model + adap_model.save_model(cfg.llm.offsite_tuning.emu_align.save_to) + + # Make student un-trainable + for layer in adap_model.student: + for param in layer.parameters(): + param.requires_grad = False + + return adap_model + + +def wrap_offsite_tuning_for_eval(model, config): + logger.info('===============use offsite tuning===============') + # We use offsite-tuning in this experiment + # Use adapter model instead + compress_strategy = config.llm.offsite_tuning.strategy + emulator_l = config.llm.offsite_tuning.emu_l + emulator_r = config.llm.offsite_tuning.emu_r + offsite_tuning_kwargs = config.llm.offsite_tuning.kwargs[0] + adap_model = \ + generate_emulator_and_adapter(model, + strategy=compress_strategy, + emulator_l=emulator_l, + emulator_r=emulator_r, + **offsite_tuning_kwargs) + # Load kd model if ckpt exits + if config.llm.offsite_tuning.emu_align.use: + if config.llm.offsite_tuning.emu_align.restore_from != '': + try: + ckpt = torch.load( + config.llm.offsite_tuning.emu_align.restore_from, + map_location='cpu', + ) + adap_model.load_state_dict(ckpt['model'], strict=False) + logger.info("Restored the adapter and emulator from ckpt") + except Exception as error: + 
logger.warning(error) + + if config.llm.offsite_tuning.eval_type == 'emu': + model = adap_model + elif config.llm.offsite_tuning.eval_type == 'full': + new_model_state_dict = model.state_dict() + for key, value in zip(model.state_dict().keys(), + adap_model.state_dict().values()): + new_model_state_dict[key] = value + model.load_state_dict(new_model_state_dict, strict=False) + del adap_model + else: + raise NotImplementedError( + '`config.llm.offsite_tuning.eval_type` should be chosen from ' + '`["emu", "full"]`.') + return model diff --git a/federatedscope/llm/trainer/trainer.py b/federatedscope/llm/trainer/trainer.py index 83d0b660c..405cbf0c0 100644 --- a/federatedscope/llm/trainer/trainer.py +++ b/federatedscope/llm/trainer/trainer.py @@ -69,6 +69,15 @@ def _hook_on_fit_end(self, ctx): } setattr(ctx, 'eval_metrics', eval_results) + # TODO: make this as a hook function + # Move trainable part to `cpu`, which can save memory but cost time + if ctx.cfg.llm.adapter.mv_to_cpu: + for p in ctx.model.parameters(): + if p.requires_grad: + p.data = p.to('cpu') + if p.grad is not None: + p.grad.data = p.grad.to('cpu') + def _hook_on_batch_forward_flop_count(self, ctx): """ The monitoring hook to calculate the flops during the fl course From ec9026d56bd38125ff86f847e237674a5fdc429d Mon Sep 17 00:00:00 2001 From: Weirui Kuang <39145382+rayrayraykk@users.noreply.github.com> Date: Mon, 31 Jul 2023 17:13:49 -1000 Subject: [PATCH 18/23] fix_div_by_zero (#673) --- federatedscope/llm/trainer/trainer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/federatedscope/llm/trainer/trainer.py b/federatedscope/llm/trainer/trainer.py index 405cbf0c0..de5582bfc 100644 --- a/federatedscope/llm/trainer/trainer.py +++ b/federatedscope/llm/trainer/trainer.py @@ -61,11 +61,13 @@ def _hook_on_batch_end(self, ctx): ctx.loss_regular_total += float(ctx.get("loss_regular", 0.)) def _hook_on_fit_end(self, ctx): + avg_loss = 0 if float( + ctx.num_samples) == 0 else ctx.loss_batch_total / float( + ctx.num_samples) eval_results = { f'{ctx.cur_split}_loss': ctx.loss_batch_total, f'{ctx.cur_split}_total': ctx.num_samples, - f'{ctx.cur_split}_avg_loss': ctx.loss_batch_total / - float(ctx.num_samples), + f'{ctx.cur_split}_avg_loss': avg_loss, } setattr(ctx, 'eval_metrics', eval_results) From c09bfe03ad51abac14e497897a24d4a0af176722 Mon Sep 17 00:00:00 2001 From: Weirui Kuang <39145382+rayrayraykk@users.noreply.github.com> Date: Mon, 31 Jul 2023 21:10:33 -1000 Subject: [PATCH 19/23] Fix offsite tuning eval (#674) --- federatedscope/llm/misc/fschat.py | 25 +++++++++++---------- federatedscope/llm/offsite_tuning/server.py | 3 +++ federatedscope/llm/offsite_tuning/utils.py | 16 ++++++++++++- 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/federatedscope/llm/misc/fschat.py b/federatedscope/llm/misc/fschat.py index 90c6f98d3..f55da9a98 100644 --- a/federatedscope/llm/misc/fschat.py +++ b/federatedscope/llm/misc/fschat.py @@ -22,22 +22,23 @@ def __init__(self, config): self.tokenizer, _ = get_tokenizer(model_name, config.data.root, config.llm.tok_len) self.model = get_llm(config) - if config.llm.offsite_tuning.use: - from federatedscope.llm.offsite_tuning.utils import \ - wrap_offsite_tuning_for_eval - self.model = wrap_offsite_tuning_for_eval(self.model, config) self.device = f'cuda:{config.device}' self.add_special_tokens = True - try: - ckpt = torch.load(config.federate.save_to, map_location='cpu') - if 'model' and 'cur_round' in ckpt: - self.model.load_state_dict(ckpt['model']) - else: - 
self.model.load_state_dict(ckpt) - except Exception as error: - print(f"{error}, will use raw model.") + if config.llm.offsite_tuning.use: + from federatedscope.llm.offsite_tuning.utils import \ + wrap_offsite_tuning_for_eval + self.model = wrap_offsite_tuning_for_eval(self.model, config) + else: + try: + ckpt = torch.load(config.federate.save_to, map_location='cpu') + if 'model' and 'cur_round' in ckpt: + self.model.load_state_dict(ckpt['model']) + else: + self.model.load_state_dict(ckpt) + except Exception as error: + print(f"{error}, will use raw model.") if config.train.is_enable_half: self.model.half() diff --git a/federatedscope/llm/offsite_tuning/server.py b/federatedscope/llm/offsite_tuning/server.py index f0f13b41a..45f84a654 100644 --- a/federatedscope/llm/offsite_tuning/server.py +++ b/federatedscope/llm/offsite_tuning/server.py @@ -49,6 +49,9 @@ def __init__(self, monitor=Monitor( config, monitored_object=self)) + # No need for this attr + if hasattr(adap_model, 'teacher'): + del adap_model.teacher self.raw_model = model super(OffsiteTuningServer, diff --git a/federatedscope/llm/offsite_tuning/utils.py b/federatedscope/llm/offsite_tuning/utils.py index 19d72d7ab..6d2ea325b 100644 --- a/federatedscope/llm/offsite_tuning/utils.py +++ b/federatedscope/llm/offsite_tuning/utils.py @@ -277,6 +277,7 @@ def build_cfg_for_alignment(config): logger.info('Alignment finished!') # Save aligned model + del adap_model.teacher adap_model.save_model(cfg.llm.offsite_tuning.emu_align.save_to) # Make student un-trainable @@ -302,7 +303,8 @@ def wrap_offsite_tuning_for_eval(model, config): emulator_r=emulator_r, **offsite_tuning_kwargs) # Load kd model if ckpt exits - if config.llm.offsite_tuning.emu_align.use: + if config.llm.offsite_tuning.emu_align.use and \ + config.llm.offsite_tuning.eval_type == 'emu': if config.llm.offsite_tuning.emu_align.restore_from != '': try: ckpt = torch.load( @@ -314,9 +316,21 @@ def wrap_offsite_tuning_for_eval(model, config): except Exception as error: logger.warning(error) + # Load ckpt for eval + try: + ckpt = torch.load(config.federate.save_to, map_location='cpu') + if 'model' and 'cur_round' in ckpt: + adap_model.load_state_dict(ckpt['model']) + else: + adap_model.load_state_dict(ckpt) + except Exception as error: + logger.warning(f"{error}, will use raw model.") + if config.llm.offsite_tuning.eval_type == 'emu': model = adap_model + del model.teacher elif config.llm.offsite_tuning.eval_type == 'full': + # Raw model load adapter from adapter_and_emulator new_model_state_dict = model.state_dict() for key, value in zip(model.state_dict().keys(), adap_model.state_dict().values()): From 366a1806a7049cf194bb06b5f0706a8a5216aef7 Mon Sep 17 00:00:00 2001 From: Weirui Kuang <39145382+rayrayraykk@users.noreply.github.com> Date: Wed, 2 Aug 2023 20:06:07 -1000 Subject: [PATCH 20/23] Fix and update distillation (#675) --- federatedscope/core/configs/cfg_llm.py | 1 + .../offsite_tuning/dolly/dolly_fed.yaml | 6 +-- .../exp_yaml/offsite_tuning/gsm/gsm_fed.yaml | 6 +-- .../offsite_tuning/rosetta/rosetta_fed.yaml | 6 +-- .../rosetta_9_clients/rosetta_federate.yaml | 6 +-- federatedscope/llm/offsite_tuning/server.py | 3 ++ federatedscope/llm/offsite_tuning/utils.py | 41 +++++++++++++------ 7 files changed, 44 insertions(+), 25 deletions(-) diff --git a/federatedscope/core/configs/cfg_llm.py b/federatedscope/core/configs/cfg_llm.py index 39a20920e..b128c87fb 100644 --- a/federatedscope/core/configs/cfg_llm.py +++ b/federatedscope/core/configs/cfg_llm.py @@ -54,6 +54,7 @@ def 
extend_llm_cfg(cfg): cfg.llm.offsite_tuning.emu_align.use = False cfg.llm.offsite_tuning.emu_align.restore_from = '' cfg.llm.offsite_tuning.emu_align.save_to = '' + cfg.llm.offsite_tuning.emu_align.exit_after_align = False # Server held-out data cfg.llm.offsite_tuning.emu_align.data = CN() diff --git a/federatedscope/llm/baseline/exp_yaml/offsite_tuning/dolly/dolly_fed.yaml b/federatedscope/llm/baseline/exp_yaml/offsite_tuning/dolly/dolly_fed.yaml index d5424e2c9..15c156b4a 100644 --- a/federatedscope/llm/baseline/exp_yaml/offsite_tuning/dolly/dolly_fed.yaml +++ b/federatedscope/llm/baseline/exp_yaml/offsite_tuning/dolly/dolly_fed.yaml @@ -34,10 +34,10 @@ llm: train: local_update_steps: 500 batch_or_epoch: 'batch' - lm_loss_weight: 0.1 - kd_loss_weight: 0.9 + lm_loss_weight: 0.0 + kd_loss_weight: 1.0 optimizer: - lr: 0.001 + lr: 0.0001 dataloader: batch_size: 1 model: diff --git a/federatedscope/llm/baseline/exp_yaml/offsite_tuning/gsm/gsm_fed.yaml b/federatedscope/llm/baseline/exp_yaml/offsite_tuning/gsm/gsm_fed.yaml index 012a454e3..d7866a809 100644 --- a/federatedscope/llm/baseline/exp_yaml/offsite_tuning/gsm/gsm_fed.yaml +++ b/federatedscope/llm/baseline/exp_yaml/offsite_tuning/gsm/gsm_fed.yaml @@ -34,10 +34,10 @@ llm: train: local_update_steps: 500 batch_or_epoch: 'batch' - lm_loss_weight: 0.1 - kd_loss_weight: 0.9 + lm_loss_weight: 0.0 + kd_loss_weight: 1.0 optimizer: - lr: 0.001 + lr: 0.0001 dataloader: batch_size: 1 model: diff --git a/federatedscope/llm/baseline/exp_yaml/offsite_tuning/rosetta/rosetta_fed.yaml b/federatedscope/llm/baseline/exp_yaml/offsite_tuning/rosetta/rosetta_fed.yaml index d10cdf132..461b83f7f 100644 --- a/federatedscope/llm/baseline/exp_yaml/offsite_tuning/rosetta/rosetta_fed.yaml +++ b/federatedscope/llm/baseline/exp_yaml/offsite_tuning/rosetta/rosetta_fed.yaml @@ -34,10 +34,10 @@ llm: train: local_update_steps: 500 batch_or_epoch: 'batch' - lm_loss_weight: 0.1 - kd_loss_weight: 0.9 + lm_loss_weight: 0.0 + kd_loss_weight: 1.0 optimizer: - lr: 0.001 + lr: 0.0001 dataloader: batch_size: 1 model: diff --git a/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_federate.yaml b/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_federate.yaml index 8ee587ff0..214530db9 100644 --- a/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_federate.yaml +++ b/federatedscope/llm/baseline/exp_yaml/rosetta_9_clients/rosetta_federate.yaml @@ -7,7 +7,7 @@ federate: client_num: 9 total_round_num: 500 save_to: "llama_rosetta_9_fed_30*500_0.003_32_0.1.ckpt" - save_freq: 100 + save_freq: -1 share_local_model: True online_aggr: False data: @@ -21,7 +21,7 @@ llm: max_len: 1000 adapter: use: True - args: [ { 'adapter_package': 'peft', 'adapter_method': 'lora', 'r': 8, 'lora_alpha': 32, 'lora_dropout': 0.1 } ] + args: [ { 'adapter_package': 'peft', 'adapter_method': 'lora', 'r': 8, 'lora_alpha': 32, 'lora_dropout': 0.0 } ] dataloader: batch_size: 1 model: @@ -30,7 +30,7 @@ train: local_update_steps: 30 batch_or_epoch: batch optimizer: - lr: 0.003 + lr: 0.0001 weight_decay: 0.0 is_enable_half: True criterion: diff --git a/federatedscope/llm/offsite_tuning/server.py b/federatedscope/llm/offsite_tuning/server.py index 45f84a654..264e73e4b 100644 --- a/federatedscope/llm/offsite_tuning/server.py +++ b/federatedscope/llm/offsite_tuning/server.py @@ -1,3 +1,4 @@ +import os import logging from federatedscope.core.message import Message @@ -49,6 +50,8 @@ def __init__(self, monitor=Monitor( config, monitored_object=self)) + if 
config.llm.offsite_tuning.emu_align.exit_after_align: + os._exit(0) # No need for this attr if hasattr(adap_model, 'teacher'): del adap_model.teacher diff --git a/federatedscope/llm/offsite_tuning/utils.py b/federatedscope/llm/offsite_tuning/utils.py index 6d2ea325b..5aff9d172 100644 --- a/federatedscope/llm/offsite_tuning/utils.py +++ b/federatedscope/llm/offsite_tuning/utils.py @@ -95,7 +95,7 @@ def get_layers(adapter_model): return layers -def set_layers(adapter_model, layers): +def set_layers(adapter_model, layers, emu_l=0, emu_r=-1): if isinstance(adapter_model.model, OPTForCausalLM): adapter_model.model.model.decoder.layers = layers elif isinstance(adapter_model.model, GPT2LMHeadModel): @@ -109,7 +109,8 @@ def set_layers(adapter_model, layers): logger.warning(f'Model {type(adapter_model.model)} not support, ' f'use default setting.') adapter_model.model.transformer.h = layers - adapter_model.student = layers + adapter_model.student = layers[emu_l:emu_r] + adapter_model.adapter = layers[:emu_l] + layers[emu_r:] add_prologue(adapter_model.student[0], None) add_epilogue(adapter_model.student[-1], None) adapter_model.student_l = adapter_model.student[0] @@ -165,7 +166,7 @@ def generate_emulator_and_adapter(model: AdapterModel, param.data = param.data.float() param.requires_grad = False # Set teacher model - model.teacher = layers[l:r] + model.teacher = layers[l:r] # Ref for old model emulator = COMP_FUNC_MAPPING[strategy](layers[l:r], **kwargs) @@ -185,7 +186,7 @@ def generate_emulator_and_adapter(model: AdapterModel, new_model = copy.deepcopy(model) # Set student model - new_model = set_layers(new_model, emulator_and_adapter) + new_model = set_layers(new_model, emulator_and_adapter, l, r) gc.collect() torch.cuda.empty_cache() @@ -193,6 +194,17 @@ def generate_emulator_and_adapter(model: AdapterModel, return new_model +def convert_layers_train_state(layers, is_trainable=True): + if is_trainable: + for layer in layers: + for param in layer.parameters(): + param.requires_grad = True + else: + for layer in layers: + for param in layer.parameters(): + param.requires_grad = False + + def align_student_with_teacher(raw_model, adap_model, cfg, device, monitor): def build_cfg_for_alignment(config): new_cfg = copy.deepcopy(config) @@ -242,22 +254,24 @@ def build_cfg_for_alignment(config): logger.warning( "Please make sure the dtype of model keep the same.") # Make student un-trainable - for layer in adap_model.student: - for param in layer.parameters(): - param.requires_grad = False + convert_layers_train_state(adap_model.student, + is_trainable=False) does_train_emulator = False except Exception as error: logger.error(error) + # Case1: Load ckpt, so we do not need to train student if not does_train_emulator: return adap_model + # Case2: Restore fail or not assigned, start to train student new_cfg = build_cfg_for_alignment(cfg) + # Make adapter un-trainable + convert_layers_train_state(adap_model.adapter, is_trainable=False) + # Make student trainable - for layer in adap_model.student: - for param in layer.parameters(): - param.requires_grad = True + convert_layers_train_state(adap_model.student, is_trainable=True) # Loading held-out data logger.info('Loading held-out dataset for alignment...') @@ -280,10 +294,11 @@ def build_cfg_for_alignment(config): del adap_model.teacher adap_model.save_model(cfg.llm.offsite_tuning.emu_align.save_to) + # Make adapter trainable + convert_layers_train_state(adap_model.adapter, is_trainable=True) + # Make student un-trainable - for layer in adap_model.student: - 
for param in layer.parameters(): - param.requires_grad = False + convert_layers_train_state(adap_model.student, is_trainable=False) return adap_model From 0cb50406784e334c47e1982b88576052989ecfd9 Mon Sep 17 00:00:00 2001 From: qbc Date: Tue, 8 Aug 2023 20:49:31 +0800 Subject: [PATCH 21/23] fix bugs for local train of ot (#678) --- federatedscope/llm/offsite_tuning/server.py | 17 +++++++++-------- federatedscope/main.py | 4 +++- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/federatedscope/llm/offsite_tuning/server.py b/federatedscope/llm/offsite_tuning/server.py index 264e73e4b..ad0a0dd8f 100644 --- a/federatedscope/llm/offsite_tuning/server.py +++ b/federatedscope/llm/offsite_tuning/server.py @@ -60,14 +60,15 @@ def __init__(self, super(OffsiteTuningServer, self).__init__(ID, state, config, data, adap_model, client_num, total_round_num, device, strategy, **kwargs) - self.raw_model_trainer = get_trainer(model=self.raw_model, - data=self.data, - device=self.device, - config=self._cfg, - only_for_eval=True, - monitor=Monitor( - self._cfg, - monitored_object=self)) + if self._cfg.llm.offsite_tuning.eval_type == 'full': + self.raw_model_trainer = get_trainer(model=self.raw_model, + data=self.data, + device=self.device, + config=self._cfg, + only_for_eval=True, + monitor=Monitor( + self._cfg, + monitored_object=self)) def trigger_for_feat_engr(self, trigger_train_func, diff --git a/federatedscope/main.py b/federatedscope/main.py index 53b8d680b..f404f30cc 100644 --- a/federatedscope/main.py +++ b/federatedscope/main.py @@ -49,7 +49,9 @@ if init_cfg.federate.client_idx_for_local_train != 0: init_cfg.federate.client_num = 1 - data = {1: data[init_cfg.federate.client_idx_for_local_train]} + new_data = {0: data[0]} if 0 in data.keys() else dict() + new_data[1] = data[init_cfg.federate.client_idx_for_local_train] + data = new_data init_cfg.freeze() From 688b55d9c86988ed70681aae90923a3f3dcdbdfe Mon Sep 17 00:00:00 2001 From: Weirui Kuang <39145382+rayrayraykk@users.noreply.github.com> Date: Tue, 8 Aug 2023 20:10:44 -1000 Subject: [PATCH 22/23] Fix save best model(#679) --- federatedscope/core/workers/server.py | 37 ++++++++++++++++++--------- 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/federatedscope/core/workers/server.py b/federatedscope/core/workers/server.py index afa43a730..b73ab7799 100644 --- a/federatedscope/core/workers/server.py +++ b/federatedscope/core/workers/server.py @@ -612,11 +612,35 @@ def merge_eval_results_from_all_clients(self): del formatted_logs[key] logger.info(formatted_logs) formatted_logs_all_set.update(formatted_logs) - update_best_this_round = self._monitor.update_best_result( + self._monitor.update_best_result( self.best_results, metrics_all_clients, results_type="unseen_client_best_individual" if merge_type == "unseen" else "client_best_individual") + + self._monitor.save_formatted_results(formatted_logs) + + update_prior = -1 # Bigger the higher priority + update_prior_list = ['fairness', 'avg', 'weighted_avg'] + update_best_this_round = False + for form in self._cfg.eval.report: + if form in update_prior_list: + update_prior_tmp = update_prior_list.index(form) + else: + update_prior_tmp = -1 + if form != "raw": + metric_name = form + "_unseen" if merge_type == \ + "unseen" else form + update_best_this_round_tmp = \ + self._monitor.update_best_result( + self.best_results, + formatted_logs[f"Results_{metric_name}"], + results_type=f"unseen_client_summarized_{form}" + if merge_type == "unseen" else + f"client_summarized_{form}") + if 
update_prior_tmp >= update_prior: + update_prior = update_prior_tmp + update_best_this_round = update_best_this_round_tmp if update_best_this_round: # When the frequency of evaluations is high, # the frequency of writing to disk in the early stages @@ -624,17 +648,6 @@ def merge_eval_results_from_all_clients(self): if self._cfg.federate.save_to != '': self.aggregator.save_model(self._cfg.federate.save_to, self.state) - self._monitor.save_formatted_results(formatted_logs) - for form in self._cfg.eval.report: - if form != "raw": - metric_name = form + "_unseen" if merge_type == \ - "unseen" else form - self._monitor.update_best_result( - self.best_results, - formatted_logs[f"Results_{metric_name}"], - results_type=f"unseen_client_summarized_{form}" - if merge_type == "unseen" else - f"client_summarized_{form}") return formatted_logs_all_set From 2805af09e947418e0aebe3450b2a7dd2f0e283c4 Mon Sep 17 00:00:00 2001 From: Weirui Kuang <39145382+rayrayraykk@users.noreply.github.com> Date: Thu, 10 Aug 2023 16:26:35 -1000 Subject: [PATCH 23/23] Need keep raw model when kd applied (#680) --- federatedscope/llm/offsite_tuning/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/federatedscope/llm/offsite_tuning/utils.py b/federatedscope/llm/offsite_tuning/utils.py index 5aff9d172..53c9be113 100644 --- a/federatedscope/llm/offsite_tuning/utils.py +++ b/federatedscope/llm/offsite_tuning/utils.py @@ -175,18 +175,22 @@ def generate_emulator_and_adapter(model: AdapterModel, # Adapter before Emulator for idx in range(l): emulator_and_adapter.append(layers[idx]) + emu_l = l # Emulator for idx in range(len(emulator)): emulator_and_adapter.append(emulator[idx]) + emu_r = emu_l + len(emulator) # Adapter after Emulator for idx in range(r, len(layers)): emulator_and_adapter.append(layers[idx]) + # Need keep raw model when kd applied new_model = copy.deepcopy(model) + new_emulator_and_adapter = copy.deepcopy(emulator_and_adapter) # Set student model - new_model = set_layers(new_model, emulator_and_adapter, l, r) + new_model = set_layers(new_model, new_emulator_and_adapter, emu_l, emu_r) gc.collect() torch.cuda.empty_cache()
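
---

Editor's note on the last two patches: they change how the offsite-tuning student (emulator) and adapter partitions are derived (`emu_l`/`emu_r` instead of the raw `l`/`r` indices) and deep-copy `emulator_and_adapter` before `set_layers` so the raw model's layer list is never mutated when knowledge distillation is applied. The following is a minimal, self-contained sketch of that index bookkeeping, not FederatedScope code: the names `split_with_emulator` and `keep_every`, and the naive uniform-drop "compression", are illustrative assumptions standing in for the real drop/quantization/pruning/distillation strategies in `federatedscope/llm/offsite_tuning/utils.py`.

```python
# Hypothetical sketch of the emulator/adapter split bookkeeping; names and the
# uniform-drop compression are illustrative only, not part of FederatedScope.
import copy


def split_with_emulator(layers, l, r, keep_every=2):
    """Build `adapter-before + emulator + adapter-after` from a raw layer list.

    The emulator here is a naive uniform drop of the [l, r) block, standing in
    for the real compression strategies. Returns the combined list plus
    (emu_l, emu_r), the emulator's span inside the combined list.
    """
    emulator = [layers[i] for i in range(l, r, keep_every)]  # toy compression
    combined = layers[:l] + emulator + layers[r:]
    emu_l, emu_r = l, l + len(emulator)
    # Deep-copy before any in-place edits so the donor (raw) layer list keeps
    # its original contents -- the motivation for copying
    # `emulator_and_adapter` before `set_layers` in the last patch above.
    return copy.deepcopy(combined), emu_l, emu_r


if __name__ == "__main__":
    raw_layers = [f"block_{i}" for i in range(12)]
    combined, emu_l, emu_r = split_with_emulator(raw_layers, l=2, r=10)
    student = combined[emu_l:emu_r]                # emulator trained on clients
    adapter = combined[:emu_l] + combined[emu_r:]  # adapter layers around it
    assert len(raw_layers) == 12                   # raw layer list untouched
    print("student:", student)
    print("adapter:", adapter)
```

The key design point the sketch mirrors is that `emu_r` must be computed from the *compressed* emulator length (`l + len(emulator)`), not the original `r`, otherwise the student/adapter slices taken from the combined list are misaligned.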