From ab55445718b30403ac2faf2db31d746e55acf26e Mon Sep 17 00:00:00 2001
From: itayhubara
Date: Thu, 18 Jan 2024 16:31:56 +0200
Subject: [PATCH 01/16] adding initial code drop for llm finetune

---
 llm_finetune/README.md                    |  90 ++++
 llm_finetune/configs/default_config.yaml  |  22 +
 llm_finetune/convergence_example.txt      | 508 ++++++++++++++++++++++
 llm_finetune/requirements.txt             |   9 +
 llm_finetune/run_docker.sh                |   2 +
 llm_finetune/run_llama_70B_scrolls_r16.sh |  24 +
 llm_finetune/scripts/eval.py              | 190 ++++++++
 llm_finetune/scripts/train.py             | 221 ++++++++++
 llm_finetune/scripts/utils.py             | 281 ++++++++++++
 9 files changed, 1347 insertions(+)
 create mode 100644 llm_finetune/README.md
 create mode 100644 llm_finetune/configs/default_config.yaml
 create mode 100644 llm_finetune/convergence_example.txt
 create mode 100644 llm_finetune/requirements.txt
 create mode 100755 llm_finetune/run_docker.sh
 create mode 100644 llm_finetune/run_llama_70B_scrolls_r16.sh
 create mode 100644 llm_finetune/scripts/eval.py
 create mode 100644 llm_finetune/scripts/train.py
 create mode 100644 llm_finetune/scripts/utils.py

diff --git a/llm_finetune/README.md b/llm_finetune/README.md
new file mode 100644
index 000000000..91971dc10
--- /dev/null
+++ b/llm_finetune/README.md
@@ -0,0 +1,90 @@
+# LoRA benchmark
+
+LoRA benchmark on GPU (Nvidia A100 80GB). Inspired by [this blog post](https://medium.com/@sourabmangrulkar/falcon-180b-finetuning-using-peft-and-deepspeed-b92643091d99) and [this script](https://github.com/pacman100/DHS-LLM-Workshop/blob/main/chat_assistant/training/train.py).
+
+
+## Setup
+
+Run the following:
+```bash
+sudo ./run_docker.sh
+cd lora
+pip install -r requirements.txt
+```
+
+> The Docker run command contains `-v /home/regis_huggingface_co/workspace:/root/workspace --workdir /root/workspace`. Feel free to change these flags to suit your setup.
+
+You will also need to run the following to install flash attention:
+```
+pip install flash-attn --no-build-isolation
+```
+
+> For flash attention, make sure that the following command returns 0:
+> ```
+> ninja --version >/dev/null && echo $?
+> ```
+> If not, run
+> ```
+> pip uninstall -y ninja && pip install ninja
+> ```
+> and install `flash-attn` again.
+> More information [here](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features).
+
+Make sure you have requested permission to download the Llama 2 weights from the Hugging Face Hub: https://huggingface.co/meta-llama/Llama-2-7b-hf
+Then, log in to your Hugging Face account with a read token by running:
+```
+huggingface-cli login
+```
+
+
+## Llama2-70B on 8 devices
+
+Run:
+```bash
+accelerate launch --config_file configs/default_config.yaml scripts/train.py \
+--model_name meta-llama/Llama-2-70b-hf \
+--dataset_name "tau/scrolls" --dataset_config_name "gov_report" \
+--max_seq_len 8192 \
+--bf16 True \
+--logging_steps 1 \
+--eval_steps 22 \
+--output_dir "/tmp/llama-70b" \
+--per_device_train_batch_size 1 \
+--gradient_accumulation_steps 1 \
+--dataset_text_field "input" \
+--lr_scheduler_type "cosine" \
+--learning_rate 1e-3 \
+--warmup_ratio 0.03 \
+--use_gradient_checkpointing True \
+--use_peft_lora True \
+--lora_r 16 \
+--lora_alpha 32 \
+--lora_dropout 0.1 \
+--max_steps 440 \
+--use_flash_attn \
+--lora_target_modules "q_proj,v_proj,k_proj,o_proj"
+```
+where the Accelerate config file is [this one](https://github.com/regisss/lora/blob/main/configs/default_config.yaml).
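+
+> For reference, the effective global batch size of this command is `per_device_train_batch_size * gradient_accumulation_steps * num_processes`. A minimal sketch with the values above; `num_processes` comes from the Accelerate config, and this snippet is illustrative rather than part of the benchmark scripts:
+> ```python
+> per_device_train_batch_size = 1  # --per_device_train_batch_size
+> gradient_accumulation_steps = 1  # --gradient_accumulation_steps
+> num_processes = 8                # num_processes in configs/default_config.yaml
+> # 8 sequences of up to 8192 tokens per optimizer step
+> print(per_device_train_batch_size * gradient_accumulation_steps * num_processes)
+> ```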
+
+> Using flash attention with `--use_flash_attn` is necessary for training on 8k-token sequences.
+
+Learning curves of such a run can be found here: https://huggingface.co/regisss/test_5/tensorboard
+
+
+## Evaluation
+
+To run evaluation for summarizing texts, you can run:
+- Without LoRA adapter weights:
+  ```
+  python scripts/eval.py --model_name meta-llama/Llama-2-70b-hf --max_new_tokens 900 --seq_length 8192 --do_sample --dataset_name "tau/scrolls" --dataset_config_name "gov_report"
+  ```
+- With LoRA adapter weights:
+  ```
+  python scripts/eval.py --peft_model_name path_to_my_lora_model --max_new_tokens 900 --seq_length 8192 --do_sample --dataset_name "tau/scrolls" --dataset_config_name "gov_report"
+  ```
+## Expected outcome
+
+A clean output (train and eval loss) of a single run with 440 steps can be found under
+```
+ convergence_example.txt
+```
\ No newline at end of file
diff --git a/llm_finetune/configs/default_config.yaml b/llm_finetune/configs/default_config.yaml
new file mode 100644
index 000000000..e422c0364
--- /dev/null
+++ b/llm_finetune/configs/default_config.yaml
@@ -0,0 +1,22 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+deepspeed_config:
+  gradient_accumulation_steps: 1
+  offload_optimizer_device: none
+  offload_param_device: none
+  zero3_init_flag: true
+  zero3_save_16bit_model: true
+  zero_stage: 3
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
diff --git a/llm_finetune/convergence_example.txt b/llm_finetune/convergence_example.txt
new file mode 100644
index 000000000..9d5c9b218
--- /dev/null
+++ b/llm_finetune/convergence_example.txt
@@ -0,0 +1,508 @@
+  0%|          | 0/440 [00:00 args.seq_length - args.max_new_tokens:
+            to_keep.append(False)
+        else:
+            to_keep.append(True)
+    return to_keep
+
+
+test_dataset = test_dataset.filter(
+    filter_function,
+    batched=True,
+    num_proc=2,
+)
+print(f"Size of the test set: {len(test_dataset)}.")
+
+
+@dataclass
+class CustomDataCollator:
+    tokenizer: PreTrainedTokenizerBase
+    padding: Union[bool, str] = True
+    max_length: Optional[int] = None
+    pad_to_multiple_of: Optional[int] = None
+    return_tensors: str = "pt"
+
+    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
+        input_ids = [{"input_ids": sample["input_ids"]} for sample in features]
+        batch = self.tokenizer.pad(
+            input_ids,
+            padding=self.padding,
+            max_length=self.max_length,
+            pad_to_multiple_of=self.pad_to_multiple_of,
+            return_tensors=self.return_tensors,
+        )
+        batch["ground_truth"] = [sample["ground_truth"] for sample in features]
+        return batch
+
+
+dataloader = DataLoader(
+    test_dataset,
+    batch_size=1,
+    collate_fn=CustomDataCollator(tokenizer),
+)
+
+
+def postprocess_text(preds, labels):
+    preds = [pred.strip() for pred in preds]
+    labels = [label.strip() for label in labels]
+
+    # rougeLSum expects newline after each sentence
+    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
+    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]
+
+    return preds, labels
+
+
+metric = evaluate.load("rouge")
+
+
+def compute_metrics(generated, ground_truth):
+    # Some simple post-processing
+    decoded_preds, decoded_labels = postprocess_text(generated, ground_truth)
+    result = metric.compute(
+        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
+    )
+    result = {k: round(v * 100, 4) for
k, v in result.items()} + prediction_lens = [ + np.count_nonzero(gen != tokenizer.pad_token_id) for gen in generated + ] + result["gen_len"] = np.mean(prediction_lens) + return result + + +generated_sequences = [] +ground_truths = [] +for batch in tqdm(dataloader): + outputs = model.generate( + inputs=batch["input_ids"].to("cuda"),do_sample=args.do_sample , max_new_tokens=args.max_new_tokens + ) + outputs = [ + output.split("### Summary:\n ")[-1] + for output in tokenizer.batch_decode(outputs, skip_special_tokens=True) + ] + + print("Batch outputs:", outputs) + print("Batch ground truths:", batch["ground_truth"]) + generated_sequences += outputs + ground_truths += batch["ground_truth"] + print("Current results:", compute_metrics(generated_sequences, ground_truths)) + +res = compute_metrics(generated_sequences, ground_truths) +print("Final results:", res) diff --git a/llm_finetune/scripts/train.py b/llm_finetune/scripts/train.py new file mode 100644 index 000000000..cca88ebe8 --- /dev/null +++ b/llm_finetune/scripts/train.py @@ -0,0 +1,221 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from dataclasses import dataclass, field +from pathlib import Path +import os +import subprocess +from typing import Optional +import os +from transformers import HfArgumentParser, TrainingArguments, Trainer +from transformers.modeling_utils import unwrap_model +from utils import ( + create_and_prepare_model, + create_datasets, + SaveDeepSpeedPeftModelCallback, + peft_module_casting_to_bf16, +) + +# Define and parse arguments. +@dataclass +class ScriptArguments: + """ + These arguments vary depending on how many GPUs you have, what their capacity and features are, and what size model you want to train. + """ + + local_rank: Optional[int] = field( + default=-1, metadata={"help": "Used for multi-gpu"} + ) + + per_device_train_batch_size: Optional[int] = field(default=4) + per_device_eval_batch_size: Optional[int] = field(default=1) + gradient_accumulation_steps: Optional[int] = field(default=4) + learning_rate: Optional[float] = field(default=2e-4) + max_grad_norm: Optional[float] = field(default=0.3) + weight_decay: Optional[float] = field(default=0.001) + lora_alpha: Optional[int] = field(default=16) + lora_dropout: Optional[float] = field(default=0.1) + lora_r: Optional[int] = field(default=64) + lora_target_modules: Optional[str] = field( + default=None, + metadata={ + "help": "comma separated list of target modules to apply LoRA layers to" + }, + ) + max_seq_length: Optional[int] = field(default=512) + model_name: Optional[str] = field( + default=None, + metadata={ + "help": "The model that you want to train from the Hugging Face hub. E.g. gpt2, gpt2-xl, bert, etc." 
+        },
+    )
+
+    dataset_name: Optional[str] = field(
+        default="tau/scrolls",
+        metadata={"help": "The dataset to use."},
+    )
+    num_train_epochs: Optional[int] = field(
+        default=1,
+        metadata={"help": "The number of training epochs."},
+    )
+    fp16: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enables fp16 training."},
+    )
+    bf16: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enables bf16 training."},
+    )
+    gradient_checkpointing: Optional[bool] = field(
+        default=True,
+        metadata={"help": "Enables gradient checkpointing."},
+    )
+    optim: Optional[str] = field(
+        default="paged_adamw_32bit",
+        metadata={"help": "The optimizer to use."},
+    )
+    lr_scheduler_type: str = field(
+        default="constant",
+        metadata={
+            "help": "Learning rate schedule. Constant is a bit better than cosine, and has advantages for analysis."
+        },
+    )
+    max_steps: int = field(
+        default=-1, metadata={"help": "How many optimizer update steps to take."}
+    )
+    warmup_ratio: float = field(
+        default=0.03, metadata={"help": "Fraction of steps to do a warmup for."}
+    )
+    save_steps: int = field(
+        default=10, metadata={"help": "Save a checkpoint every X update steps."}
+    )
+    eval_steps: int = field(default=10, metadata={"help": "Eval the model every X steps."})
+    logging_steps: int = field(
+        default=10, metadata={"help": "Log every X update steps."}
+    )
+    output_dir: str = field(
+        default="results", metadata={"help": "Where to store the final model."}
+    )
+    use_flash_attn: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enables Flash attention for training."},
+    )
+    use_peft_lora: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enables PEFT LoRA for training."},
+    )
+    use_gradient_checkpointing: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enables Gradient Checkpointing."},
+    )
+    dataset_text_field: str = field(
+        default="text", metadata={"help": "Dataset field to use as input text."}
+    )
+    push_to_hub: Optional[bool] = field(
+        default=False,
+        metadata={"help": "If True, pushes the model to the HF Hub."},
+    )
+    num_workers: int = field(
+        default=4, metadata={"help": "Number of dataset workers to use."}
+    )
+    debug: Optional[bool] = field(
+        default=False,
+        metadata={
+            "help": "If True, tests things like proper saving/loading/logging of the model."
+        },
+    )
+
+    dataset_config_name: Optional[str] = field(default="gov_report")
+    hub_model_id: Optional[str] = field(default=None)
+    seed: Optional[int] = field(default=42)
+
+
+def main(args):
+    # training arguments
+    is_deepspeed_peft_enabled = (
+        os.environ.get("ACCELERATE_USE_DEEPSPEED", "False").lower() == "true"
+        and args.use_peft_lora
+    )
+    save_strategy = "steps"
+    training_arguments = TrainingArguments(
+        output_dir=args.output_dir,
+        per_device_train_batch_size=args.per_device_train_batch_size,
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        optim=args.optim,
+        learning_rate=args.learning_rate,
+        fp16=args.fp16,
+        bf16=args.bf16,
+        max_grad_norm=args.max_grad_norm,
+        warmup_ratio=args.warmup_ratio,
+        lr_scheduler_type=args.lr_scheduler_type,
+        num_train_epochs=args.num_train_epochs,
+        evaluation_strategy="steps",
+        save_strategy=save_strategy,
+        max_steps=args.max_steps,
+        eval_steps=args.eval_steps,
+        save_steps=args.save_steps,
+        logging_steps=args.logging_steps,
+        push_to_hub=args.push_to_hub,
+        gradient_checkpointing=args.use_gradient_checkpointing,
+        hub_model_id=args.hub_model_id,
+        report_to="tensorboard",
+        seed=args.seed,
+    )
+
+    # model
+    model, peft_config, tokenizer = create_and_prepare_model(args)
+    model.config.use_cache = False
+
+    # datasets
+    train_dataset, eval_dataset = create_datasets(tokenizer, args)
+
+
+    # trainer
+    trainer = Trainer(
+        model=model,
+        args=training_arguments,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+    )
+    trainer.accelerator.print(f"{trainer.model}")
+    if args.use_peft_lora:
+        trainer.model.print_trainable_parameters()
+
+    if is_deepspeed_peft_enabled:
+        trainer.add_callback(
+            SaveDeepSpeedPeftModelCallback(trainer, save_steps=args.save_steps)
+        )
+
+    if args.use_peft_lora:
+        peft_module_casting_to_bf16(trainer.model, args)
+
+    # train
+    trainer.train()
+
+    # Save the PEFT adapter on the main process
+    if trainer.args.process_index == 0:
+        if args.push_to_hub:
+            print("Push to hub...")
+            trainer.push_to_hub()
+            if args.use_peft_lora:
+                trainer.model.push_to_hub(args.output_dir)
+        else:
+            print("Save model...")
+            unwrap_model(trainer.model).save_pretrained(args.output_dir)
+
+
+if __name__ == "__main__":
+    parser = HfArgumentParser(ScriptArguments)
+    args = parser.parse_args_into_dataclasses()[0]
+    main(args)
diff --git a/llm_finetune/scripts/utils.py b/llm_finetune/scripts/utils.py
new file mode 100644
index 000000000..8a00687bc
--- /dev/null
+++ b/llm_finetune/scripts/utils.py
@@ -0,0 +1,281 @@
+import random
+import torch
+from torch.utils.data import IterableDataset
+from datasets import load_dataset
+from tqdm import tqdm
+import warnings
+from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
+
+from peft.tuners.lora import LoraLayer
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    TrainerCallback,
+    TrainingArguments,
+    TrainerState,
+    TrainerControl,
+)
+from itertools import chain
+from functools import partial
+
+
+class SaveDeepSpeedPeftModelCallback(TrainerCallback):
+    def __init__(self, trainer, save_steps=500):
+        self.trainer = trainer
+        self.save_steps = save_steps
+
+    def on_step_end(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        **kwargs,
+    ):
+        if (state.global_step + 1) % self.save_steps == 0:
+            self.trainer.accelerator.wait_for_everyone()
+            state_dict = self.trainer.accelerator.get_state_dict(self.trainer.deepspeed)
+            unwrapped_model = self.trainer.accelerator.unwrap_model(
+                self.trainer.deepspeed
+            )
+            if self.trainer.accelerator.is_main_process:
+                unwrapped_model.save_pretrained(args.output_dir, state_dict=state_dict)
+            self.trainer.accelerator.wait_for_everyone()
+        return control
+
+
+class ConstantLengthDataset(IterableDataset):
+    """
+    Iterable dataset that returns constant-length chunks of tokens from a stream of text files.
+        Args:
+            tokenizer (Tokenizer): The processor used for processing the data.
+            dataset (dataset.Dataset): Dataset with text files.
+            infinite (bool): If True, the iterator is reset after the dataset reaches its end; otherwise it stops.
+            seq_length (int): Length of token sequences to return.
+            num_of_sequences (int): Number of token sequences to keep in buffer.
+            chars_per_token (int): Number of characters per token used to estimate the number of tokens in the text buffer.
+            shuffle (bool): If True, the samples in each buffer are shuffled. Default is `True`.
+            add_eos_token (bool): If True, each buffer is delimited with an eos token. Default is `True`.
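+            Example (an illustrative note, derived from the buffer formula in __init__ below):
+                with the defaults (seq_length=1024, chars_per_token=3.6, num_of_sequences=1024),
+                about 3.8M characters are buffered before each tokenization pass, since
+                max_buffer_size = seq_length * chars_per_token * num_of_sequences.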
+ """ + + def __init__( + self, + tokenizer, + dataset, + infinite=False, + seq_length=1024, + num_of_sequences=1024, + chars_per_token=3.6, + content_field="content", + shuffle=True, + add_eos_token=True, + ): + self.tokenizer = tokenizer + self.concat_token_id = tokenizer.eos_token_id + self.dataset = dataset + self.seq_length = seq_length + self.infinite = infinite + self.current_size = 0 + self.max_buffer_size = seq_length * chars_per_token * num_of_sequences + self.content_field = content_field + self.shuffle = shuffle + self.add_eos_token = add_eos_token + + def __iter__(self): + iterator = iter(self.dataset) + more_examples = True + while more_examples: + buffer, buffer_len = [], 0 + while True: + if buffer_len >= self.max_buffer_size: + break + try: + buffer.append(next(iterator)[self.content_field]) + buffer_len += len(buffer[-1]) + except StopIteration: + if self.infinite: + iterator = iter(self.dataset) + else: + more_examples = False + break + + tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"] + all_token_ids = [] + for tokenized_input in tokenized_inputs: + if self.add_eos_token: + tokenized_input = tokenized_input + [self.concat_token_id] + all_token_ids.extend(tokenized_input) + examples = [] + for i in range(0, len(all_token_ids), self.seq_length): + input_ids = all_token_ids[i : i + self.seq_length] + if len(input_ids) == self.seq_length: + examples.append(input_ids) + if self.shuffle: + random.shuffle(examples) + for example in examples: + self.current_size += 1 + yield { + "input_ids": torch.LongTensor(example), + "labels": torch.LongTensor(example), + } + + +def chars_token_ratio(dataset, tokenizer, data_column, nb_examples=400): + """ + Estimate the average number of characters per token in the dataset. + """ + total_characters, total_tokens = 0, 0 + for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples): + total_characters += len(example[data_column]) + total_tokens += len(tokenizer(example[data_column]).tokens()) + + return total_characters / total_tokens + +def group_texts(examples, block_size): + # Concatenate all texts. + concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict. + # We could add padding if the model supported it instead of this drop, you can customize this part to your needs. + total_length = (total_length // block_size) * block_size + # Split by chunks of max_len. 
+ result = { + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + + +def create_datasets(tokenizer, args): + dataset = load_dataset( + args.dataset_name, + args.dataset_config_name, + use_auth_token=True, + num_proc=args.num_workers, + ) + train_dataset = dataset["train"] + valid_dataset = dataset["validation"] + column_names = train_dataset.features + + def tokenize_function(example): + output_texts = [] + for i in range(len(example["input"])): + if 'gov_report' in args.dataset_config_name: + output_texts.append( + f"### Summarize the following text:\n {example['input'][i]}\n ### Summary:\n {example['output'][i]}{tokenizer.eos_token}" + ) + else: + output_texts.append( + f"### {example['input'][i]}\n ### The answer is:\n {example['output'][i]}{tokenizer.eos_token}" + ) + input_ids = tokenizer( + output_texts, padding="max_length", max_length=8192 + ).input_ids + + return {"input_ids": input_ids} + + train_dataset = train_dataset.map( + tokenize_function, + batched=True, + num_proc=8, + remove_columns=column_names, + ) + valid_dataset = valid_dataset.map( + tokenize_function, + batched=True, + num_proc=2, + remove_columns=column_names, + ) + + def filter_function(example): + to_keep = [] + for i in range(len(example["input_ids"])): + if len(example["input_ids"][i]) > args.max_seq_length: + to_keep.append(False) + else: + to_keep.append(True) + return to_keep + + train_dataset = train_dataset.filter( + filter_function, + batched=True, + # with_indices=True, + num_proc=8, + # remove_columns=column_names, + ) + valid_dataset = valid_dataset.filter( + filter_function, + batched=True, + # with_indices=True, + num_proc=8, + # remove_columns=column_names, + ) + + if args.use_flash_attn: + packing_method = partial(group_texts, block_size=args.max_seq_length) + # Packing + train_dataset = train_dataset.map( + packing_method, + batched=True, + num_proc=8, + ) + valid_dataset = valid_dataset.map( + packing_method, + batched=True, + num_proc=8, + ) + + print( + f"Size of the train set: {len(train_dataset)}. 
Size of the validation set: {len(valid_dataset)}" + ) + + return train_dataset, valid_dataset + + +def create_and_prepare_model(args): + device_map = None + + model = AutoModelForCausalLM.from_pretrained( + args.model_name, + device_map=device_map, + use_cache=not args.use_gradient_checkpointing, + trust_remote_code=True, + use_flash_attention_2=True if args.use_flash_attn else False, + torch_dtype=torch.bfloat16, + ) + + peft_config = None + if args.use_peft_lora: + peft_config = LoraConfig( + lora_alpha=args.lora_alpha, + lora_dropout=args.lora_dropout, + r=args.lora_r, + bias="none", + task_type="CAUSAL_LM", + target_modules=None + if args.lora_target_modules is None + else args.lora_target_modules.split(","), + ) + if args.use_gradient_checkpointing: + model.gradient_checkpointing_enable() + model = get_peft_model(model, peft_config) + model.print_trainable_parameters() + + tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True) + tokenizer.pad_token = tokenizer.eos_token + + return model, peft_config, tokenizer + + +def peft_module_casting_to_bf16(model, args): + for name, module in model.named_modules(): + if isinstance(module, LoraLayer): + if args.bf16: + module = module.to(torch.bfloat16) + if "norm" in name: + module = module.to(torch.float32) + if any(x in name for x in ["lm_head", "embed_tokens", "wte", "wpe"]): + if hasattr(module, "weight"): + if args.bf16 and module.weight.dtype == torch.float32: + module = module.to(torch.bfloat16) From 6a6ca47da5aa2319aaa6bdf61bc7870f6a3da180 Mon Sep 17 00:00:00 2001 From: itayhubara Date: Thu, 1 Feb 2024 14:44:58 +0200 Subject: [PATCH 02/16] (a) fixing padding issue; (b) masking input tokens for eval dataset; (c) adding support for mlloger --- llm_finetune/scripts/mlperf_logging_utils.py | 103 +++++++++++++++++++ llm_finetune/scripts/train.py | 18 +++- llm_finetune/scripts/utils.py | 56 ++++++---- 3 files changed, 154 insertions(+), 23 deletions(-) create mode 100644 llm_finetune/scripts/mlperf_logging_utils.py diff --git a/llm_finetune/scripts/mlperf_logging_utils.py b/llm_finetune/scripts/mlperf_logging_utils.py new file mode 100644 index 000000000..e725e23b8 --- /dev/null +++ b/llm_finetune/scripts/mlperf_logging_utils.py @@ -0,0 +1,103 @@ +import os +from mlperf_logging import mllog +from mlperf_logging.mllog import constants +import torch +import torch.distributed as dist +from transformers import TrainingArguments, TrainerCallback, TrainerState, TrainerControl + + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + +def barrier(): + if not is_dist_avail_and_initialized(): + return + torch.distributed.barrier() + +class LoraLogger: + def __init__(self,target_eval_loss=None, filename=None, default_stack_offset=2): + self.mllogger = mllog.get_mllogger() + mllog.config(default_stack_offset=default_stack_offset, + filename=(filename or os.getenv("COMPLIANCE_FILE") or "mlperf_compliance.log"), + root_dir=os.path.normpath(os.path.dirname(os.path.realpath(__file__)))) + self.target_eval_loss=target_eval_loss + + @property + def rank(self): + return get_rank() + + def event(self, key, value=None, metadata=None, sync=False, log_rank=None): + log_rank = self.rank==0 if log_rank is None else log_rank + if sync: + barrier() + if log_rank: + self.mllogger.event(key=key, value=value, metadata=metadata) + + def start(self, key, 
value=None, metadata=None, sync=False, log_rank=None): + log_rank = self.rank==0 if log_rank is None else log_rank + if sync: + barrier() + if log_rank: + self.mllogger.start(key=key, value=value, metadata=metadata) + + def end(self, key, value=None, metadata=None, sync=False, log_rank=None): + log_rank = self.rank==0 if log_rank is None else log_rank + if sync: + barrier() + if log_rank: + self.mllogger.end(key=key, value=value, metadata=metadata) + +def submission_info(mllogger: LoraLogger, + submission_benchmark: str, submission_division: str, submission_org: str, submission_platform: str, + submission_poc_name: str, submission_poc_email: str, submission_status: str): + """Logs required for a valid MLPerf submission.""" + mllogger.event(key=constants.SUBMISSION_BENCHMARK, value=submission_benchmark) + mllogger.event(key=constants.SUBMISSION_DIVISION, value=submission_division) + mllogger.event(key=constants.SUBMISSION_ORG, value=submission_org) + mllogger.event(key=constants.SUBMISSION_PLATFORM, value=submission_platform) + mllogger.event(key=constants.SUBMISSION_POC_NAME, value=submission_poc_name) + mllogger.event(key=constants.SUBMISSION_POC_EMAIL, value=submission_poc_email) + mllogger.event(key=constants.SUBMISSION_STATUS, value=submission_status) + +class MLPerfCallback(TrainerCallback): + "A callback that prints a message at the beginning of training" + def __init__(self,logger): + super().__init__() + self.mllogger = logger + + def on_train_begin(self, args, state, control, **kwargs): + self.mllogger.start(constants.RUN_START,value='') + + def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called at the beginning of a training step. If using gradient accumulation, one training step might take + several inputs. 
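+        For this benchmark, the callback also emits MLPerf train_loss/eval_loss events
+        and requests an early stop once the target eval loss is reached (see the checks
+        below).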
+ """ + if state.global_step % (state.logging_steps) == 0 and state.global_step > 0: + self.mllogger.event('train_loss',value=state.log_history[-1]['loss'],metadata={"steps":state.log_history[-1]['step']}) + control.should_log = True + + if state.global_step % (state.eval_steps) == 0 and state.global_step>0: + self.mllogger.event('eval_loss',value=state.log_history[-1]['eval_loss'],metadata={"steps":state.log_history[-1]['step']}) + control.should_log = True + print(self.mllogger.target_eval_loss) + eval_loss_list=[sl['eval_loss'] for sl in state.log_history if 'eval_loss' in sl] + if eval_loss_list and eval_loss_list[-1]<=self.mllogger.target_eval_loss: + control.should_training_stop = True + self.mllogger.end(constants.RUN_STOP,value=eval_loss_list[-1],metadata={"steps":state.log_history[-1]['step'],"status": 'success'}) + if state.global_step >= state.max_steps: + control.should_training_stop = True + self.mllogger.end(constants.RUN_STOP,value=eval_loss_list[-1],metadata={"steps":state.log_history[-1]['step'],"status": 'fail'}) + + return control + diff --git a/llm_finetune/scripts/train.py b/llm_finetune/scripts/train.py index cca88ebe8..6b81a6a0b 100644 --- a/llm_finetune/scripts/train.py +++ b/llm_finetune/scripts/train.py @@ -20,6 +20,7 @@ import os from transformers import HfArgumentParser, TrainingArguments, Trainer from transformers.modeling_utils import unwrap_model +from mlperf_logging_utils import MLPerfCallback,LoraLogger,submission_info from utils import ( create_and_prepare_model, create_datasets, @@ -100,9 +101,12 @@ class ScriptArguments: save_steps: int = field( default=10, metadata={"help": "Save checkpoint every X updates steps."} ) - eval_steps: int = field(default=10, metadata={"help": "Eval model every X steps."}) + eval_steps: int = field(default=24, metadata={"help": "Eval model every X steps."}) logging_steps: int = field( - default=10, metadata={"help": "Log every X updates steps."} + default=6, metadata={"help": "Log every X updates steps."} + ) + target_eval_loss: float = field( + default=1.19, metadata={"help": "target eval loss - NOT FINAL."} ) output_dir: str = field( default="results", metadata={"help": "Where to store the final model."} @@ -180,6 +184,15 @@ def main(args): # datasets train_dataset, eval_dataset = create_datasets(tokenizer, args) + loralogger=LoraLogger(target_eval_loss=args.target_eval_loss) + submission_info(loralogger, + submission_benchmark="llm-finetuning", + submission_division="Closed", + submission_org="referece", + submission_platform="referece", + submission_poc_name="referece", + submission_poc_email="referece", + submission_status="referece") # trainer trainer = Trainer( @@ -187,6 +200,7 @@ def main(args): args=training_arguments, train_dataset=train_dataset, eval_dataset=eval_dataset, + callbacks=[MLPerfCallback(loralogger)], ) trainer.accelerator.print(f"{trainer.model}") if args.use_peft_lora: diff --git a/llm_finetune/scripts/utils.py b/llm_finetune/scripts/utils.py index 8a00687bc..1747eb03b 100644 --- a/llm_finetune/scripts/utils.py +++ b/llm_finetune/scripts/utils.py @@ -158,22 +158,34 @@ def create_datasets(tokenizer, args): valid_dataset = dataset["validation"] column_names = train_dataset.features - def tokenize_function(example): + def tokenize_function(example,eval=False): output_texts = [] + mask_labels_sizes=[] for i in range(len(example["input"])): if 'gov_report' in args.dataset_config_name: output_texts.append( f"### Summarize the following text:\n {example['input'][i]}\n ### Summary:\n 
{example['output'][i]}{tokenizer.eos_token}" ) + if eval: + mask_labels_sizes.append(f"### Summarize the following text:\n {example['input'][i]}\n ### Summary:\n") else: output_texts.append( f"### {example['input'][i]}\n ### The answer is:\n {example['output'][i]}{tokenizer.eos_token}" ) - input_ids = tokenizer( - output_texts, padding="max_length", max_length=8192 - ).input_ids + + input_ids = tokenizer(output_texts).input_ids - return {"input_ids": input_ids} + if eval: + labels_ids = tokenizer(mask_labels_sizes).input_ids + masked_labels=[] + for out,lb in zip(input_ids,labels_ids): + ml=out.copy() + ml[:len(lb)]=[-100]*len(lb) + ml[-1]=-100 + masked_labels.append(ml) + return {"input_ids": input_ids,"labels": masked_labels} + else: + return {"input_ids": input_ids} train_dataset = train_dataset.map( tokenize_function, @@ -182,7 +194,7 @@ def tokenize_function(example): remove_columns=column_names, ) valid_dataset = valid_dataset.map( - tokenize_function, + partial(tokenize_function,eval=True), batched=True, num_proc=2, remove_columns=column_names, @@ -208,23 +220,25 @@ def filter_function(example): filter_function, batched=True, # with_indices=True, - num_proc=8, + num_proc=2, # remove_columns=column_names, ) - - if args.use_flash_attn: - packing_method = partial(group_texts, block_size=args.max_seq_length) - # Packing - train_dataset = train_dataset.map( - packing_method, - batched=True, - num_proc=8, - ) - valid_dataset = valid_dataset.map( - packing_method, - batched=True, - num_proc=8, - ) + print( + f"Before packing, Size of the train set: {len(train_dataset)}. Size of the validation set: {len(valid_dataset)}" + ) + + packing_method = partial(group_texts, block_size=args.max_seq_length) + # Packing + train_dataset = train_dataset.map( + packing_method, + batched=True, + num_proc=8, + ) + valid_dataset = valid_dataset.map( + packing_method, + batched=True, + num_proc=2, + ) print( f"Size of the train set: {len(train_dataset)}. 
Size of the validation set: {len(valid_dataset)}"

From 87992caa82add6a468f45ec599545520a37db90d Mon Sep 17 00:00:00 2001
From: itayhubara
Date: Thu, 1 Feb 2024 18:28:24 +0200
Subject: [PATCH 03/16] fix masking bug

---
 llm_finetune/scripts/train.py | 2 +-
 llm_finetune/scripts/utils.py | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/llm_finetune/scripts/train.py b/llm_finetune/scripts/train.py
index 6b81a6a0b..ecf2b8b5a 100644
--- a/llm_finetune/scripts/train.py
+++ b/llm_finetune/scripts/train.py
@@ -106,7 +106,7 @@ class ScriptArguments:
         default=6, metadata={"help": "Log every X update steps."}
     )
     target_eval_loss: float = field(
-        default=1.19, metadata={"help": "target eval loss - NOT FINAL."}
+        default=0.92, metadata={"help": "target eval loss - NOT FINAL."}
     )
     output_dir: str = field(
         default="results", metadata={"help": "Where to store the final model."}
diff --git a/llm_finetune/scripts/utils.py b/llm_finetune/scripts/utils.py
index 1747eb03b..c376a6c8b 100644
--- a/llm_finetune/scripts/utils.py
+++ b/llm_finetune/scripts/utils.py
@@ -143,7 +143,8 @@ def group_texts(examples, block_size):
         k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
         for k, t in concatenated_examples.items()
     }
-    result["labels"] = result["input_ids"].copy()
+    if 'labels' not in result:
+        result["labels"] = result["input_ids"].copy()
     return result

From 11e47c478f9f42790a832e39178b4f038ec97ead Mon Sep 17 00:00:00 2001
From: itayhubara
Date: Mon, 5 Feb 2024 16:01:29 +0200
Subject: [PATCH 04/16] adding more logger support

---
 llm_finetune/README.md                       |  6 +++-
 llm_finetune/scripts/mlperf_logging_utils.py | 30 +++++++++++++++-----
 llm_finetune/scripts/train.py                | 26 ++++++++++-------
 llm_finetune/scripts/utils.py                |  5 ++++
 4 files changed, 48 insertions(+), 19 deletions(-)

diff --git a/llm_finetune/README.md b/llm_finetune/README.md
index 91971dc10..8a769c3b8 100644
--- a/llm_finetune/README.md
+++ b/llm_finetune/README.md
@@ -35,7 +35,11 @@ Then, log in to your Hugging Face account with a read token by running:
 ```
 huggingface-cli login
 ```
-
+Finally, install the MLPerf logger:
+```
+git clone https://github.com/mlperf/logging.git mlperf-logging
+pip install -e mlperf-logging
+```

 ## Llama2-70B on 8 devices

diff --git a/llm_finetune/scripts/mlperf_logging_utils.py b/llm_finetune/scripts/mlperf_logging_utils.py
index e725e23b8..ffdca43fe 100644
--- a/llm_finetune/scripts/mlperf_logging_utils.py
+++ b/llm_finetune/scripts/mlperf_logging_utils.py
@@ -61,13 +61,29 @@ def submission_info(mllogger: LoraLogger,
                     submission_benchmark: str, submission_division: str, submission_org: str, submission_platform: str,
                     submission_poc_name: str, submission_poc_email: str, submission_status: str):
     """Logs required for a valid MLPerf submission."""
-    mllogger.event(key=constants.SUBMISSION_BENCHMARK, value=submission_benchmark)
-    mllogger.event(key=constants.SUBMISSION_DIVISION, value=submission_division)
-    mllogger.event(key=constants.SUBMISSION_ORG, value=submission_org)
-    mllogger.event(key=constants.SUBMISSION_PLATFORM, value=submission_platform)
-    mllogger.event(key=constants.SUBMISSION_POC_NAME, value=submission_poc_name)
-    mllogger.event(key=constants.SUBMISSION_POC_EMAIL, value=submission_poc_email)
-    mllogger.event(key=constants.SUBMISSION_STATUS, value=submission_status)
+    if mllogger.rank==0:
+        mllogger.event(key=constants.SUBMISSION_BENCHMARK, value=submission_benchmark)
+        mllogger.event(key=constants.SUBMISSION_DIVISION, value=submission_division)
+
mllogger.event(key=constants.SUBMISSION_ORG, value=submission_org) + mllogger.event(key=constants.SUBMISSION_PLATFORM, value=submission_platform) + mllogger.event(key=constants.SUBMISSION_POC_NAME, value=submission_poc_name) + mllogger.event(key=constants.SUBMISSION_POC_EMAIL, value=submission_poc_email) + mllogger.event(key=constants.SUBMISSION_STATUS, value=submission_status) + +def general_info(mllogger: LoraLogger,args,world_size,eval_samples,train_samples): + if mllogger.rank==0: + mllogger.event(key=constants.GLOBAL_BATCH_SIZE, value=args.per_device_train_batch_size*args.gradient_accumulation_steps*world_size) + mllogger.event(key=constants.TRAIN_SAMPLES, value=train_samples) + mllogger.event(key=constants.EVAL_SAMPLES, value=eval_samples) + mllogger.event(key=constants.SEED, value=args.seed) + +def optimization_info(mllogger: LoraLogger,args): + """Logs required for a valid MLPerf submission.""" + if mllogger.rank==0: + mllogger.event(key=constants.OPT_LR_WARMUP_FACTOR, value=args.warmup_ratio) + mllogger.event(key=constants.OPT_LR_TRAINING_STEPS, value=args.max_steps) + mllogger.event(key=constants.OPT_BASE_LR, value=args.learning_rate) + #mllogger.event(key=constants.OPT_LAMB_BETA_1, value=args.beta1) class MLPerfCallback(TrainerCallback): "A callback that prints a message at the beginning of training" diff --git a/llm_finetune/scripts/train.py b/llm_finetune/scripts/train.py index ecf2b8b5a..0e89e14e8 100644 --- a/llm_finetune/scripts/train.py +++ b/llm_finetune/scripts/train.py @@ -20,10 +20,11 @@ import os from transformers import HfArgumentParser, TrainingArguments, Trainer from transformers.modeling_utils import unwrap_model -from mlperf_logging_utils import MLPerfCallback,LoraLogger,submission_info +from mlperf_logging_utils import MLPerfCallback,LoraLogger,submission_info,general_info,optimization_info from utils import ( create_and_prepare_model, create_datasets, + world_size_from_yaml, SaveDeepSpeedPeftModelCallback, peft_module_casting_to_bf16, ) @@ -146,6 +147,15 @@ class ScriptArguments: def main(args): + loralogger=LoraLogger(target_eval_loss=args.target_eval_loss) + submission_info(loralogger, + submission_benchmark="llm-finetuning", + submission_division="Closed", + submission_org="referece", + submission_platform="referece", + submission_poc_name="referece", + submission_poc_email="referece", + submission_status="referece") # training arguments is_deepspeed_peft_enabled = ( os.environ.get("ACCELERATE_USE_DEEPSPEED", "False").lower() == "true" @@ -176,23 +186,17 @@ def main(args): report_to="tensorboard", seed=args.seed, ) - + # model model, peft_config, tokenizer = create_and_prepare_model(args) model.config.use_cache = False # datasets train_dataset, eval_dataset = create_datasets(tokenizer, args) + world_size = world_size_from_yaml(args.config_path) + general_info(loralogger,args,world_size=world_size,eval_samples=len(eval_dataset),train_samples=len(train_dataset)) + optimization_info(loralogger,args) - loralogger=LoraLogger(target_eval_loss=args.target_eval_loss) - submission_info(loralogger, - submission_benchmark="llm-finetuning", - submission_division="Closed", - submission_org="referece", - submission_platform="referece", - submission_poc_name="referece", - submission_poc_email="referece", - submission_status="referece") # trainer trainer = Trainer( diff --git a/llm_finetune/scripts/utils.py b/llm_finetune/scripts/utils.py index c376a6c8b..a5c211e23 100644 --- a/llm_finetune/scripts/utils.py +++ b/llm_finetune/scripts/utils.py @@ -5,6 +5,7 @@ from tqdm 
import tqdm import warnings from peft import LoraConfig, get_peft_model,prepare_model_for_kbit_training +import yaml from peft.tuners.lora import LoraLayer from transformers import ( @@ -247,6 +248,10 @@ def filter_function(example): return train_dataset, valid_dataset +def world_size_from_yaml(yaml_path): + with open(yaml_path, 'r') as f: + data = yaml.load(f, Loader=yaml.SafeLoader) + return data['num_machines']*data['num_processes'] def create_and_prepare_model(args): device_map = None From 8f791c7e480f45c9ef0af9fb893b4f37c4b0f6e8 Mon Sep 17 00:00:00 2001 From: itayhubara Date: Mon, 5 Feb 2024 16:43:40 +0200 Subject: [PATCH 05/16] bug fix --- llm_finetune/scripts/train.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llm_finetune/scripts/train.py b/llm_finetune/scripts/train.py index 0e89e14e8..3802dcb34 100644 --- a/llm_finetune/scripts/train.py +++ b/llm_finetune/scripts/train.py @@ -67,6 +67,10 @@ class ScriptArguments: default="tau/scrolls", metadata={"help": "The preference dataset to use."}, ) + config_path: Optional[str] = field( + default="./configs/default_config.yaml", + metadata={"help": "path to model config"}, + ) num_train_epochs: Optional[int] = field( default=1, metadata={"help": "The number of training epochs for the reward model."}, From efd899b2a92ff052645cbc015757ad78925220a1 Mon Sep 17 00:00:00 2001 From: itayhubara Date: Tue, 6 Feb 2024 15:17:23 +0200 Subject: [PATCH 06/16] fix logging bug and update HP --- llm_finetune/run_llama_70B_scrolls_r16.sh | 10 +++++----- llm_finetune/scripts/mlperf_logging_utils.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llm_finetune/run_llama_70B_scrolls_r16.sh b/llm_finetune/run_llama_70B_scrolls_r16.sh index d4f43689b..e1cb15f37 100644 --- a/llm_finetune/run_llama_70B_scrolls_r16.sh +++ b/llm_finetune/run_llama_70B_scrolls_r16.sh @@ -3,22 +3,22 @@ accelerate launch --config_file configs/default_config.yaml scripts/train.py \ --dataset_name "tau/scrolls" --dataset_config_name "gov_report" \ --max_seq_len 8192 \ --bf16 True \ ---logging_steps 1 \ ---eval_steps 22 \ ---save_steps 22 \ +--logging_steps 6 \ +--eval_steps 24 \ +--save_steps 24 \ --output_dir "./results/llama-70b_scrolls_gov_report_r16_$1" \ --per_device_train_batch_size 1 \ --gradient_accumulation_steps 1 \ --dataset_text_field "input" \ --lr_scheduler_type "cosine" \ ---learning_rate 1e-3 \ +--learning_rate 7e-4 \ --warmup_ratio 0.03 \ --use_gradient_checkpointing True \ --use_peft_lora True \ --lora_r 16 \ --lora_alpha 32 \ --lora_dropout 0.1 \ ---max_steps 440 \ +--max_steps 288 \ --use_flash_attn \ --seed "$1" \ --lora_target_modules "q_proj,v_proj,k_proj,o_proj" \ No newline at end of file diff --git a/llm_finetune/scripts/mlperf_logging_utils.py b/llm_finetune/scripts/mlperf_logging_utils.py index ffdca43fe..ca9298734 100644 --- a/llm_finetune/scripts/mlperf_logging_utils.py +++ b/llm_finetune/scripts/mlperf_logging_utils.py @@ -99,7 +99,7 @@ def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: T Event called at the beginning of a training step. If using gradient accumulation, one training step might take several inputs. 
""" - if state.global_step % (state.logging_steps) == 0 and state.global_step > 0: + if state.global_step % (state.logging_steps) == 0 and state.global_step > 0 and not state.global_step % (state.eval_steps) == 0: self.mllogger.event('train_loss',value=state.log_history[-1]['loss'],metadata={"steps":state.log_history[-1]['step']}) control.should_log = True From 8a9668f7e7d3d548304bbd06888ac89aeb39e3dc Mon Sep 17 00:00:00 2001 From: itayhubara Date: Mon, 19 Feb 2024 16:56:51 +0200 Subject: [PATCH 07/16] adding patch for memmory issue and fused model enablement --- llm_finetune/README.md | 2 + llm_finetune/scripts/mlperf_logging_utils.py | 2 +- llm_finetune/scripts/train.py | 19 +++-- llm_finetune/scripts/utils.py | 89 +++++++++++++++++++- 4 files changed, 104 insertions(+), 8 deletions(-) diff --git a/llm_finetune/README.md b/llm_finetune/README.md index 8a769c3b8..2ee454b01 100644 --- a/llm_finetune/README.md +++ b/llm_finetune/README.md @@ -40,6 +40,8 @@ Finally please install mlperf logger: git clone https://github.com/mlperf/logging.git mlperf-logging pip install -e mlperf-logging ``` +## Download Data +data can be downloaded from [mlperf drive](https://drive.google.com/drive/folders/1sfnK9m5FSQrWMqI2dajNTX2dxlJegR94) ## Llama2-70B on 8 devices diff --git a/llm_finetune/scripts/mlperf_logging_utils.py b/llm_finetune/scripts/mlperf_logging_utils.py index ca9298734..b752473ac 100644 --- a/llm_finetune/scripts/mlperf_logging_utils.py +++ b/llm_finetune/scripts/mlperf_logging_utils.py @@ -106,7 +106,7 @@ def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: T if state.global_step % (state.eval_steps) == 0 and state.global_step>0: self.mllogger.event('eval_loss',value=state.log_history[-1]['eval_loss'],metadata={"steps":state.log_history[-1]['step']}) control.should_log = True - print(self.mllogger.target_eval_loss) + eval_loss_list=[sl['eval_loss'] for sl in state.log_history if 'eval_loss' in sl] if eval_loss_list and eval_loss_list[-1]<=self.mllogger.target_eval_loss: control.should_training_stop = True diff --git a/llm_finetune/scripts/train.py b/llm_finetune/scripts/train.py index 3802dcb34..4ad8275b1 100644 --- a/llm_finetune/scripts/train.py +++ b/llm_finetune/scripts/train.py @@ -21,10 +21,13 @@ from transformers import HfArgumentParser, TrainingArguments, Trainer from transformers.modeling_utils import unwrap_model from mlperf_logging_utils import MLPerfCallback,LoraLogger,submission_info,general_info,optimization_info +from datasets import load_dataset +import numpy as np +import functools from utils import ( create_and_prepare_model, - create_datasets, world_size_from_yaml, + training_step, SaveDeepSpeedPeftModelCallback, peft_module_casting_to_bf16, ) @@ -63,9 +66,9 @@ class ScriptArguments: }, ) - dataset_name: Optional[str] = field( - default="tau/scrolls", - metadata={"help": "The preference dataset to use."}, + dataset_path: Optional[str] = field( + default='./dataset.npy', + metadata={"help": "The path to the downloaded dataset."}, ) config_path: Optional[str] = field( default="./configs/default_config.yaml", @@ -160,6 +163,7 @@ def main(args): submission_poc_name="referece", submission_poc_email="referece", submission_status="referece") + # training arguments is_deepspeed_peft_enabled = ( os.environ.get("ACCELERATE_USE_DEEPSPEED", "False").lower() == "true" @@ -196,7 +200,11 @@ def main(args): model.config.use_cache = False # datasets - train_dataset, eval_dataset = create_datasets(tokenizer, args) + #dataset = 
load_dataset("regisss/scrolls_gov_report_preprocessed_mlperf_2") + dataset = np.load(args.dataset_path,allow_pickle=True).tolist() + train_dataset, eval_dataset = dataset["train"], dataset["validation"] + #train_dataset, eval_dataset = create_datasets(tokenizer, args) + world_size = world_size_from_yaml(args.config_path) general_info(loralogger,args,world_size=world_size,eval_samples=len(eval_dataset),train_samples=len(train_dataset)) optimization_info(loralogger,args) @@ -210,6 +218,7 @@ def main(args): eval_dataset=eval_dataset, callbacks=[MLPerfCallback(loralogger)], ) + trainer.training_step = functools.partial(training_step, trainer) trainer.accelerator.print(f"{trainer.model}") if args.use_peft_lora: trainer.model.print_trainable_parameters() diff --git a/llm_finetune/scripts/utils.py b/llm_finetune/scripts/utils.py index a5c211e23..53989510c 100644 --- a/llm_finetune/scripts/utils.py +++ b/llm_finetune/scripts/utils.py @@ -6,6 +6,8 @@ import warnings from peft import LoraConfig, get_peft_model,prepare_model_for_kbit_training import yaml +from deepspeed.accelerator import get_accelerator + from peft.tuners.lora import LoraLayer from transformers import ( @@ -17,9 +19,27 @@ TrainerState, TrainerControl, ) +from transformers.utils import ( + is_sagemaker_mp_enabled, + is_apex_available, + #get_accelerator, +) +if is_apex_available(): + from apex import amp + +if is_sagemaker_mp_enabled(): + import smdistributed.modelparallel.torch as smp + from smdistributed.modelparallel import __version__ as SMP_VERSION + + IS_SAGEMAKER_MP_POST_1_10 = version.parse(SMP_VERSION) >= version.parse("1.10") + + from .trainer_pt_utils import smp_forward_backward, smp_forward_only, smp_gather, smp_nested_concat +else: + IS_SAGEMAKER_MP_POST_1_10 = False + from itertools import chain from functools import partial - +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union class SaveDeepSpeedPeftModelCallback(TrainerCallback): def __init__(self, trainer, save_steps=500): @@ -257,7 +277,7 @@ def create_and_prepare_model(args): device_map = None model = AutoModelForCausalLM.from_pretrained( - args.model_name, + 'regisss/llama2-70b-fused-qkv-mlperf', device_map=device_map, use_cache=not args.use_gradient_checkpointing, trust_remote_code=True, @@ -287,6 +307,71 @@ def create_and_prepare_model(args): return model, peft_config, tokenizer +def training_step(self, model: torch.nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor: + """ + Perform a training step on a batch of inputs. + Subclass and override to inject custom behavior. + Args: + model (`nn.Module`): + The model to train. + inputs (`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument `labels`. Check your model's documentation for all accepted arguments. + Return: + `torch.Tensor`: The tensor with training loss on this batch. 
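+        Note: this mirrors the stock Trainer.training_step, with one addition:
+        get_accelerator().empty_cache() is called after the backward pass to lower
+        peak device memory (the memory issue this patch addresses).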
+ """ + model.train() + inputs = self._prepare_inputs(inputs) + if is_sagemaker_mp_enabled(): + loss_mb = smp_forward_backward(model, inputs, self.args.gradient_accumulation_steps) + return loss_mb.reduce_mean().detach().to(self.args.device) + with self.compute_loss_context_manager(): + loss = self.compute_loss(model, inputs) + if self.args.n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu parallel training + if self.use_apex: + with amp.scale_loss(loss, self.optimizer) as scaled_loss: + scaled_loss.backward() + else: + self.accelerator.backward(loss) + get_accelerator().empty_cache() + return loss.detach() / self.args.gradient_accumulation_steps + + +def create_and_prepare_model_unfuse(args): + device_map = None + + model = AutoModelForCausalLM.from_pretrained( + args.model_name, + device_map=device_map, + use_cache=not args.use_gradient_checkpointing, + trust_remote_code=True, + use_flash_attention_2=True if args.use_flash_attn else False, + torch_dtype=torch.bfloat16, + ) + + peft_config = None + if args.use_peft_lora: + peft_config = LoraConfig( + lora_alpha=args.lora_alpha, + lora_dropout=args.lora_dropout, + r=args.lora_r, + bias="none", + task_type="CAUSAL_LM", + target_modules=None + if args.lora_target_modules is None + else args.lora_target_modules.split(","), + ) + if args.use_gradient_checkpointing: + model.gradient_checkpointing_enable() + model = get_peft_model(model, peft_config) + model.print_trainable_parameters() + + tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True) + tokenizer.pad_token = tokenizer.eos_token + + return model, peft_config, tokenizer def peft_module_casting_to_bf16(model, args): for name, module in model.named_modules(): From 216516317ffa0a5f2db43e92cfe0ccbc49d4e0ac Mon Sep 17 00:00:00 2001 From: itayhubara Date: Wed, 21 Feb 2024 18:35:22 +0200 Subject: [PATCH 08/16] fixing dataset and model links and updating bash script and readme --- llm_finetune/README.md | 8 ++++++-- llm_finetune/run_llama_70B_scrolls_r16.sh | 20 +++++++++++--------- llm_finetune/scripts/train.py | 16 ++++++++++++---- llm_finetune/scripts/utils.py | 2 +- 4 files changed, 30 insertions(+), 16 deletions(-) diff --git a/llm_finetune/README.md b/llm_finetune/README.md index 2ee454b01..2caddad2f 100644 --- a/llm_finetune/README.md +++ b/llm_finetune/README.md @@ -40,8 +40,12 @@ Finally please install mlperf logger: git clone https://github.com/mlperf/logging.git mlperf-logging pip install -e mlperf-logging ``` -## Download Data -data can be downloaded from [mlperf drive](https://drive.google.com/drive/folders/1sfnK9m5FSQrWMqI2dajNTX2dxlJegR94) +## Download Data and Model +data can be downloaded from: +[mlperf drive - train data](https://drive.google.com/file/d/1-JgY1mEafcJ7qhggt6UR3OEKAciIPd5s/view?usp=sharing) +[mlperf drive - validation data](https://drive.google.com/file/d/1jrm6Lacrq49AYv0uB_Qy22xRmfPixQvs/view?usp=sharing) +[mlperf drive - llama-v2 model](https://drive.google.com/drive/folders/1sTeuxkPhwkNPKIPFnOLIYCcK53oB3Ypc?usp=sharing) +As defaults the scripts assume the model is under at ```./llama-v2-fused-qkv``` and the both train and validation are under ```dataset``` folder. 
## Llama2-70B on 8 devices diff --git a/llm_finetune/run_llama_70B_scrolls_r16.sh b/llm_finetune/run_llama_70B_scrolls_r16.sh index e1cb15f37..759b9aeb4 100644 --- a/llm_finetune/run_llama_70B_scrolls_r16.sh +++ b/llm_finetune/run_llama_70B_scrolls_r16.sh @@ -1,24 +1,26 @@ accelerate launch --config_file configs/default_config.yaml scripts/train.py \ --model_name meta-llama/Llama-2-70b-hf \ ---dataset_name "tau/scrolls" --dataset_config_name "gov_report" \ +--dataset_path "./dataset" \ +--model_path "./llama-v2-fused-qkv" \ --max_seq_len 8192 \ --bf16 True \ ---logging_steps 6 \ ---eval_steps 24 \ ---save_steps 24 \ +--logging_steps 2 \ +--eval_steps 6 \ +--save_steps 999 \ --output_dir "./results/llama-70b_scrolls_gov_report_r16_$1" \ --per_device_train_batch_size 1 \ --gradient_accumulation_steps 1 \ --dataset_text_field "input" \ --lr_scheduler_type "cosine" \ ---learning_rate 7e-4 \ ---warmup_ratio 0.03 \ +--learning_rate 5e-4 \ +--warmup_ratio 0 \ --use_gradient_checkpointing True \ +--target_eval_loss 0.925 \ --use_peft_lora True \ --lora_r 16 \ ---lora_alpha 32 \ +--lora_alpha 16 \ --lora_dropout 0.1 \ ---max_steps 288 \ +--max_steps 800 \ --use_flash_attn \ --seed "$1" \ ---lora_target_modules "q_proj,v_proj,k_proj,o_proj" \ No newline at end of file +--lora_target_modules "qkv_proj,o_proj" diff --git a/llm_finetune/scripts/train.py b/llm_finetune/scripts/train.py index 4ad8275b1..a5c17ac20 100644 --- a/llm_finetune/scripts/train.py +++ b/llm_finetune/scripts/train.py @@ -65,7 +65,12 @@ class ScriptArguments: "help": "The model that you want to train from the Hugging Face hub. E.g. gpt2, gpt2-xl, bert, etc." }, ) - + model_path: Optional[str] = field( + default='./llama-v2-fused-qkv', + metadata={ + "help": "Path to the model directory." + }, + ) dataset_path: Optional[str] = field( default='./dataset.npy', metadata={"help": "The path to the downloaded dataset."}, @@ -200,10 +205,13 @@ def main(args): model.config.use_cache = False # datasets - #dataset = load_dataset("regisss/scrolls_gov_report_preprocessed_mlperf_2") - dataset = np.load(args.dataset_path,allow_pickle=True).tolist() + ## ToDo uncomment once drive goes public + #train_url = "https://drive.google.com/file/d/1-JgY1mEafcJ7qhggt6UR3OEKAciIPd5s/view?usp=sharing" + #eval_url = "https://drive.google.com/file/d/1jrm6Lacrq49AYv0uB_Qy22xRmfPixQvs/view?usp=sharing" + #dataset = load_dataset("parquet", data_files={'train': train_url, 'validation': eval_url}) + dataset = load_dataset("parquet", data_files={'train': 'dataset/train-00000-of-00001.parquet', 'validation': 'dataset/validation-00000-of-00001.parquet'}) train_dataset, eval_dataset = dataset["train"], dataset["validation"] - #train_dataset, eval_dataset = create_datasets(tokenizer, args) + world_size = world_size_from_yaml(args.config_path) general_info(loralogger,args,world_size=world_size,eval_samples=len(eval_dataset),train_samples=len(train_dataset)) diff --git a/llm_finetune/scripts/utils.py b/llm_finetune/scripts/utils.py index 53989510c..771c9753e 100644 --- a/llm_finetune/scripts/utils.py +++ b/llm_finetune/scripts/utils.py @@ -277,7 +277,7 @@ def create_and_prepare_model(args): device_map = None model = AutoModelForCausalLM.from_pretrained( - 'regisss/llama2-70b-fused-qkv-mlperf', + args.model_path, device_map=device_map, use_cache=not args.use_gradient_checkpointing, trust_remote_code=True, From efdcd1865beb4ab370a6ff99d3411488347ca081 Mon Sep 17 00:00:00 2001 From: Michal Futrega Date: Thu, 22 Feb 2024 16:04:33 +0100 Subject: [PATCH 09/16] Fix eval batch 
size, add Dockerfile, improve logging, remove unused code --- llm_finetune/requirements.txt | 9 - llm_finetune/scripts/eval.py | 190 --------- llm_finetune/scripts/mlperf_logging_utils.py | 119 ------ llm_finetune/scripts/utils.py | 386 ------------------ llm_finetuning/Dockerfile | 10 + {llm_finetune => llm_finetuning}/README.md | 0 .../configs/default_config.yaml | 0 .../convergence_example.txt | 0 llm_finetuning/requirements.txt | 5 + .../run_docker.sh | 0 .../run_llama_70B_scrolls_r16.sh | 0 .../scripts/mlperf_logging_utils.py | 190 +++++++++ .../scripts/train.py | 101 ++--- llm_finetuning/scripts/utils.py | 231 +++++++++++ 14 files changed, 464 insertions(+), 777 deletions(-) delete mode 100644 llm_finetune/requirements.txt delete mode 100644 llm_finetune/scripts/eval.py delete mode 100644 llm_finetune/scripts/mlperf_logging_utils.py delete mode 100644 llm_finetune/scripts/utils.py create mode 100644 llm_finetuning/Dockerfile rename {llm_finetune => llm_finetuning}/README.md (100%) rename {llm_finetune => llm_finetuning}/configs/default_config.yaml (100%) rename {llm_finetune => llm_finetuning}/convergence_example.txt (100%) create mode 100644 llm_finetuning/requirements.txt rename {llm_finetune => llm_finetuning}/run_docker.sh (100%) rename {llm_finetune => llm_finetuning}/run_llama_70B_scrolls_r16.sh (100%) create mode 100644 llm_finetuning/scripts/mlperf_logging_utils.py rename {llm_finetune => llm_finetuning}/scripts/train.py (71%) create mode 100644 llm_finetuning/scripts/utils.py diff --git a/llm_finetune/requirements.txt b/llm_finetune/requirements.txt deleted file mode 100644 index f4f18be52..000000000 --- a/llm_finetune/requirements.txt +++ /dev/null @@ -1,9 +0,0 @@ -transformers -accelerate -peft -datasets -deepspeed -bitsandbytes -evaluate -nltk -rouge-score diff --git a/llm_finetune/scripts/eval.py b/llm_finetune/scripts/eval.py deleted file mode 100644 index b5dd150a4..000000000 --- a/llm_finetune/scripts/eval.py +++ /dev/null @@ -1,190 +0,0 @@ -import argparse -import torch -from dataclasses import dataclass -from torch.utils.data import DataLoader -from transformers import AutoTokenizer, AutoModelForCausalLM -from peft import AutoPeftModelForCausalLM -from peft.config import PeftConfigMixin -from datasets import load_dataset -import evaluate -import nltk -import numpy as np -from tqdm import tqdm -from typing import Any, Dict, List, Union, Optional -from transformers.tokenization_utils_base import PreTrainedTokenizerBase - -nltk.download("punkt") - -# Arguments management -parser = argparse.ArgumentParser() -parser.add_argument( - "--model_name", - default=None, - type=str, - help="Path to pre-trained model (on the HF Hub or locally).", -) -parser.add_argument( - "--peft_model_name", - default=None, - type=str, - help="Path to PEFT model (on the HF Hub or locally).", -) -parser.add_argument( - "--max_new_tokens", type=int, default=300, help="Number of tokens to generate." 
-) -parser.add_argument("--seq_length", type=int, default=8192, help="Sequence length.") -parser.add_argument("--do_sample", action="store_true", help="Wheter to generate doing multinomial sampling.") -parser.add_argument("--dataset_name", type=str, default="tau/scrolls", help= "The preference dataset to use.") -parser.add_argument("--dataset_config_name", type=str, default="gov_report", help= "The preference dataset config to use.") -args = parser.parse_args() - -# Instantiate model -if args.peft_model_name is not None: - model = ( - AutoPeftModelForCausalLM.from_pretrained( - args.peft_model_name, - device_map="auto", - torch_dtype=torch.bfloat16, - trust_remote_code=True, - ) - .merge_and_unload() - .eval() - ) - base_model_name = PeftConfigMixin.from_pretrained( - args.peft_model_name - ).base_model_name_or_path - tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True) -else: - model = AutoModelForCausalLM.from_pretrained( - args.model_name, - device_map="auto", - torch_dtype=torch.bfloat16, - trust_remote_code=True, - ).eval() - tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True) - -model.generation_config.pad_token_id = model.generation_config.eos_token_id -tokenizer.pad_token = tokenizer.eos_token - -# Load dataset -dataset = load_dataset( - args.dataset_name, - args.dataset_config_name, - use_auth_token=True, - num_proc=4, - split="validation" - ) -column_names = dataset.features - -def tokenize_function(examples): - output_texts = [] - for i in range(len(examples["input"])): - output_texts.append( - f"### Summarize the following text:\n {examples['input'][i]}\n ### Summary:\n " - ) - input_ids = tokenizer(output_texts).input_ids - - return {"input_ids": input_ids, "ground_truth": examples["output"]} - - -test_dataset = dataset.map( - tokenize_function, - batched=True, - num_proc=2, - remove_columns=column_names, -) - - -def filter_function(examples): - to_keep = [] - for i in range(len(examples["input_ids"])): - if len(examples["input_ids"][i]) > args.seq_length - args.max_new_tokens: - to_keep.append(False) - else: - to_keep.append(True) - return to_keep - - -test_dataset = test_dataset.filter( - filter_function, - batched=True, - num_proc=2, -) -print(f"Size of the test set: {len(test_dataset)}.") - - -@dataclass -class CustomDataCollator: - tokenizer: PreTrainedTokenizerBase - padding: Union[bool, str] = True - max_length: Optional[int] = None - pad_to_multiple_of: Optional[int] = None - return_tensors: str = "pt" - - def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: - input_ids = [{"input_ids": sample["input_ids"]} for sample in features] - batch = self.tokenizer.pad( - input_ids, - padding=self.padding, - max_length=self.max_length, - pad_to_multiple_of=self.pad_to_multiple_of, - return_tensors=self.return_tensors, - ) - batch["ground_truth"] = [sample["ground_truth"] for sample in features] - return batch - - -dataloader = DataLoader( - test_dataset, - batch_size=1, - collate_fn=CustomDataCollator(tokenizer), -) - - -def postprocess_text(preds, labels): - preds = [pred.strip() for pred in preds] - labels = [label.strip() for label in labels] - - # rougeLSum expects newline after each sentence - preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds] - labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels] - - return preds, labels - - -metric = evaluate.load("rouge") - - -def compute_metrics(generated, ground_truth): - # Some simple post-processing - decoded_preds, 
decoded_labels = postprocess_text(generated, ground_truth) - result = metric.compute( - predictions=decoded_preds, references=decoded_labels, use_stemmer=True - ) - result = {k: round(v * 100, 4) for k, v in result.items()} - prediction_lens = [ - np.count_nonzero(gen != tokenizer.pad_token_id) for gen in generated - ] - result["gen_len"] = np.mean(prediction_lens) - return result - - -generated_sequences = [] -ground_truths = [] -for batch in tqdm(dataloader): - outputs = model.generate( - inputs=batch["input_ids"].to("cuda"),do_sample=args.do_sample , max_new_tokens=args.max_new_tokens - ) - outputs = [ - output.split("### Summary:\n ")[-1] - for output in tokenizer.batch_decode(outputs, skip_special_tokens=True) - ] - - print("Batch outputs:", outputs) - print("Batch ground truths:", batch["ground_truth"]) - generated_sequences += outputs - ground_truths += batch["ground_truth"] - print("Current results:", compute_metrics(generated_sequences, ground_truths)) - -res = compute_metrics(generated_sequences, ground_truths) -print("Final results:", res) diff --git a/llm_finetune/scripts/mlperf_logging_utils.py b/llm_finetune/scripts/mlperf_logging_utils.py deleted file mode 100644 index b752473ac..000000000 --- a/llm_finetune/scripts/mlperf_logging_utils.py +++ /dev/null @@ -1,119 +0,0 @@ -import os -from mlperf_logging import mllog -from mlperf_logging.mllog import constants -import torch -import torch.distributed as dist -from transformers import TrainingArguments, TrainerCallback, TrainerState, TrainerControl - - - -def is_dist_avail_and_initialized(): - if not dist.is_available(): - return False - if not dist.is_initialized(): - return False - return True - -def get_rank(): - if not is_dist_avail_and_initialized(): - return 0 - return dist.get_rank() - -def barrier(): - if not is_dist_avail_and_initialized(): - return - torch.distributed.barrier() - -class LoraLogger: - def __init__(self,target_eval_loss=None, filename=None, default_stack_offset=2): - self.mllogger = mllog.get_mllogger() - mllog.config(default_stack_offset=default_stack_offset, - filename=(filename or os.getenv("COMPLIANCE_FILE") or "mlperf_compliance.log"), - root_dir=os.path.normpath(os.path.dirname(os.path.realpath(__file__)))) - self.target_eval_loss=target_eval_loss - - @property - def rank(self): - return get_rank() - - def event(self, key, value=None, metadata=None, sync=False, log_rank=None): - log_rank = self.rank==0 if log_rank is None else log_rank - if sync: - barrier() - if log_rank: - self.mllogger.event(key=key, value=value, metadata=metadata) - - def start(self, key, value=None, metadata=None, sync=False, log_rank=None): - log_rank = self.rank==0 if log_rank is None else log_rank - if sync: - barrier() - if log_rank: - self.mllogger.start(key=key, value=value, metadata=metadata) - - def end(self, key, value=None, metadata=None, sync=False, log_rank=None): - log_rank = self.rank==0 if log_rank is None else log_rank - if sync: - barrier() - if log_rank: - self.mllogger.end(key=key, value=value, metadata=metadata) - -def submission_info(mllogger: LoraLogger, - submission_benchmark: str, submission_division: str, submission_org: str, submission_platform: str, - submission_poc_name: str, submission_poc_email: str, submission_status: str): - """Logs required for a valid MLPerf submission.""" - if mllogger.rank==0: - mllogger.event(key=constants.SUBMISSION_BENCHMARK, value=submission_benchmark) - mllogger.event(key=constants.SUBMISSION_DIVISION, value=submission_division) - 
mllogger.event(key=constants.SUBMISSION_ORG, value=submission_org) - mllogger.event(key=constants.SUBMISSION_PLATFORM, value=submission_platform) - mllogger.event(key=constants.SUBMISSION_POC_NAME, value=submission_poc_name) - mllogger.event(key=constants.SUBMISSION_POC_EMAIL, value=submission_poc_email) - mllogger.event(key=constants.SUBMISSION_STATUS, value=submission_status) - -def general_info(mllogger: LoraLogger,args,world_size,eval_samples,train_samples): - if mllogger.rank==0: - mllogger.event(key=constants.GLOBAL_BATCH_SIZE, value=args.per_device_train_batch_size*args.gradient_accumulation_steps*world_size) - mllogger.event(key=constants.TRAIN_SAMPLES, value=train_samples) - mllogger.event(key=constants.EVAL_SAMPLES, value=eval_samples) - mllogger.event(key=constants.SEED, value=args.seed) - -def optimization_info(mllogger: LoraLogger,args): - """Logs required for a valid MLPerf submission.""" - if mllogger.rank==0: - mllogger.event(key=constants.OPT_LR_WARMUP_FACTOR, value=args.warmup_ratio) - mllogger.event(key=constants.OPT_LR_TRAINING_STEPS, value=args.max_steps) - mllogger.event(key=constants.OPT_BASE_LR, value=args.learning_rate) - #mllogger.event(key=constants.OPT_LAMB_BETA_1, value=args.beta1) - -class MLPerfCallback(TrainerCallback): - "A callback that prints a message at the beginning of training" - def __init__(self,logger): - super().__init__() - self.mllogger = logger - - def on_train_begin(self, args, state, control, **kwargs): - self.mllogger.start(constants.RUN_START,value='') - - def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): - """ - Event called at the beginning of a training step. If using gradient accumulation, one training step might take - several inputs. - """ - if state.global_step % (state.logging_steps) == 0 and state.global_step > 0 and not state.global_step % (state.eval_steps) == 0: - self.mllogger.event('train_loss',value=state.log_history[-1]['loss'],metadata={"steps":state.log_history[-1]['step']}) - control.should_log = True - - if state.global_step % (state.eval_steps) == 0 and state.global_step>0: - self.mllogger.event('eval_loss',value=state.log_history[-1]['eval_loss'],metadata={"steps":state.log_history[-1]['step']}) - control.should_log = True - - eval_loss_list=[sl['eval_loss'] for sl in state.log_history if 'eval_loss' in sl] - if eval_loss_list and eval_loss_list[-1]<=self.mllogger.target_eval_loss: - control.should_training_stop = True - self.mllogger.end(constants.RUN_STOP,value=eval_loss_list[-1],metadata={"steps":state.log_history[-1]['step'],"status": 'success'}) - if state.global_step >= state.max_steps: - control.should_training_stop = True - self.mllogger.end(constants.RUN_STOP,value=eval_loss_list[-1],metadata={"steps":state.log_history[-1]['step'],"status": 'fail'}) - - return control - diff --git a/llm_finetune/scripts/utils.py b/llm_finetune/scripts/utils.py deleted file mode 100644 index 771c9753e..000000000 --- a/llm_finetune/scripts/utils.py +++ /dev/null @@ -1,386 +0,0 @@ -import random -import torch -from torch.utils.data import IterableDataset -from datasets import load_dataset -from tqdm import tqdm -import warnings -from peft import LoraConfig, get_peft_model,prepare_model_for_kbit_training -import yaml -from deepspeed.accelerator import get_accelerator - - -from peft.tuners.lora import LoraLayer -from transformers import ( - AutoModelForCausalLM, - AutoTokenizer, - BitsAndBytesConfig, - TrainerCallback, - TrainingArguments, - TrainerState, - TrainerControl, -) 
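The `MLPerfCallback` removed just above boils down to one reusable pattern: watch the latest eval loss from a `TrainerCallback` and flip `should_training_stop` once a target is reached (the same logic reappears, reformatted, in the new `mlperf_logging_utils.py` later in this patch). A minimal sketch of just that pattern, with illustrative names and the MLPerf logging stripped out:

```python
from transformers import TrainerCallback, TrainerControl, TrainerState, TrainingArguments


class TargetLossCallback(TrainerCallback):
    """Stop training once the latest eval loss reaches a target value (illustrative)."""

    def __init__(self, target_eval_loss: float):
        self.target_eval_loss = target_eval_loss

    def on_evaluate(self, args: TrainingArguments, state: TrainerState,
                    control: TrainerControl, metrics=None, **kwargs):
        # Trainer passes the evaluation metrics dict to this hook after each eval pass.
        if metrics and metrics.get("eval_loss", float("inf")) <= self.target_eval_loss:
            control.should_training_stop = True
        return control


# Usage, matching the run script's --target_eval_loss 0.925:
# trainer.add_callback(TargetLossCallback(0.925))
```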
-from transformers.utils import ( - is_sagemaker_mp_enabled, - is_apex_available, - #get_accelerator, -) -if is_apex_available(): - from apex import amp - -if is_sagemaker_mp_enabled(): - import smdistributed.modelparallel.torch as smp - from smdistributed.modelparallel import __version__ as SMP_VERSION - - IS_SAGEMAKER_MP_POST_1_10 = version.parse(SMP_VERSION) >= version.parse("1.10") - - from .trainer_pt_utils import smp_forward_backward, smp_forward_only, smp_gather, smp_nested_concat -else: - IS_SAGEMAKER_MP_POST_1_10 = False - -from itertools import chain -from functools import partial -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union - -class SaveDeepSpeedPeftModelCallback(TrainerCallback): - def __init__(self, trainer, save_steps=500): - self.trainer = trainer - self.save_steps = save_steps - - def on_step_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if (state.global_step + 1) % self.save_steps == 0: - self.trainer.accelerator.wait_for_everyone() - state_dict = self.trainer.accelerator.get_state_dict(self.trainer.deepspeed) - unwrapped_model = self.trainer.accelerator.unwrap_model( - self.trainer.deepspeed - ) - if self.trainer.accelerator.is_main_process: - unwrapped_model.save_pretrained(args.output_dir, state_dict=state_dict) - self.trainer.accelerator.wait_for_everyone() - return control - - -class ConstantLengthDataset(IterableDataset): - """ - Iterable dataset that returns constant length chunks of tokens from stream of text files. - Args: - tokenizer (Tokenizer): The processor used for proccessing the data. - dataset (dataset.Dataset): Dataset with text files. - infinite (bool): If True the iterator is reset after dataset reaches end else stops. - seq_length (int): Length of token sequences to return. - num_of_sequences (int): Number of token sequences to keep in buffer. - chars_per_token (int): Number of characters per token used to estimate number of tokens in text buffer. - shuffle (bool): If true, the samples in each buffer are suffled. Default is `True`. - add_eos_token (bool): If true, each buffer is delimited with eos token. Default is `True`. 
- """ - - def __init__( - self, - tokenizer, - dataset, - infinite=False, - seq_length=1024, - num_of_sequences=1024, - chars_per_token=3.6, - content_field="content", - shuffle=True, - add_eos_token=True, - ): - self.tokenizer = tokenizer - self.concat_token_id = tokenizer.eos_token_id - self.dataset = dataset - self.seq_length = seq_length - self.infinite = infinite - self.current_size = 0 - self.max_buffer_size = seq_length * chars_per_token * num_of_sequences - self.content_field = content_field - self.shuffle = shuffle - self.add_eos_token = add_eos_token - - def __iter__(self): - iterator = iter(self.dataset) - more_examples = True - while more_examples: - buffer, buffer_len = [], 0 - while True: - if buffer_len >= self.max_buffer_size: - break - try: - buffer.append(next(iterator)[self.content_field]) - buffer_len += len(buffer[-1]) - except StopIteration: - if self.infinite: - iterator = iter(self.dataset) - else: - more_examples = False - break - - tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"] - all_token_ids = [] - for tokenized_input in tokenized_inputs: - if self.add_eos_token: - tokenized_input = tokenized_input + [self.concat_token_id] - all_token_ids.extend(tokenized_input) - examples = [] - for i in range(0, len(all_token_ids), self.seq_length): - input_ids = all_token_ids[i : i + self.seq_length] - if len(input_ids) == self.seq_length: - examples.append(input_ids) - if self.shuffle: - random.shuffle(examples) - for example in examples: - self.current_size += 1 - yield { - "input_ids": torch.LongTensor(example), - "labels": torch.LongTensor(example), - } - - -def chars_token_ratio(dataset, tokenizer, data_column, nb_examples=400): - """ - Estimate the average number of characters per token in the dataset. - """ - total_characters, total_tokens = 0, 0 - for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples): - total_characters += len(example[data_column]) - total_tokens += len(tokenizer(example[data_column]).tokens()) - - return total_characters / total_tokens - -def group_texts(examples, block_size): - # Concatenate all texts. - concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} - total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict. - # We could add padding if the model supported it instead of this drop, you can customize this part to your needs. - total_length = (total_length // block_size) * block_size - # Split by chunks of max_len. 
- result = { - k: [t[i : i + block_size] for i in range(0, total_length, block_size)] - for k, t in concatenated_examples.items() - } - if 'labels' not in result: - result["labels"] = result["input_ids"].copy() - return result - - -def create_datasets(tokenizer, args): - dataset = load_dataset( - args.dataset_name, - args.dataset_config_name, - use_auth_token=True, - num_proc=args.num_workers, - ) - train_dataset = dataset["train"] - valid_dataset = dataset["validation"] - column_names = train_dataset.features - - def tokenize_function(example,eval=False): - output_texts = [] - mask_labels_sizes=[] - for i in range(len(example["input"])): - if 'gov_report' in args.dataset_config_name: - output_texts.append( - f"### Summarize the following text:\n {example['input'][i]}\n ### Summary:\n {example['output'][i]}{tokenizer.eos_token}" - ) - if eval: - mask_labels_sizes.append(f"### Summarize the following text:\n {example['input'][i]}\n ### Summary:\n") - else: - output_texts.append( - f"### {example['input'][i]}\n ### The answer is:\n {example['output'][i]}{tokenizer.eos_token}" - ) - - input_ids = tokenizer(output_texts).input_ids - - if eval: - labels_ids = tokenizer(mask_labels_sizes).input_ids - masked_labels=[] - for out,lb in zip(input_ids,labels_ids): - ml=out.copy() - ml[:len(lb)]=[-100]*len(lb) - ml[-1]=-100 - masked_labels.append(ml) - return {"input_ids": input_ids,"labels": masked_labels} - else: - return {"input_ids": input_ids} - - train_dataset = train_dataset.map( - tokenize_function, - batched=True, - num_proc=8, - remove_columns=column_names, - ) - valid_dataset = valid_dataset.map( - partial(tokenize_function,eval=True), - batched=True, - num_proc=2, - remove_columns=column_names, - ) - - def filter_function(example): - to_keep = [] - for i in range(len(example["input_ids"])): - if len(example["input_ids"][i]) > args.max_seq_length: - to_keep.append(False) - else: - to_keep.append(True) - return to_keep - - train_dataset = train_dataset.filter( - filter_function, - batched=True, - # with_indices=True, - num_proc=8, - # remove_columns=column_names, - ) - valid_dataset = valid_dataset.filter( - filter_function, - batched=True, - # with_indices=True, - num_proc=2, - # remove_columns=column_names, - ) - print( - f"Before packing, Size of the train set: {len(train_dataset)}. Size of the validation set: {len(valid_dataset)}" - ) - - packing_method = partial(group_texts, block_size=args.max_seq_length) - # Packing - train_dataset = train_dataset.map( - packing_method, - batched=True, - num_proc=8, - ) - valid_dataset = valid_dataset.map( - packing_method, - batched=True, - num_proc=2, - ) - - print( - f"Size of the train set: {len(train_dataset)}. 
Size of the validation set: {len(valid_dataset)}" - ) - - return train_dataset, valid_dataset - -def world_size_from_yaml(yaml_path): - with open(yaml_path, 'r') as f: - data = yaml.load(f, Loader=yaml.SafeLoader) - return data['num_machines']*data['num_processes'] - -def create_and_prepare_model(args): - device_map = None - - model = AutoModelForCausalLM.from_pretrained( - args.model_path, - device_map=device_map, - use_cache=not args.use_gradient_checkpointing, - trust_remote_code=True, - use_flash_attention_2=True if args.use_flash_attn else False, - torch_dtype=torch.bfloat16, - ) - - peft_config = None - if args.use_peft_lora: - peft_config = LoraConfig( - lora_alpha=args.lora_alpha, - lora_dropout=args.lora_dropout, - r=args.lora_r, - bias="none", - task_type="CAUSAL_LM", - target_modules=None - if args.lora_target_modules is None - else args.lora_target_modules.split(","), - ) - if args.use_gradient_checkpointing: - model.gradient_checkpointing_enable() - model = get_peft_model(model, peft_config) - model.print_trainable_parameters() - - tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True) - tokenizer.pad_token = tokenizer.eos_token - - return model, peft_config, tokenizer - -def training_step(self, model: torch.nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor: - """ - Perform a training step on a batch of inputs. - Subclass and override to inject custom behavior. - Args: - model (`nn.Module`): - The model to train. - inputs (`Dict[str, Union[torch.Tensor, Any]]`): - The inputs and targets of the model. - The dictionary will be unpacked before being fed to the model. Most models expect the targets under the - argument `labels`. Check your model's documentation for all accepted arguments. - Return: - `torch.Tensor`: The tensor with training loss on this batch. 
- """ - model.train() - inputs = self._prepare_inputs(inputs) - if is_sagemaker_mp_enabled(): - loss_mb = smp_forward_backward(model, inputs, self.args.gradient_accumulation_steps) - return loss_mb.reduce_mean().detach().to(self.args.device) - with self.compute_loss_context_manager(): - loss = self.compute_loss(model, inputs) - if self.args.n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel training - if self.use_apex: - with amp.scale_loss(loss, self.optimizer) as scaled_loss: - scaled_loss.backward() - else: - self.accelerator.backward(loss) - get_accelerator().empty_cache() - return loss.detach() / self.args.gradient_accumulation_steps - - -def create_and_prepare_model_unfuse(args): - device_map = None - - model = AutoModelForCausalLM.from_pretrained( - args.model_name, - device_map=device_map, - use_cache=not args.use_gradient_checkpointing, - trust_remote_code=True, - use_flash_attention_2=True if args.use_flash_attn else False, - torch_dtype=torch.bfloat16, - ) - - peft_config = None - if args.use_peft_lora: - peft_config = LoraConfig( - lora_alpha=args.lora_alpha, - lora_dropout=args.lora_dropout, - r=args.lora_r, - bias="none", - task_type="CAUSAL_LM", - target_modules=None - if args.lora_target_modules is None - else args.lora_target_modules.split(","), - ) - if args.use_gradient_checkpointing: - model.gradient_checkpointing_enable() - model = get_peft_model(model, peft_config) - model.print_trainable_parameters() - - tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True) - tokenizer.pad_token = tokenizer.eos_token - - return model, peft_config, tokenizer - -def peft_module_casting_to_bf16(model, args): - for name, module in model.named_modules(): - if isinstance(module, LoraLayer): - if args.bf16: - module = module.to(torch.bfloat16) - if "norm" in name: - module = module.to(torch.float32) - if any(x in name for x in ["lm_head", "embed_tokens", "wte", "wpe"]): - if hasattr(module, "weight"): - if args.bf16 and module.weight.dtype == torch.float32: - module = module.to(torch.bfloat16) diff --git a/llm_finetuning/Dockerfile b/llm_finetuning/Dockerfile new file mode 100644 index 000000000..e56eb8960 --- /dev/null +++ b/llm_finetuning/Dockerfile @@ -0,0 +1,10 @@ +ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:24.01-py3 +FROM ${FROM_IMAGE_NAME} + +WORKDIR /workspace/ft-llm +ADD . 
/workspace/ft-llm + +ENV OMP_NUM_THREADS=8 + +RUN pip install -r requirements.txt +RUN pip install flash-attn==2.4.1 --no-build-isolation diff --git a/llm_finetune/README.md b/llm_finetuning/README.md similarity index 100% rename from llm_finetune/README.md rename to llm_finetuning/README.md diff --git a/llm_finetune/configs/default_config.yaml b/llm_finetuning/configs/default_config.yaml similarity index 100% rename from llm_finetune/configs/default_config.yaml rename to llm_finetuning/configs/default_config.yaml diff --git a/llm_finetune/convergence_example.txt b/llm_finetuning/convergence_example.txt similarity index 100% rename from llm_finetune/convergence_example.txt rename to llm_finetuning/convergence_example.txt diff --git a/llm_finetuning/requirements.txt b/llm_finetuning/requirements.txt new file mode 100644 index 000000000..474a3e297 --- /dev/null +++ b/llm_finetuning/requirements.txt @@ -0,0 +1,5 @@ +transformers +accelerate +peft +datasets +deepspeed \ No newline at end of file diff --git a/llm_finetune/run_docker.sh b/llm_finetuning/run_docker.sh similarity index 100% rename from llm_finetune/run_docker.sh rename to llm_finetuning/run_docker.sh diff --git a/llm_finetune/run_llama_70B_scrolls_r16.sh b/llm_finetuning/run_llama_70B_scrolls_r16.sh similarity index 100% rename from llm_finetune/run_llama_70B_scrolls_r16.sh rename to llm_finetuning/run_llama_70B_scrolls_r16.sh diff --git a/llm_finetuning/scripts/mlperf_logging_utils.py b/llm_finetuning/scripts/mlperf_logging_utils.py new file mode 100644 index 000000000..f8b28035c --- /dev/null +++ b/llm_finetuning/scripts/mlperf_logging_utils.py @@ -0,0 +1,190 @@ +import os + +import torch +import torch.distributed as dist +from mlperf_logging import mllog +from mlperf_logging.mllog import constants +from transformers import ( + TrainerCallback, + TrainerControl, + TrainerState, + TrainingArguments, +) + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + + +def barrier(): + if not is_dist_avail_and_initialized(): + return + torch.distributed.barrier() + + +class LoraLogger: + def __init__(self, target_eval_loss=None, filename=None, default_stack_offset=2): + self.mllogger = mllog.get_mllogger() + mllog.config( + default_stack_offset=default_stack_offset, + filename=( + filename or os.getenv("COMPLIANCE_FILE") or "mlperf_compliance.log" + ), + root_dir=os.path.normpath(os.path.dirname(os.path.realpath(__file__))), + ) + self.target_eval_loss = target_eval_loss + + @property + def rank(self): + return get_rank() + + def event(self, key, value=None, metadata=None, sync=False, log_rank=None): + log_rank = self.rank == 0 if log_rank is None else self.rank == log_rank + if sync: + barrier() + if log_rank: + self.mllogger.event(key=key, value=value, metadata=metadata) + + def start(self, key, value=None, metadata=None, sync=False, log_rank=None): + log_rank = self.rank == 0 if log_rank is None else self.rank == log_rank + if sync: + barrier() + if log_rank: + self.mllogger.start(key=key, value=value, metadata=metadata) + + def end(self, key, value=None, metadata=None, sync=False, log_rank=None): + log_rank = self.rank == 0 if log_rank is None else self.rank == log_rank + if sync: + barrier() + if log_rank: + self.mllogger.end(key=key, value=value, metadata=metadata) + + +class MLPerfCallback(TrainerCallback): + "A callback that prints a message 
at the beginning of training" + + def __init__(self, logger, train_dataset_length, eval_dataset_length): + super().__init__() + self.mllogger = logger + self.submission_info = { + "submission_benchmark": "llm-finetuning", + "submission_division": "Closed", + "submission_org": "referece", + "submission_platform": "referece", + "submission_poc_name": "referece", + "submission_poc_email": "referece", + "submission_status": "referece", + "train_dataset_length": train_dataset_length, + "eval_dataset_length": eval_dataset_length, + } + + def on_train_begin(self, args, state, control, **kwargs): + self.mllogger.event( + key=constants.SUBMISSION_BENCHMARK, + value=self.submission_info["submission_benchmark"], + ) + self.mllogger.event( + key=constants.SUBMISSION_DIVISION, + value=self.submission_info["submission_division"], + ) + self.mllogger.event( + key=constants.SUBMISSION_ORG, value=self.submission_info["submission_org"] + ) + self.mllogger.event( + key=constants.SUBMISSION_PLATFORM, + value=self.submission_info["submission_platform"], + ) + self.mllogger.event( + key=constants.SUBMISSION_POC_NAME, + value=self.submission_info["submission_poc_name"], + ) + self.mllogger.event( + key=constants.SUBMISSION_POC_EMAIL, + value=self.submission_info["submission_poc_email"], + ) + self.mllogger.event( + key=constants.SUBMISSION_STATUS, + value=self.submission_info["submission_status"], + ) + self.mllogger.event( + key=constants.GLOBAL_BATCH_SIZE, + value=args.per_device_train_batch_size + * args.gradient_accumulation_steps + * os.getenv("WORLD_SIZE", 1), + ) + self.mllogger.event( + key=constants.TRAIN_SAMPLES, + value=self.submission_info["train_dataset_length"], + ) + self.mllogger.event( + key=constants.EVAL_SAMPLES, + value=self.submission_info["eval_dataset_length"], + ) + self.mllogger.event(key=constants.SEED, value=args.seed) + self.mllogger.event(key=constants.OPT_LR_WARMUP_FACTOR, value=args.warmup_ratio) + self.mllogger.event(key=constants.OPT_LR_TRAINING_STEPS, value=args.max_steps) + self.mllogger.event(key=constants.OPT_BASE_LR, value=args.learning_rate) + self.mllogger.start(constants.RUN_START, value="") + + def on_step_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + """ + Event called at the beginning of a training step. If using gradient accumulation, one training step might take + several inputs. 
+ """ + if ( + state.global_step % (state.logging_steps) == 0 + and state.global_step > 0 + and not state.global_step % (state.eval_steps) == 0 + ): + self.mllogger.event( + "train_loss", + value=state.log_history[-1]["loss"], + metadata={"step_num": state.log_history[-1]["step"]}, + ) + control.should_log = True + + if state.global_step % (state.eval_steps) == 0 and state.global_step > 0: + self.mllogger.event( + "eval_loss", + value=state.log_history[-1]["eval_loss"], + metadata={"step_num": state.log_history[-1]["step"]}, + ) + control.should_log = True + eval_loss_list = [ + sl["eval_loss"] for sl in state.log_history if "eval_loss" in sl + ] + if eval_loss_list and eval_loss_list[-1] <= self.mllogger.target_eval_loss: + control.should_training_stop = True + self.mllogger.end( + constants.RUN_STOP, + value=eval_loss_list[-1], + metadata={ + "step_num": state.log_history[-1]["step"], + "status": "success", + }, + ) + if state.global_step >= state.max_steps: + control.should_training_stop = True + self.mllogger.end( + constants.RUN_STOP, + value=eval_loss_list[-1], + metadata={"step_num": state.log_history[-1]["step"], "status": "fail"}, + ) + + return control diff --git a/llm_finetune/scripts/train.py b/llm_finetuning/scripts/train.py similarity index 71% rename from llm_finetune/scripts/train.py rename to llm_finetuning/scripts/train.py index a5c17ac20..34e1d0849 100644 --- a/llm_finetune/scripts/train.py +++ b/llm_finetuning/scripts/train.py @@ -12,27 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +import functools from dataclasses import dataclass, field -from pathlib import Path -import os -import subprocess from typing import Optional -import os -from transformers import HfArgumentParser, TrainingArguments, Trainer -from transformers.modeling_utils import unwrap_model -from mlperf_logging_utils import MLPerfCallback,LoraLogger,submission_info,general_info,optimization_info + from datasets import load_dataset -import numpy as np -import functools -from utils import ( - create_and_prepare_model, - world_size_from_yaml, - training_step, - SaveDeepSpeedPeftModelCallback, - peft_module_casting_to_bf16, -) +from mlperf_logging_utils import LoraLogger, MLPerfCallback +from transformers import HfArgumentParser, Trainer, TrainingArguments +from utils import create_and_prepare_model, peft_module_casting_to_bf16, training_step + -# Define and parse arguments. @dataclass class ScriptArguments: """ @@ -66,19 +56,17 @@ class ScriptArguments: }, ) model_path: Optional[str] = field( - default='./llama-v2-fused-qkv', - metadata={ - "help": "Path to the model directory." 
- }, + default="./llama-v2-fused-qkv", + metadata={"help": "Path to the model directory."}, ) dataset_path: Optional[str] = field( - default='./dataset.npy', + default="./dataset.npy", metadata={"help": "The path to the downloaded dataset."}, ) config_path: Optional[str] = field( default="./configs/default_config.yaml", metadata={"help": "path to model config"}, - ) + ) num_train_epochs: Optional[int] = field( default=1, metadata={"help": "The number of training epochs for the reward model."}, @@ -159,25 +147,11 @@ class ScriptArguments: def main(args): - loralogger=LoraLogger(target_eval_loss=args.target_eval_loss) - submission_info(loralogger, - submission_benchmark="llm-finetuning", - submission_division="Closed", - submission_org="referece", - submission_platform="referece", - submission_poc_name="referece", - submission_poc_email="referece", - submission_status="referece") - - # training arguments - is_deepspeed_peft_enabled = ( - os.environ.get("ACCELERATE_USE_DEEPSPEED", "False").lower() == "true" - and args.use_peft_lora - ) - save_strategy = "steps" + loralogger = LoraLogger(target_eval_loss=args.target_eval_loss) training_arguments = TrainingArguments( output_dir=args.output_dir, per_device_train_batch_size=args.per_device_train_batch_size, + per_device_eval_batch_size=args.per_device_eval_batch_size, gradient_accumulation_steps=args.gradient_accumulation_steps, optim=args.optim, learning_rate=args.learning_rate, @@ -188,7 +162,7 @@ def main(args): lr_scheduler_type=args.lr_scheduler_type, num_train_epochs=args.num_train_epochs, evaluation_strategy="steps", - save_strategy=save_strategy, + save_strategy="no", max_steps=args.max_steps, eval_steps=args.eval_steps, save_steps=args.save_steps, @@ -199,60 +173,41 @@ def main(args): report_to="tensorboard", seed=args.seed, ) - - # model - model, peft_config, tokenizer = create_and_prepare_model(args) + + model = create_and_prepare_model(args) model.config.use_cache = False # datasets ## ToDo uncomment once drive goes public - #train_url = "https://drive.google.com/file/d/1-JgY1mEafcJ7qhggt6UR3OEKAciIPd5s/view?usp=sharing" - #eval_url = "https://drive.google.com/file/d/1jrm6Lacrq49AYv0uB_Qy22xRmfPixQvs/view?usp=sharing" - #dataset = load_dataset("parquet", data_files={'train': train_url, 'validation': eval_url}) - dataset = load_dataset("parquet", data_files={'train': 'dataset/train-00000-of-00001.parquet', 'validation': 'dataset/validation-00000-of-00001.parquet'}) + # train_url = "https://drive.google.com/file/d/1-JgY1mEafcJ7qhggt6UR3OEKAciIPd5s/view?usp=sharing" + # eval_url = "https://drive.google.com/file/d/1jrm6Lacrq49AYv0uB_Qy22xRmfPixQvs/view?usp=sharing" + # dataset = load_dataset("parquet", data_files={'train': train_url, 'validation': eval_url}) + dataset = load_dataset( + "parquet", + data_files={ + "train": f"{args.dataset_path}/train-00000-of-00001.parquet", + "validation": f"{args.dataset_path}/validation-00000-of-00001.parquet", + }, + ) train_dataset, eval_dataset = dataset["train"], dataset["validation"] - - - world_size = world_size_from_yaml(args.config_path) - general_info(loralogger,args,world_size=world_size,eval_samples=len(eval_dataset),train_samples=len(train_dataset)) - optimization_info(loralogger,args) - - # trainer trainer = Trainer( model=model, args=training_arguments, train_dataset=train_dataset, eval_dataset=eval_dataset, - callbacks=[MLPerfCallback(loralogger)], + callbacks=[MLPerfCallback(loralogger, len(train_dataset), len(eval_dataset))], ) - trainer.training_step = functools.partial(training_step, 
trainer) + trainer.training_step = functools.partial(training_step, trainer) trainer.accelerator.print(f"{trainer.model}") if args.use_peft_lora: trainer.model.print_trainable_parameters() - if is_deepspeed_peft_enabled: - trainer.add_callback( - SaveDeepSpeedPeftModelCallback(trainer, save_steps=args.save_steps) - ) - if args.use_peft_lora: peft_module_casting_to_bf16(trainer.model, args) - # train trainer.train() - # Save the PEFT adapter on main process - if trainer.args.process_index == 0: - if args.push_to_hub: - print("Push to hub...") - trainer.push_to_hub() - if args.use_peft_lora: - trainer.model.push_to_hub(args.output_dir) - else: - print("Save model...") - unwrap_model(trainer.model).save_pretrained(args.output_dir) - if __name__ == "__main__": parser = HfArgumentParser(ScriptArguments) diff --git a/llm_finetuning/scripts/utils.py b/llm_finetuning/scripts/utils.py new file mode 100644 index 000000000..0dc7f2c30 --- /dev/null +++ b/llm_finetuning/scripts/utils.py @@ -0,0 +1,231 @@ +import torch +from datasets import load_dataset +from deepspeed.accelerator import get_accelerator +from peft import LoraConfig, get_peft_model +from peft.tuners.lora import LoraLayer +from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers.utils import is_apex_available, is_sagemaker_mp_enabled + +if is_apex_available(): + from apex import amp + +if is_sagemaker_mp_enabled(): + import smdistributed.modelparallel.torch as smp + from smdistributed.modelparallel import __version__ as SMP_VERSION + + IS_SAGEMAKER_MP_POST_1_10 = version.parse(SMP_VERSION) >= version.parse("1.10") + + from .trainer_pt_utils import ( + smp_forward_backward, + smp_forward_only, + smp_gather, + smp_nested_concat, + ) +else: + IS_SAGEMAKER_MP_POST_1_10 = False + +from functools import partial +from itertools import chain +from typing import Any, Dict, Union + + +def group_texts(examples, block_size): + # Concatenate all texts. + concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict. + # We could add padding if the model supported it instead of this drop, you can customize this part to your needs. + total_length = (total_length // block_size) * block_size + # Split by chunks of max_len. 
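The split performed at this point in `group_texts` is easiest to see on toy data. A sketch of the same arithmetic with illustrative sizes (the real call uses `block_size=args.max_seq_length`, i.e. 8192):

```python
from itertools import chain

# Toy walk-through of the packing arithmetic in group_texts (illustrative sizes).
examples = {"input_ids": [[1, 2, 3], [4, 5], [6, 7, 8, 9]]}  # 9 tokens total
block_size = 4

concatenated = {k: list(chain(*examples[k])) for k in examples}
total_length = (len(concatenated["input_ids"]) // block_size) * block_size  # 8; the tail token is dropped
result = {k: [t[i:i + block_size] for i in range(0, total_length, block_size)]
          for k, t in concatenated.items()}
# result["input_ids"] == [[1, 2, 3, 4], [5, 6, 7, 8]]; labels become a copy of input_ids.
```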
+ result = { + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + if "labels" not in result: + result["labels"] = result["input_ids"].copy() + return result + + +def create_datasets(tokenizer, args): + dataset = load_dataset( + args.dataset_name, + args.dataset_config_name, + use_auth_token=True, + num_proc=args.num_workers, + ) + train_dataset = dataset["train"] + valid_dataset = dataset["validation"] + column_names = train_dataset.features + + def tokenize_function(example, eval=False): + output_texts = [] + mask_labels_sizes = [] + for i in range(len(example["input"])): + if "gov_report" in args.dataset_config_name: + output_texts.append( + f"### Summarize the following text:\n {example['input'][i]}\n ### Summary:\n {example['output'][i]}{tokenizer.eos_token}" + ) + if eval: + mask_labels_sizes.append( + f"### Summarize the following text:\n {example['input'][i]}\n ### Summary:\n" + ) + else: + output_texts.append( + f"### {example['input'][i]}\n ### The answer is:\n {example['output'][i]}{tokenizer.eos_token}" + ) + + input_ids = tokenizer(output_texts).input_ids + + if eval: + labels_ids = tokenizer(mask_labels_sizes).input_ids + masked_labels = [] + for out, lb in zip(input_ids, labels_ids): + ml = out.copy() + ml[: len(lb)] = [-100] * len(lb) + ml[-1] = -100 + masked_labels.append(ml) + return {"input_ids": input_ids, "labels": masked_labels} + else: + return {"input_ids": input_ids} + + train_dataset = train_dataset.map( + tokenize_function, + batched=True, + num_proc=8, + remove_columns=column_names, + ) + valid_dataset = valid_dataset.map( + partial(tokenize_function, eval=True), + batched=True, + num_proc=2, + remove_columns=column_names, + ) + + def filter_function(example): + to_keep = [] + for i in range(len(example["input_ids"])): + if len(example["input_ids"][i]) > args.max_seq_length: + to_keep.append(False) + else: + to_keep.append(True) + return to_keep + + train_dataset = train_dataset.filter( + filter_function, + batched=True, + # with_indices=True, + num_proc=8, + # remove_columns=column_names, + ) + valid_dataset = valid_dataset.filter( + filter_function, + batched=True, + # with_indices=True, + num_proc=2, + # remove_columns=column_names, + ) + print( + f"Before packing, Size of the train set: {len(train_dataset)}. Size of the validation set: {len(valid_dataset)}" + ) + + packing_method = partial(group_texts, block_size=args.max_seq_length) + # Packing + train_dataset = train_dataset.map( + packing_method, + batched=True, + num_proc=8, + ) + valid_dataset = valid_dataset.map( + packing_method, + batched=True, + num_proc=2, + ) + + print( + f"Size of the train set: {len(train_dataset)}. 
Size of the validation set: {len(valid_dataset)}" + ) + + return train_dataset, valid_dataset + + +def create_and_prepare_model(args): + device_map = None + + model = AutoModelForCausalLM.from_pretrained( + args.model_path, + device_map=device_map, + use_cache=not args.use_gradient_checkpointing, + trust_remote_code=True, + use_flash_attention_2=True if args.use_flash_attn else False, + torch_dtype=torch.bfloat16, + max_position_embeddings=8192, + ) + + peft_config = None + if args.use_peft_lora: + peft_config = LoraConfig( + lora_alpha=args.lora_alpha, + lora_dropout=args.lora_dropout, + r=args.lora_r, + bias="none", + task_type="CAUSAL_LM", + target_modules=( + None + if args.lora_target_modules is None + else args.lora_target_modules.split(",") + ), + ) + if args.use_gradient_checkpointing: + model.gradient_checkpointing_enable() + model = get_peft_model(model, peft_config) + model.print_trainable_parameters() + + return model + + +def training_step( + self, model: torch.nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]] +) -> torch.Tensor: + """ + Perform a training step on a batch of inputs. + Subclass and override to inject custom behavior. + Args: + model (`nn.Module`): + The model to train. + inputs (`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument `labels`. Check your model's documentation for all accepted arguments. + Return: + `torch.Tensor`: The tensor with training loss on this batch. + """ + model.train() + inputs = self._prepare_inputs(inputs) + if is_sagemaker_mp_enabled(): + loss_mb = smp_forward_backward( + model, inputs, self.args.gradient_accumulation_steps + ) + return loss_mb.reduce_mean().detach().to(self.args.device) + with self.compute_loss_context_manager(): + loss = self.compute_loss(model, inputs) + if self.args.n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu parallel training + if self.use_apex: + with amp.scale_loss(loss, self.optimizer) as scaled_loss: + scaled_loss.backward() + else: + self.accelerator.backward(loss) + return loss.detach() / self.args.gradient_accumulation_steps + + +def peft_module_casting_to_bf16(model, args): + for name, module in model.named_modules(): + if isinstance(module, LoraLayer): + if args.bf16: + module = module.to(torch.bfloat16) + if "norm" in name: + module = module.to(torch.float32) + if any(x in name for x in ["lm_head", "embed_tokens", "wte", "wpe"]): + if hasattr(module, "weight"): + if args.bf16 and module.weight.dtype == torch.float32: + module = module.to(torch.bfloat16) From 7491573ea692c77c80ddade0aa33852293f3f7d7 Mon Sep 17 00:00:00 2001 From: Michal Futrega Date: Fri, 23 Feb 2024 14:29:57 +0100 Subject: [PATCH 10/16] Fix eval batch size, add Dockerfile, improve logging, remove unused code --- llm_finetuning/Dockerfile | 2 -- llm_finetuning/requirements.txt | 11 ++++++----- llm_finetuning/scripts/train.py | 28 ++++++++++------------------ llm_finetuning/scripts/utils.py | 2 +- 4 files changed, 17 insertions(+), 26 deletions(-) diff --git a/llm_finetuning/Dockerfile b/llm_finetuning/Dockerfile index e56eb8960..c14813613 100644 --- a/llm_finetuning/Dockerfile +++ b/llm_finetuning/Dockerfile @@ -4,7 +4,5 @@ FROM ${FROM_IMAGE_NAME} WORKDIR /workspace/ft-llm ADD . 
/workspace/ft-llm -ENV OMP_NUM_THREADS=8 - RUN pip install -r requirements.txt RUN pip install flash-attn==2.4.1 --no-build-isolation diff --git a/llm_finetuning/requirements.txt b/llm_finetuning/requirements.txt index 474a3e297..74e6e9db4 100644 --- a/llm_finetuning/requirements.txt +++ b/llm_finetuning/requirements.txt @@ -1,5 +1,6 @@ -transformers -accelerate -peft -datasets -deepspeed \ No newline at end of file +git+https://github.com/mlcommons/logging.git +transformers==4.38.1 +accelerate==0.27.2 +peft==0.8.2 +datasets==2.17.1 +deepspeed==0.13.2 \ No newline at end of file diff --git a/llm_finetuning/scripts/train.py b/llm_finetuning/scripts/train.py index 34e1d0849..94a7007c5 100644 --- a/llm_finetuning/scripts/train.py +++ b/llm_finetuning/scripts/train.py @@ -32,29 +32,22 @@ class ScriptArguments: local_rank: Optional[int] = field( default=-1, metadata={"help": "Used for multi-gpu"} ) - - per_device_train_batch_size: Optional[int] = field(default=4) + per_device_train_batch_size: Optional[int] = field(default=1) per_device_eval_batch_size: Optional[int] = field(default=1) - gradient_accumulation_steps: Optional[int] = field(default=4) + gradient_accumulation_steps: Optional[int] = field(default=1) learning_rate: Optional[float] = field(default=2e-4) - max_grad_norm: Optional[float] = field(default=0.3) + max_grad_norm: Optional[float] = field(default=0.0) weight_decay: Optional[float] = field(default=0.001) - lora_alpha: Optional[int] = field(default=16) + lora_alpha: Optional[int] = field(default=32) lora_dropout: Optional[float] = field(default=0.1) - lora_r: Optional[int] = field(default=64) + lora_r: Optional[int] = field(default=16) lora_target_modules: Optional[str] = field( default=None, metadata={ "help": "comma separated list of target modules to apply LoRA layers to" }, ) - max_seq_length: Optional[int] = field(default=512) - model_name: Optional[str] = field( - default=None, - metadata={ - "help": "The model that you want to train from the Hugging Face hub. E.g. gpt2, gpt2-xl, bert, etc." - }, - ) + max_seq_length: Optional[int] = field(default=8192) model_path: Optional[str] = field( default="./llama-v2-fused-qkv", metadata={"help": "Path to the model directory."}, @@ -84,11 +77,11 @@ class ScriptArguments: metadata={"help": "Enables gradient checkpointing."}, ) optim: Optional[str] = field( - default="paged_adamw_32bit", + default="adamw_torch", metadata={"help": "The optimizer to use."}, ) lr_scheduler_type: str = field( - default="constant", + default="cosine", metadata={ "help": "Learning rate schedule. 
Constant a bit better than cosine, and has advantage for analysis" }, @@ -113,11 +106,11 @@ class ScriptArguments: default="results", metadata={"help": "Where to store the final model."} ) use_flash_attn: Optional[bool] = field( - default=False, + default=True, metadata={"help": "Enables Flash attention for training."}, ) use_peft_lora: Optional[bool] = field( - default=False, + default=True, metadata={"help": "Enables PEFT LoRA for training."}, ) use_gradient_checkpointing: Optional[bool] = field( @@ -140,7 +133,6 @@ class ScriptArguments: "help": "If True, tests things like proper saving/loading/logging of model" }, ) - dataset_config_name: Optional[str] = field(default="gov_report") hub_model_id: Optional[str] = field(default=None) seed: Optional[int] = field(default=42) diff --git a/llm_finetuning/scripts/utils.py b/llm_finetuning/scripts/utils.py index 0dc7f2c30..a4c376cdb 100644 --- a/llm_finetuning/scripts/utils.py +++ b/llm_finetuning/scripts/utils.py @@ -156,7 +156,7 @@ def create_and_prepare_model(args): device_map=device_map, use_cache=not args.use_gradient_checkpointing, trust_remote_code=True, - use_flash_attention_2=True if args.use_flash_attn else False, + attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16, max_position_embeddings=8192, ) From 4074852b5f57321c925bdc7dd8d534c1aac65962 Mon Sep 17 00:00:00 2001 From: Michal Futrega Date: Sun, 25 Feb 2024 11:19:14 +0100 Subject: [PATCH 11/16] Remove training_step --- llm_finetuning/scripts/train.py | 13 +++---- llm_finetuning/scripts/utils.py | 64 +++------------------------------ 2 files changed, 8 insertions(+), 69 deletions(-) diff --git a/llm_finetuning/scripts/train.py b/llm_finetuning/scripts/train.py index 94a7007c5..a1b6fcdd4 100644 --- a/llm_finetuning/scripts/train.py +++ b/llm_finetuning/scripts/train.py @@ -13,14 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
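This patch drops the `training_step` override that earlier revisions grafted onto the trainer instance with `functools.partial(training_step, trainer)`; by this revision the override no longer differed meaningfully from `Trainer`'s stock step, so the default is used. The grafting trick itself is generic Python — a function whose first parameter plays the role of `self` can be bound to a single instance. A sketch with illustrative names:

```python
import functools


class Greeter:
    def greet(self):
        return "hello"


def loud_greet(self):
    # Plays the role of a method: `self` is supplied by the partial below.
    # Call the class's method directly to avoid recursing into the override.
    return Greeter.greet(self).upper() + "!"


g = Greeter()
g.greet = functools.partial(loud_greet, g)  # per-instance override, as the old code did with Trainer
print(g.greet())  # -> HELLO!
```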
-import functools from dataclasses import dataclass, field from typing import Optional from datasets import load_dataset from mlperf_logging_utils import LoraLogger, MLPerfCallback from transformers import HfArgumentParser, Trainer, TrainingArguments -from utils import create_and_prepare_model, peft_module_casting_to_bf16, training_step +from utils import create_and_prepare_model, peft_module_casting_to_bf16 @dataclass @@ -95,9 +94,9 @@ class ScriptArguments: save_steps: int = field( default=10, metadata={"help": "Save checkpoint every X updates steps."} ) - eval_steps: int = field(default=24, metadata={"help": "Eval model every X steps."}) + eval_steps: int = field(default=22, metadata={"help": "Eval model every X steps."}) logging_steps: int = field( - default=6, metadata={"help": "Log every X updates steps."} + default=10, metadata={"help": "Log every X updates steps."} ) target_eval_loss: float = field( default=0.92, metadata={"help": "target eval loss - NOT FINAL."} @@ -114,12 +113,9 @@ class ScriptArguments: metadata={"help": "Enables PEFT LoRA for training."}, ) use_gradient_checkpointing: Optional[bool] = field( - default=False, + default=True, metadata={"help": "Enables Gradient Checkpointing."}, ) - dataset_text_field: str = field( - default="text", metadata={"help": "Dataset field to use as input text."} - ) push_to_hub: Optional[bool] = field( default=False, metadata={"help": "If True, pushes the model to the HF Hub"}, @@ -190,7 +186,6 @@ def main(args): eval_dataset=eval_dataset, callbacks=[MLPerfCallback(loralogger, len(train_dataset), len(eval_dataset))], ) - trainer.training_step = functools.partial(training_step, trainer) trainer.accelerator.print(f"{trainer.model}") if args.use_peft_lora: trainer.model.print_trainable_parameters() diff --git a/llm_finetuning/scripts/utils.py b/llm_finetuning/scripts/utils.py index a4c376cdb..84821ffb4 100644 --- a/llm_finetuning/scripts/utils.py +++ b/llm_finetuning/scripts/utils.py @@ -1,32 +1,11 @@ +from functools import partial +from itertools import chain + import torch from datasets import load_dataset -from deepspeed.accelerator import get_accelerator from peft import LoraConfig, get_peft_model from peft.tuners.lora import LoraLayer -from transformers import AutoModelForCausalLM, AutoTokenizer -from transformers.utils import is_apex_available, is_sagemaker_mp_enabled - -if is_apex_available(): - from apex import amp - -if is_sagemaker_mp_enabled(): - import smdistributed.modelparallel.torch as smp - from smdistributed.modelparallel import __version__ as SMP_VERSION - - IS_SAGEMAKER_MP_POST_1_10 = version.parse(SMP_VERSION) >= version.parse("1.10") - - from .trainer_pt_utils import ( - smp_forward_backward, - smp_forward_only, - smp_gather, - smp_nested_concat, - ) -else: - IS_SAGEMAKER_MP_POST_1_10 = False - -from functools import partial -from itertools import chain -from typing import Any, Dict, Union +from transformers import AutoModelForCausalLM def group_texts(examples, block_size): @@ -183,41 +162,6 @@ def create_and_prepare_model(args): return model -def training_step( - self, model: torch.nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]] -) -> torch.Tensor: - """ - Perform a training step on a batch of inputs. - Subclass and override to inject custom behavior. - Args: - model (`nn.Module`): - The model to train. - inputs (`Dict[str, Union[torch.Tensor, Any]]`): - The inputs and targets of the model. - The dictionary will be unpacked before being fed to the model. 
Most models expect the targets under the - argument `labels`. Check your model's documentation for all accepted arguments. - Return: - `torch.Tensor`: The tensor with training loss on this batch. - """ - model.train() - inputs = self._prepare_inputs(inputs) - if is_sagemaker_mp_enabled(): - loss_mb = smp_forward_backward( - model, inputs, self.args.gradient_accumulation_steps - ) - return loss_mb.reduce_mean().detach().to(self.args.device) - with self.compute_loss_context_manager(): - loss = self.compute_loss(model, inputs) - if self.args.n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel training - if self.use_apex: - with amp.scale_loss(loss, self.optimizer) as scaled_loss: - scaled_loss.backward() - else: - self.accelerator.backward(loss) - return loss.detach() / self.args.gradient_accumulation_steps - - def peft_module_casting_to_bf16(model, args): for name, module in model.named_modules(): if isinstance(module, LoraLayer): From ac0eb0d0644f3aa42f103f4657f52d80ffc01c63 Mon Sep 17 00:00:00 2001 From: itayhubara Date: Mon, 26 Feb 2024 13:54:03 +0200 Subject: [PATCH 12/16] renaming directory and adding more HP values to logger --- {llm_finetuning => llama2_70b_lora}/Dockerfile | 0 {llm_finetuning => llama2_70b_lora}/README.md | 0 .../configs/default_config.yaml | 0 {llm_finetuning => llama2_70b_lora}/convergence_example.txt | 0 {llm_finetuning => llama2_70b_lora}/requirements.txt | 0 {llm_finetuning => llama2_70b_lora}/run_docker.sh | 0 .../run_llama_70B_scrolls_r16.sh | 0 .../scripts/mlperf_logging_utils.py | 4 +++- {llm_finetuning => llama2_70b_lora}/scripts/train.py | 4 ++-- {llm_finetuning => llama2_70b_lora}/scripts/utils.py | 0 10 files changed, 5 insertions(+), 3 deletions(-) rename {llm_finetuning => llama2_70b_lora}/Dockerfile (100%) rename {llm_finetuning => llama2_70b_lora}/README.md (100%) rename {llm_finetuning => llama2_70b_lora}/configs/default_config.yaml (100%) rename {llm_finetuning => llama2_70b_lora}/convergence_example.txt (100%) rename {llm_finetuning => llama2_70b_lora}/requirements.txt (100%) rename {llm_finetuning => llama2_70b_lora}/run_docker.sh (100%) rename {llm_finetuning => llama2_70b_lora}/run_llama_70B_scrolls_r16.sh (100%) rename {llm_finetuning => llama2_70b_lora}/scripts/mlperf_logging_utils.py (93%) rename {llm_finetuning => llama2_70b_lora}/scripts/train.py (97%) rename {llm_finetuning => llama2_70b_lora}/scripts/utils.py (100%) diff --git a/llm_finetuning/Dockerfile b/llama2_70b_lora/Dockerfile similarity index 100% rename from llm_finetuning/Dockerfile rename to llama2_70b_lora/Dockerfile diff --git a/llm_finetuning/README.md b/llama2_70b_lora/README.md similarity index 100% rename from llm_finetuning/README.md rename to llama2_70b_lora/README.md diff --git a/llm_finetuning/configs/default_config.yaml b/llama2_70b_lora/configs/default_config.yaml similarity index 100% rename from llm_finetuning/configs/default_config.yaml rename to llama2_70b_lora/configs/default_config.yaml diff --git a/llm_finetuning/convergence_example.txt b/llama2_70b_lora/convergence_example.txt similarity index 100% rename from llm_finetuning/convergence_example.txt rename to llama2_70b_lora/convergence_example.txt diff --git a/llm_finetuning/requirements.txt b/llama2_70b_lora/requirements.txt similarity index 100% rename from llm_finetuning/requirements.txt rename to llama2_70b_lora/requirements.txt diff --git a/llm_finetuning/run_docker.sh b/llama2_70b_lora/run_docker.sh similarity index 100% rename from llm_finetuning/run_docker.sh rename to 
llama2_70b_lora/run_docker.sh diff --git a/llm_finetuning/run_llama_70B_scrolls_r16.sh b/llama2_70b_lora/run_llama_70B_scrolls_r16.sh similarity index 100% rename from llm_finetuning/run_llama_70B_scrolls_r16.sh rename to llama2_70b_lora/run_llama_70B_scrolls_r16.sh diff --git a/llm_finetuning/scripts/mlperf_logging_utils.py b/llama2_70b_lora/scripts/mlperf_logging_utils.py similarity index 93% rename from llm_finetuning/scripts/mlperf_logging_utils.py rename to llama2_70b_lora/scripts/mlperf_logging_utils.py index f8b28035c..262d9c22a 100644 --- a/llm_finetuning/scripts/mlperf_logging_utils.py +++ b/llama2_70b_lora/scripts/mlperf_logging_utils.py @@ -77,7 +77,7 @@ def __init__(self, logger, train_dataset_length, eval_dataset_length): super().__init__() self.mllogger = logger self.submission_info = { - "submission_benchmark": "llm-finetuning", + "submission_benchmark": "llama2_70b_lora", "submission_division": "Closed", "submission_org": "referece", "submission_platform": "referece", @@ -134,6 +134,8 @@ def on_train_begin(self, args, state, control, **kwargs): self.mllogger.event(key=constants.OPT_LR_WARMUP_FACTOR, value=args.warmup_ratio) self.mllogger.event(key=constants.OPT_LR_TRAINING_STEPS, value=args.max_steps) self.mllogger.event(key=constants.OPT_BASE_LR, value=args.learning_rate) + self.mllogger.event(key=constants.LORA_ALPHA, value=args.lora_alpha) + self.mllogger.event(key=constants.GRADIENT_ACCUMULATION_STEPS, value=args.gradient_accumulation_steps) self.mllogger.start(constants.RUN_START, value="") def on_step_begin( diff --git a/llm_finetuning/scripts/train.py b/llama2_70b_lora/scripts/train.py similarity index 97% rename from llm_finetuning/scripts/train.py rename to llama2_70b_lora/scripts/train.py index a1b6fcdd4..cef0e65f4 100644 --- a/llm_finetuning/scripts/train.py +++ b/llama2_70b_lora/scripts/train.py @@ -38,8 +38,8 @@ class ScriptArguments: max_grad_norm: Optional[float] = field(default=0.0) weight_decay: Optional[float] = field(default=0.001) lora_alpha: Optional[int] = field(default=32) - lora_dropout: Optional[float] = field(default=0.1) - lora_r: Optional[int] = field(default=16) + lora_dropout: Optional[float] = field(default=0.1, metadata={"lora dropout is a fixed to 0.1 in closed submission"}) + lora_r: Optional[int] = field(default=16, metadata={"lora rank is a fixed to 16 in closed submission"}) lora_target_modules: Optional[str] = field( default=None, metadata={ diff --git a/llm_finetuning/scripts/utils.py b/llama2_70b_lora/scripts/utils.py similarity index 100% rename from llm_finetuning/scripts/utils.py rename to llama2_70b_lora/scripts/utils.py From aa8415df2dea9b5cf486a89d1bfc6727e4ee146b Mon Sep 17 00:00:00 2001 From: itayhubara Date: Wed, 28 Feb 2024 13:02:28 +0200 Subject: [PATCH 13/16] adding weight decay to TrainingArguments and BLOCK_START BLOCK_STOP --- llama2_70b_lora/scripts/mlperf_logging_utils.py | 10 ++++++++++ llama2_70b_lora/scripts/train.py | 1 + 2 files changed, 11 insertions(+) diff --git a/llama2_70b_lora/scripts/mlperf_logging_utils.py b/llama2_70b_lora/scripts/mlperf_logging_utils.py index 262d9c22a..c11c0ce2c 100644 --- a/llama2_70b_lora/scripts/mlperf_logging_utils.py +++ b/llama2_70b_lora/scripts/mlperf_logging_utils.py @@ -162,11 +162,21 @@ def on_step_begin( control.should_log = True if state.global_step % (state.eval_steps) == 0 and state.global_step > 0: + self.mllogger.end( + constants.BLOCK_STOP, + value="", + metadata={"step_num": state.log_history[-1]["step"]}, + ) self.mllogger.event( "eval_loss", 
From aa8415df2dea9b5cf486a89d1bfc6727e4ee146b Mon Sep 17 00:00:00 2001
From: itayhubara
Date: Wed, 28 Feb 2024 13:02:28 +0200
Subject: [PATCH 13/16] adding weight decay to TrainingArguments and BLOCK_START BLOCK_STOP

---
 llama2_70b_lora/scripts/mlperf_logging_utils.py | 10 ++++++++++
 llama2_70b_lora/scripts/train.py                |  1 +
 2 files changed, 11 insertions(+)

diff --git a/llama2_70b_lora/scripts/mlperf_logging_utils.py b/llama2_70b_lora/scripts/mlperf_logging_utils.py
index 262d9c22a..c11c0ce2c 100644
--- a/llama2_70b_lora/scripts/mlperf_logging_utils.py
+++ b/llama2_70b_lora/scripts/mlperf_logging_utils.py
@@ -162,11 +162,21 @@ def on_step_begin(
             control.should_log = True
         if state.global_step % (state.eval_steps) == 0 and state.global_step > 0:
+            self.mllogger.end(
+                constants.BLOCK_STOP,
+                value="",
+                metadata={"step_num": state.log_history[-1]["step"]},
+            )
             self.mllogger.event(
                 "eval_loss",
                 value=state.log_history[-1]["eval_loss"],
                 metadata={"step_num": state.log_history[-1]["step"]},
             )
+            self.mllogger.start(
+                constants.BLOCK_START,
+                value="",
+                metadata={"step_num": state.log_history[-1]["step"]},
+            )
             control.should_log = True
         eval_loss_list = [
             sl["eval_loss"] for sl in state.log_history if "eval_loss" in sl
diff --git a/llama2_70b_lora/scripts/train.py b/llama2_70b_lora/scripts/train.py
index cef0e65f4..01aa346c1 100644
--- a/llama2_70b_lora/scripts/train.py
+++ b/llama2_70b_lora/scripts/train.py
@@ -146,6 +146,7 @@ def main(args):
         fp16=args.fp16,
         bf16=args.bf16,
         max_grad_norm=args.max_grad_norm,
+        weight_decay=args.weight_decay,
         warmup_ratio=args.warmup_ratio,
         lr_scheduler_type=args.lr_scheduler_type,
         num_train_epochs=args.num_train_epochs,
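The `BLOCK_STOP`/`BLOCK_START` events added here follow the MLPerf logging convention of bracketing each stretch of training between evaluations, so the compliance checker can pair every block with the evaluation that closes it. A simplified sketch of that bracketing discipline, using a stand-in logger rather than the real `mlperf_logging` API:

```python
# StubLogger stands in for mlperf_logging's mllogger and constants module;
# this is an illustration of the pairing discipline, not benchmark code.
# The invariant: blocks alternate strictly start/stop, and eval metrics are
# emitted between a stop and the next start.
class StubLogger:
    def start(self, key, metadata=None):
        print("START", key, metadata or {})

    def end(self, key, metadata=None):
        print("STOP ", key, metadata or {})

    def event(self, key, value=None, metadata=None):
        print("EVENT", key, value, metadata or {})


mllogger = StubLogger()
eval_steps, max_steps = 48, 144

mllogger.start("block_start", metadata={"step_num": 0})
for step in range(1, max_steps + 1):
    # ... one optimizer step would run here ...
    if step % eval_steps == 0:
        mllogger.end("block_stop", metadata={"step_num": step})  # close finished block
        mllogger.event("eval_loss", value=1.0 / step, metadata={"step_num": step})
        if step < max_steps:
            mllogger.start("block_start", metadata={"step_num": step})  # open next block
```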
From a8efc514945de1ccae46321846ccb21c0630da57 Mon Sep 17 00:00:00 2001
From: itayhubara
Date: Thu, 29 Feb 2024 12:54:57 +0200
Subject: [PATCH 14/16] editing logging to resolve all checker issues

---
 llama2_70b_lora/run_llama_70B_scrolls_r16.sh |  9 +++----
 .../scripts/mlperf_logging_utils.py          | 25 +++++++++++++------
 llama2_70b_lora/scripts/train.py             |  6 ++---
 3 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/llama2_70b_lora/run_llama_70B_scrolls_r16.sh b/llama2_70b_lora/run_llama_70B_scrolls_r16.sh
index 759b9aeb4..2b5377b91 100644
--- a/llama2_70b_lora/run_llama_70B_scrolls_r16.sh
+++ b/llama2_70b_lora/run_llama_70B_scrolls_r16.sh
@@ -1,16 +1,13 @@
 accelerate launch --config_file configs/default_config.yaml scripts/train.py \
---model_name meta-llama/Llama-2-70b-hf \
 --dataset_path "./dataset" \
---model_path "./llama-v2-fused-qkv" \
+--model_path "/software/users/ihubara/lora_clean/llama-v2-fused-qkv" \
 --max_seq_len 8192 \
 --bf16 True \
---logging_steps 2 \
---eval_steps 6 \
---save_steps 999 \
+--logging_steps 32 \
+--eval_steps 64 \
 --output_dir "./results/llama-70b_scrolls_gov_report_r16_$1" \
 --per_device_train_batch_size 1 \
 --gradient_accumulation_steps 1 \
---dataset_text_field "input" \
 --lr_scheduler_type "cosine" \
 --learning_rate 5e-4 \
 --warmup_ratio 0 \
diff --git a/llama2_70b_lora/scripts/mlperf_logging_utils.py b/llama2_70b_lora/scripts/mlperf_logging_utils.py
index c11c0ce2c..cda18df41 100644
--- a/llama2_70b_lora/scripts/mlperf_logging_utils.py
+++ b/llama2_70b_lora/scripts/mlperf_logging_utils.py
@@ -73,22 +73,27 @@ def end(self, key, value=None, metadata=None, sync=False, log_rank=None):
 class MLPerfCallback(TrainerCallback):
     "A callback that prints a message at the beginning of training"

-    def __init__(self, logger, train_dataset_length, eval_dataset_length):
+    def __init__(self, logger, train_dataset_length, eval_dataset_length,lora_alpha):
         super().__init__()
         self.mllogger = logger
         self.submission_info = {
             "submission_benchmark": "llama2_70b_lora",
-            "submission_division": "Closed",
+            "submission_division": "closed",
             "submission_org": "referece",
             "submission_platform": "referece",
             "submission_poc_name": "referece",
             "submission_poc_email": "referece",
-            "submission_status": "referece",
+            "submission_status": "onprem",
             "train_dataset_length": train_dataset_length,
             "eval_dataset_length": eval_dataset_length,
+            "lora_alpha": lora_alpha
         }

     def on_train_begin(self, args, state, control, **kwargs):
+        self.gbs=args.per_device_train_batch_size * args.gradient_accumulation_steps * os.getenv("WORLD_SIZE", 1)
+        self.mllogger.event(
+            key=constants.CACHE_CLEAR, value="True",
+        )
         self.mllogger.event(
             key=constants.SUBMISSION_BENCHMARK,
             value=self.submission_info["submission_benchmark"],
@@ -133,6 +138,15 @@ def on_train_begin(self, args, state, control, **kwargs):
         self.mllogger.event(key=constants.SEED, value=args.seed)
         self.mllogger.event(key=constants.OPT_LR_WARMUP_FACTOR, value=args.warmup_ratio)
         self.mllogger.event(key=constants.OPT_LR_TRAINING_STEPS, value=args.max_steps)
+        self.mllogger.event(key=constants.OPT_ADAMW_WEIGHT_DECAY, value=args.weight_decay)
+        self.mllogger.event(key=constants.OPT_GRADIENT_CLIP_NORM, value=args.max_grad_norm)
         self.mllogger.event(key=constants.OPT_BASE_LR, value=args.learning_rate)
-        self.mllogger.event(key=constants.LORA_ALPHA, value=args.lora_alpha)
+        self.mllogger.event(key=constants.LORA_ALPHA, value=self.submission_info["lora_alpha"])
+        self.mllogger.event(key='lora_rank', value=16)
         self.mllogger.event(key=constants.GRADIENT_ACCUMULATION_STEPS, value=args.gradient_accumulation_steps)
+        self.mllogger.start(key=constants.INIT_START, value="")
+        # device warmup should be done here
+        self.mllogger.end(key=constants.INIT_STOP, value="")
         self.mllogger.start(constants.RUN_START, value="")

     def on_step_begin(
@@ -168,9 +179,9 @@ def on_step_begin(
                 metadata={"step_num": state.log_history[-1]["step"]},
             )
             self.mllogger.event(
-                "eval_loss",
+                constants.EVAL_ACCURACY,
                 value=state.log_history[-1]["eval_loss"],
-                metadata={"step_num": state.log_history[-1]["step"]},
+                metadata={"samples_num": state.log_history[-1]["step"]*self.gbs},
             )
             self.mllogger.start(
                 constants.BLOCK_START,
                 value="",
                 metadata={"step_num": state.log_history[-1]["step"]},
             )
@@ -187,7 +198,7 @@ def on_step_begin(
                 constants.RUN_STOP,
                 value=eval_loss_list[-1],
                 metadata={
-                    "step_num": state.log_history[-1]["step"],
+                    "samples_num": state.log_history[-1]["step"]*self.gbs,
                     "status": "success",
                 },
             )
diff --git a/llama2_70b_lora/scripts/train.py b/llama2_70b_lora/scripts/train.py
index 01aa346c1..afe09912e 100644
--- a/llama2_70b_lora/scripts/train.py
+++ b/llama2_70b_lora/scripts/train.py
@@ -38,8 +38,8 @@ class ScriptArguments:
     max_grad_norm: Optional[float] = field(default=0.0)
     weight_decay: Optional[float] = field(default=0.001)
     lora_alpha: Optional[int] = field(default=32)
-    lora_dropout: Optional[float] = field(default=0.1, metadata={"lora dropout is a fixed to 0.1 in closed submission"})
-    lora_r: Optional[int] = field(default=16, metadata={"lora rank is a fixed to 16 in closed submission"})
+    lora_dropout: Optional[float] = field(default=0.1, metadata={"help": "lora dropout is a fixed to 0.1 in closed submission"})
+    lora_r: Optional[int] = field(default=16, metadata={"help": "lora rank is a fixed to 16 in closed submission"})
     lora_target_modules: Optional[str] = field(
         default=None,
         metadata={
@@ -185,7 +185,7 @@ def main(args):
         args=training_arguments,
         train_dataset=train_dataset,
         eval_dataset=eval_dataset,
-        callbacks=[MLPerfCallback(loralogger, len(train_dataset), len(eval_dataset))],
+        callbacks=[MLPerfCallback(loralogger, len(train_dataset), len(eval_dataset),args.lora_alpha)],
     )
     trainer.accelerator.print(f"{trainer.model}")
     if args.use_peft_lora:
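A caveat on the new `self.gbs` line: `os.getenv("WORLD_SIZE", 1)` returns a string whenever the variable is set (only the fallback default is an int), and multiplying an int by a str in Python repeats the string rather than doing arithmetic. PATCH 15 below wraps the whole product in `int(...)`, which yields the right value here because `per_device_train_batch_size * gradient_accumulation_steps` is 1 in this benchmark; casting the environment variable itself is the form that stays correct for any batch configuration. A sketch of the failure mode, assuming the usual `torchrun`/`accelerate` convention of exporting `WORLD_SIZE`:

```python
# Sketch of the pitfall only, not benchmark code.
import os

os.environ["WORLD_SIZE"] = "8"      # what a multi-process launcher exports
per_device_bs, grad_accum = 2, 1    # hypothetical values where the product > 1

broken = per_device_bs * grad_accum * os.getenv("WORLD_SIZE", 1)
print(repr(broken))                 # '88' -- string repetition, not 16

robust = per_device_bs * grad_accum * int(os.getenv("WORLD_SIZE", "1"))
print(robust)                       # 16
```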
From 552c046cc7cf30eefe19a9978e29579c0d120c76 Mon Sep 17 00:00:00 2001
From: itayhubara
Date: Thu, 29 Feb 2024 16:48:49 +0200
Subject: [PATCH 15/16] fix issue in steps_num logging

---
 llama2_70b_lora/scripts/mlperf_logging_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama2_70b_lora/scripts/mlperf_logging_utils.py b/llama2_70b_lora/scripts/mlperf_logging_utils.py
index cda18df41..8be697ac6 100644
--- a/llama2_70b_lora/scripts/mlperf_logging_utils.py
+++ b/llama2_70b_lora/scripts/mlperf_logging_utils.py
@@ -90,7 +90,7 @@ def __init__(self, logger, train_dataset_length, eval_dataset_length,lora_alpha)
         }

     def on_train_begin(self, args, state, control, **kwargs):
-        self.gbs=args.per_device_train_batch_size * args.gradient_accumulation_steps * os.getenv("WORLD_SIZE", 1)
+        self.gbs=int(args.per_device_train_batch_size * args.gradient_accumulation_steps * os.getenv("WORLD_SIZE", 1))
         self.mllogger.event(
             key=constants.CACHE_CLEAR, value="True",
         )

From 5970ae927fc0e831db1c7cec127ec5763996daa5 Mon Sep 17 00:00:00 2001
From: itayhubara
Date: Mon, 11 Mar 2024 20:11:52 +0200
Subject: [PATCH 16/16] updating bash script for GBS=8

---
 llama2_70b_lora/run_llama_70B_scrolls_r16.sh | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/llama2_70b_lora/run_llama_70B_scrolls_r16.sh b/llama2_70b_lora/run_llama_70B_scrolls_r16.sh
index 2b5377b91..b61f63cae 100644
--- a/llama2_70b_lora/run_llama_70B_scrolls_r16.sh
+++ b/llama2_70b_lora/run_llama_70B_scrolls_r16.sh
@@ -3,21 +3,23 @@ accelerate launch --config_file configs/default_config.yaml scripts/train.py \
 --model_path "/software/users/ihubara/lora_clean/llama-v2-fused-qkv" \
 --max_seq_len 8192 \
 --bf16 True \
---logging_steps 32 \
---eval_steps 64 \
+--logging_steps 24 \
+--eval_steps 48 \
 --output_dir "./results/llama-70b_scrolls_gov_report_r16_$1" \
 --per_device_train_batch_size 1 \
 --gradient_accumulation_steps 1 \
 --lr_scheduler_type "cosine" \
---learning_rate 5e-4 \
+--learning_rate 4e-4 \
+--weight_decay 0.0001 \
 --warmup_ratio 0 \
+--max_grad_norm 0.3 \
 --use_gradient_checkpointing True \
 --target_eval_loss 0.925 \
 --use_peft_lora True \
 --lora_r 16 \
---lora_alpha 16 \
+--lora_alpha 32 \
 --lora_dropout 0.1 \
---max_steps 800 \
+--max_steps 1024 \
 --use_flash_attn \
 --seed "$1" \
 --lora_target_modules "qkv_proj,o_proj"
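Since PATCH 14 keys `EVAL_ACCURACY` and `RUN_STOP` by `samples_num = step * gbs`, the retuned schedule above is easiest to sanity-check in samples. A back-of-envelope sketch, assuming the single 8-process node from `configs/default_config.yaml` (so GBS = 1 * 1 * 8 = 8):

```python
# Hypothetical sanity check mirroring run_llama_70B_scrolls_r16.sh;
# world_size = 8 is an assumption taken from configs/default_config.yaml.
per_device_bs, grad_accum, world_size = 1, 1, 8
gbs = per_device_bs * grad_accum * world_size  # 8

eval_steps, max_steps = 48, 1024
print(f"eval every {eval_steps * gbs} samples")        # eval every 384 samples
print(f"budget of {max_steps * gbs} samples per run")  # budget of 8192 samples
print(f"up to {max_steps // eval_steps} eval points")  # up to 21 eval points
```

Under these settings a run must reach the 0.925 target eval loss within the 8,192-sample budget, with convergence checked every 384 samples.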