
Commit

Merge remote-tracking branch 'origin/main' into utility_to_post-process_LoRA

Signed-off-by: Will Johnson <[email protected]>
willmj committed Sep 18, 2024
2 parents af191d1 + 229e230 commit fb1dcc9
Showing 9 changed files with 400 additions and 115 deletions.
35 changes: 35 additions & 0 deletions .github/workflows/labelpr.yaml
@@ -0,0 +1,35 @@
name: Label PRs

on:
  pull_request_target:
    types: [opened, edited, synchronize, reopened]

jobs:
  label_pr:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/github-script@v3
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            const pr_welcome_msg = `Thanks for making a pull request! 😃\nOne of the maintainers will review and advise on the next steps.`;
            // https://github.com/commitizen/conventional-commit-types
            const valid_pr_types = ['feat', 'fix', 'docs', 'style', 'refactor', 'perf', 'test', 'build', 'ci', 'chore', 'revert'];
            if(context.payload.pull_request.comments === 0) {
              await github.issues.createComment({ ...context.repo, issue_number: context.payload.number, body: pr_welcome_msg});
            }
            const title = context.payload.pull_request.title;
            const results = /^(\w+)(\(\w+\))?!?:/.exec(title);
            if (results === null) return core.setFailed(`The title does not follow conventional commits spec: https://www.conventionalcommits.org/en/v1.0.0/#summary Title: ${title}`);
            const pr_type = results[1];
            core.info(`pr_type: ${pr_type}`);
            if (!valid_pr_types.includes(pr_type)) return core.setFailed(`Unknown pull request type: ${pr_type}`);
            const labels = context.payload.pull_request.labels;
            const new_labels = labels.filter(label => !valid_pr_types.includes(label.name)); // keep all labels that are not in valid_pr_types
            new_labels.push({name: pr_type});
            await github.issues.update({ ...context.repo, issue_number: context.payload.number, labels: new_labels });
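
For reference, a minimal bash sketch of the same title check (the workflow runs this logic in JavaScript via `actions/github-script`; the example titles below are hypothetical):

```bash
# POSIX ERE equivalent of the workflow's regex /^(\w+)(\(\w+\))?!?:/
re='^([[:alnum:]_]+)(\([[:alnum:]_]+\))?!?:'
for title in "feat(tuning): add GPTQ-LoRA docs" "fix!: handle empty labels" "update readme"; do
  if [[ $title =~ $re ]]; then
    echo "pr_type: ${BASH_REMATCH[1]}"   # e.g. "feat", "fix"
  else
    echo "rejected: $title"              # no conventional-commit prefix
  fi
done
```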
2 changes: 1 addition & 1 deletion .pylintrc
@@ -333,7 +333,7 @@ indent-string='    '
max-line-length=100

# Maximum number of lines in a module.
-max-module-lines=1100
+max-module-lines=1200

# Allow the body of a class to be on the same line as the declaration if body
# contains single statement.
72 changes: 71 additions & 1 deletion README.md
@@ -9,6 +9,7 @@
- [Tips on Parameters to Set](#tips-on-parameters-to-set)
- [Tuning Techniques](#tuning-techniques)
- [LoRA Tuning Example](#lora-tuning-example)
- [GPTQ-LoRA with AutoGPTQ Tuning Example](#gptq-lora-with-autogptq-tuning-example)
- [Prompt Tuning](#prompt-tuning)
- [Fine Tuning](#fine-tuning)
- [FMS Acceleration](#fms-acceleration)
@@ -277,6 +278,11 @@ You can set `output_dir` to a local directory and set `save_model_dir` to COS to

In order to achieve the fastest training time, set `save_strategy="no"`; saving no checkpoints other than the final model removes intermediate write operations altogether.

#### Resuming tuning from checkpoints
If the output directory already contains checkpoints, tuning automatically resumes from the latest checkpoint in the directory specified by the `output_dir` flag. To start tuning from scratch and ignore existing checkpoints, set the `resume_from_checkpoint` flag to `False`.

You can also use the `resume_from_checkpoint` flag to resume tuning from a specific checkpoint by providing the full path to the desired checkpoint as a string. This flag is passed as an argument to the [trainer.train()](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/src/transformers/trainer.py#L1901) function of the SFTTrainer.
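
For example (a sketch; `checkpoint-500` is a hypothetical checkpoint directory and `...` stands for the remaining tuning flags):

```bash
# Default behavior: resume from the latest checkpoint in $OUTPUT_PATH, if any exists
python tuning/sft_trainer.py ... --output_dir $OUTPUT_PATH

# Ignore existing checkpoints and start tuning from scratch
python tuning/sft_trainer.py ... --output_dir $OUTPUT_PATH --resume_from_checkpoint False

# Resume from one specific checkpoint
python tuning/sft_trainer.py ... --output_dir $OUTPUT_PATH --resume_from_checkpoint "$OUTPUT_PATH/checkpoint-500"
```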

## Tuning Techniques:

### LoRA Tuning Example
@@ -432,6 +438,70 @@ Example 3:

_________________________


### GPTQ-LoRA with AutoGPTQ Tuning Example

This method is similar to LoRA tuning, but the base model is a quantized model. We currently only support GPTQ-LoRA models quantized with the 4-bit AutoGPTQ technique; Bits-and-Bytes (BNB) quantized LoRA is not yet enabled.
The qLoRA tuning technique is enabled via the [fms-acceleration](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/README.md#fms-acceleration) package.
You can see details on a sample configuration of Accelerated GPTQ-LoRA [here](https://github.com/foundation-model-stack/fms-acceleration/blob/main/sample-configurations/accelerated-peft-autogptq-sample-configuration.yaml).


To use the GPTQ-LoRA technique, set the `quantized_lora_config` defined [here](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/tuning/config/acceleration_configs/quantized_lora_config.py). See the Notes section of the FMS Acceleration doc [below](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/README.md#fms-acceleration) for usage. The only kernel currently supported is `triton_v2`.

In addition, the LoRA tuning technique must be used: set `peft_method` to `"lora"` and pass any arguments from [LoraConfig](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/tuning/config/peft_config.py#L21).

Example command to run:

```bash
# Notes:
# - tokenizer_name_or_path is optional; if unset, the tokenizer from model_name_or_path is used
# - auto_gptq sets the quantized_lora_config
# - torch_dtype float16 and fp16 are required for triton_v2
python tuning/sft_trainer.py \
--model_name_or_path $MODEL_PATH \
--tokenizer_name_or_path $MODEL_PATH \
--training_data_path $TRAIN_DATA_PATH \
--output_dir $OUTPUT_PATH \
--num_train_epochs 40 \
--per_device_train_batch_size 4 \
--learning_rate 1e-4 \
--response_template "\n### Label:" \
--dataset_text_field "output" \
--peft_method "lora" \
--r 8 \
--lora_dropout 0.05 \
--lora_alpha 16 \
--target_modules c_attn c_proj \
--auto_gptq triton_v2 \
--torch_dtype float16 \
--fp16
```

Equivalently, you can pass a JSON configuration for running tuning; see the [build doc](./build/README.md) for more details. The above can also be passed in as JSON (note that `auto_gptq` sets the `quantized_lora_config`, and `"torch_dtype": "float16"` together with `"fp16": true` are required for `triton_v2`):

```json
{
    "model_name_or_path": $MODEL_PATH,
    "training_data_path": $TRAIN_DATA_PATH,
    "output_dir": $OUTPUT_PATH,
    "num_train_epochs": 40.0,
    "per_device_train_batch_size": 4,
    "learning_rate": 1e-4,
    "response_template": "\n### Label:",
    "dataset_text_field": "output",
    "peft_method": "lora",
    "r": 8,
    "lora_dropout": 0.05,
    "lora_alpha": 16,
    "target_modules": ["c_attn", "c_proj"],
    "auto_gptq": ["triton_v2"],
    "torch_dtype": "float16",
    "fp16": true
}
```
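
As a sketch of how such a JSON config can be supplied (assuming the `SFT_TRAINER_CONFIG_JSON_PATH` environment variable described in the [build doc](./build/README.md)):

```bash
# Hypothetical path; the launch script reads the JSON config from this env var
export SFT_TRAINER_CONFIG_JSON_PATH=/path/to/config.json
python build/accelerate_launch.py
```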

Similarly to LoRA, the `target_modules` are the names of the modules to apply the adapter to. See the LoRA [section](#lora-tuning-example) on `target_modules` for more info.

Note that with the LoRA tuning technique, setting `target_modules` to `all-linear` targets all linear modules, while with the qLoRA tuning technique it targets all quant linear modules, excluding `lm_head`.
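
For example, to target every linear module rather than naming them individually (a sketch; the remaining flags are as in the example above):

```bash
python tuning/sft_trainer.py ... --peft_method lora --target_modules all-linear
```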

_________________________

### Prompt Tuning:

Set `peft_method` to `'pt'`. You can additionally pass any arguments from [PromptTuningConfig](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/tuning/config/peft_config.py#L63).
@@ -676,4 +746,4 @@ Further details on enabling and using the trackers mentioned above can be found

[Prompt Tuning on Twitter Complaints](examples/prompt_tuning_twitter_complaints/README.md)

A good simple example can be found [here](examples/kfto-kueue-sft-trainer.yaml) which launches a Kubernetes-native `PyTorchJob` using the [Kubeflow Training Operator](https://github.com/kubeflow/training-operator/) with [Kueue](https://github.com/kubernetes-sigs/kueue) for the queue management of tuning jobs.
10 changes: 6 additions & 4 deletions build/Dockerfile
@@ -19,6 +19,9 @@ ARG USER=tuning
ARG USER_UID=1000
ARG PYTHON_VERSION=3.11
ARG WHEEL_VERSION=""
## Enable Aimstack if requested via ENABLE_AIM set to "true"
ARG ENABLE_AIM=false
ARG ENABLE_FMS_ACCELERATION=true

## Base Layer ##################################################################
FROM registry.access.redhat.com/ubi9/ubi:${BASE_UBI_IMAGE_TAG} AS base
@@ -105,10 +108,8 @@ FROM cuda-devel AS python-installations
ARG WHEEL_VERSION
ARG USER
ARG USER_UID
-ARG ENABLE_FMS_ACCELERATION=false
-
-## Enable Aimstack if requested via ENABLE_AIM set to "true"
-ARG ENABLE_AIM=false
+ARG ENABLE_FMS_ACCELERATION
+ARG ENABLE_AIM

RUN dnf install -y git && \
# perl-Net-SSLeay.x86_64 and server_key.pem are installed with git as dependencies
@@ -154,6 +155,7 @@ RUN python -m pip uninstall wheel build -y && \
FROM release-base AS release
ARG USER
ARG PYTHON_VERSION
ARG ENABLE_AIM

RUN mkdir -p /licenses
COPY LICENSE /licenses/
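
These build arguments can be toggled at image build time in the usual Docker way; a minimal sketch, assuming a local checkout and a hypothetical image tag:

```bash
docker build -f build/Dockerfile \
  --build-arg ENABLE_AIM=true \
  --build-arg ENABLE_FMS_ACCELERATION=true \
  -t fms-hf-tuning:dev .
```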
92 changes: 0 additions & 92 deletions build/accelerate_launch.py
@@ -24,37 +24,24 @@
import sys
import traceback
from pathlib import Path
import json

# Third Party
from accelerate.commands.launch import launch_command
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from torch import bfloat16

# Local
from build.utils import (
process_accelerate_launch_args,
get_highest_checkpoint,
)
from tuning.utils.config_utils import get_json_config
from tuning.utils.error_logging import (
write_termination_log,
USER_ERROR_EXIT_CODE,
INTERNAL_ERROR_EXIT_CODE,
)
from tuning.data import tokenizer_data_utils

ERROR_LOG = "/dev/termination-log"


def get_base_model_from_adapter_config(adapter_config):
"""Given path to adapter_config.json file, returns the base model name"""
with open(adapter_config, "r", encoding="utf-8") as config_file:
adapter_config = json.load(config_file)
return adapter_config.get("base_model_name_or_path")


def main():
if not os.getenv("TERMINATION_LOG_FILE"):
os.environ["TERMINATION_LOG_FILE"] = ERROR_LOG
@@ -128,85 +115,6 @@ def main():
write_termination_log(f"Unhandled exception during training. {e}")
sys.exit(INTERNAL_ERROR_EXIT_CODE)

# remove lm_head from granite with llama arch models
try:
checkpoint_dir = job_config.get("save_model_dir")
if not checkpoint_dir:
checkpoint_dir = os.path.join(
output_dir, get_highest_checkpoint(output_dir)
)

use_flash_attn = job_config.get("use_flash_attn", True)
adapter_config_path = os.path.join(checkpoint_dir, "adapter_config.json")
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)

if os.path.exists(adapter_config_path):
base_model_path = get_base_model_from_adapter_config(adapter_config_path)
base_model = AutoModelForCausalLM.from_pretrained(
base_model_path,
attn_implementation="flash_attention_2" if use_flash_attn else None,
torch_dtype=bfloat16 if use_flash_attn else None,
)

# since the peft library (PEFTModelForCausalLM) does not handle cases
# where the model's layers are modified, in our case the embedding layer
# is modified, so we resize the backbone model's embedding layer with our own
# utility before passing it along to load the PEFT model.
tokenizer_data_utils.tokenizer_and_embedding_resize(
{}, tokenizer=tokenizer, model=base_model
)
model = PeftModel.from_pretrained(
base_model,
checkpoint_dir,
attn_implementation="flash_attention_2" if use_flash_attn else None,
torch_dtype=bfloat16 if use_flash_attn else None,
)
else:
model = AutoModelForCausalLM.from_pretrained(
checkpoint_dir,
attn_implementation="flash_attention_2" if use_flash_attn else None,
torch_dtype=bfloat16 if use_flash_attn else None,
)

model_arch = model.config.model_type
# check that it is a granite model with llama architecture with tied weights
# ie. lm_head is duplicate of embeddings

# a fine tuned model will have params_dict.get("model.embed_tokens.weight")
# a prompt adapter has params_dict.get("base_model.model.embed_tokens.weight")
# a lora adapter has params_dict.get("base_model.model.model.embed_tokens.weight")
if model_arch == "llama" and hasattr(model, "lm_head"):
if (
# lora tuned model has an addt model layer
(
hasattr(model.model, "model")
and model.lm_head.weight.untyped_storage().data_ptr()
== model.model.model.embed_tokens.weight.untyped_storage().data_ptr()
)
# prompt tuned model or fine tuned model
or (
hasattr(model.model, "embed_tokens")
and model.lm_head.weight.untyped_storage().data_ptr()
== model.model.embed_tokens.weight.untyped_storage().data_ptr()
)
):

logging.info("Removing lm_head from checkpoint")
del model.lm_head.weight

if hasattr(model, "lm_head.weight"):
logging.warning("Failed to delete lm_head.weight from model")

logging.info("Saving checkpoint to %s", output_dir)
model.save_pretrained(checkpoint_dir)
# save tokenizer with model
tokenizer.save_pretrained(checkpoint_dir)

except Exception as e: # pylint: disable=broad-except
logging.error(traceback.format_exc())
write_termination_log(f"Exception encountered removing lm_head from model: {e}")
sys.exit(INTERNAL_ERROR_EXIT_CODE)

# The .complete file will signal to users that we are finished copying
# files over
if os.path.exists(output_dir):
1 change: 1 addition & 0 deletions pyproject.toml
@@ -45,6 +45,7 @@ dev = ["wheel>=0.42.0,<1.0", "packaging>=23.2,<25", "ninja>=1.11.1.1,<2.0", "sci
flash-attn = ["flash-attn>=2.5.3,<3.0"]
aim = ["aim>=3.19.0,<4.0"]
fms-accel = ["fms-acceleration>=0.1"]
gptq-dev = ["auto_gptq>0.4.2", "optimum>=1.15.0"]


[tool.setuptools.packages.find]
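
The new `gptq-dev` extra pulls in the AutoGPTQ dependencies used by the GPTQ-LoRA path; a minimal sketch of installing it from a source checkout:

```bash
# Installs auto_gptq>0.4.2 and optimum>=1.15.0 alongside the package
pip install ".[gptq-dev]"
```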
