From be9e3c03667ed8079dc9d5c3348a2da342b4381a Mon Sep 17 00:00:00 2001
From: Anh Uong <anh.uong@ibm.com>
Date: Wed, 4 Sep 2024 14:48:41 -0600
Subject: [PATCH 01/16] add enable_aim build args in all stages

Signed-off-by: Anh Uong <anh.uong@ibm.com>
---
 build/Dockerfile | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/build/Dockerfile b/build/Dockerfile
index 116044f33..4bd9cab6a 100644
--- a/build/Dockerfile
+++ b/build/Dockerfile
@@ -19,6 +19,9 @@ ARG USER=tuning
 ARG USER_UID=1000
 ARG PYTHON_VERSION=3.11
 ARG WHEEL_VERSION=""
+## Enable Aimstack if requested via ENABLE_AIM set to "true"
+ARG ENABLE_AIM=false
+ARG ENABLE_FMS_ACCELERATION=false
 
 ## Base Layer ##################################################################
 FROM registry.access.redhat.com/ubi9/ubi:${BASE_UBI_IMAGE_TAG} AS base
@@ -105,10 +108,8 @@ FROM cuda-devel AS python-installations
 ARG WHEEL_VERSION
 ARG USER
 ARG USER_UID
-ARG ENABLE_FMS_ACCELERATION=false
-
-## Enable Aimstack if requested via ENABLE_AIM set to "true"
-ARG ENABLE_AIM=false
+ARG ENABLE_FMS_ACCELERATION
+ARG ENABLE_AIM
 
 RUN dnf install -y git && \
     # perl-Net-SSLeay.x86_64 and server_key.pem are installed with git as dependencies
@@ -154,6 +155,7 @@ RUN python -m pip uninstall wheel build -y && \
 FROM release-base AS release
 ARG USER
 ARG PYTHON_VERSION
+ARG ENABLE_AIM
 
 RUN mkdir -p /licenses
 COPY LICENSE /licenses/

From c40ae7f1615b95b2d0c5f02206d1a3799b0f615c Mon Sep 17 00:00:00 2001
From: Abhishek Maurya <124327945+Abhishek-TAMU@users.noreply.github.com>
Date: Thu, 12 Sep 2024 21:15:01 -0400
Subject: [PATCH 02/16] fix: remove lm_head post processing (#333)

* fix: Removal of lm head hack

Signed-off-by: Abhishek <maurya.abhishek@ibm.com>

* set fms_accelerate to true by default

Signed-off-by: Anh Uong <anh.uong@ibm.com>

---------

Signed-off-by: Abhishek <maurya.abhishek@ibm.com>
Signed-off-by: Anh Uong <anh.uong@ibm.com>
Co-authored-by: Anh Uong <anh.uong@ibm.com>
---
 build/Dockerfile           |  2 +-
 build/accelerate_launch.py | 92 --------------------------------------
 2 files changed, 1 insertion(+), 93 deletions(-)

diff --git a/build/Dockerfile b/build/Dockerfile
index 4bd9cab6a..ffae818da 100644
--- a/build/Dockerfile
+++ b/build/Dockerfile
@@ -21,7 +21,7 @@ ARG PYTHON_VERSION=3.11
 ARG WHEEL_VERSION=""
 ## Enable Aimstack if requested via ENABLE_AIM set to "true"
 ARG ENABLE_AIM=false
-ARG ENABLE_FMS_ACCELERATION=false
+ARG ENABLE_FMS_ACCELERATION=true
 
 ## Base Layer ##################################################################
 FROM registry.access.redhat.com/ubi9/ubi:${BASE_UBI_IMAGE_TAG} AS base
diff --git a/build/accelerate_launch.py b/build/accelerate_launch.py
index d7753728c..50d8eef0c 100644
--- a/build/accelerate_launch.py
+++ b/build/accelerate_launch.py
@@ -24,18 +24,13 @@
 import sys
 import traceback
 from pathlib import Path
-import json
 
 # Third Party
 from accelerate.commands.launch import launch_command
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from peft import PeftModel
-from torch import bfloat16
 
 # Local
 from build.utils import (
     process_accelerate_launch_args,
-    get_highest_checkpoint,
 )
 from tuning.utils.config_utils import get_json_config
 from tuning.utils.error_logging import (
@@ -43,18 +38,10 @@
     USER_ERROR_EXIT_CODE,
     INTERNAL_ERROR_EXIT_CODE,
 )
-from tuning.data import tokenizer_data_utils
 
 ERROR_LOG = "/dev/termination-log"
 
 
-def get_base_model_from_adapter_config(adapter_config):
-    """Given path to adapter_config.json file, returns the base model name"""
-    with open(adapter_config, "r", encoding="utf-8") as config_file:
-        adapter_config = json.load(config_file)
-        return adapter_config.get("base_model_name_or_path")
-
-
 def main():
     if not os.getenv("TERMINATION_LOG_FILE"):
         os.environ["TERMINATION_LOG_FILE"] = ERROR_LOG
@@ -128,85 +115,6 @@ def main():
         write_termination_log(f"Unhandled exception during training. {e}")
         sys.exit(INTERNAL_ERROR_EXIT_CODE)
 
-    # remove lm_head from granite with llama arch models
-    try:
-        checkpoint_dir = job_config.get("save_model_dir")
-        if not checkpoint_dir:
-            checkpoint_dir = os.path.join(
-                output_dir, get_highest_checkpoint(output_dir)
-            )
-
-        use_flash_attn = job_config.get("use_flash_attn", True)
-        adapter_config_path = os.path.join(checkpoint_dir, "adapter_config.json")
-        tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
-
-        if os.path.exists(adapter_config_path):
-            base_model_path = get_base_model_from_adapter_config(adapter_config_path)
-            base_model = AutoModelForCausalLM.from_pretrained(
-                base_model_path,
-                attn_implementation="flash_attention_2" if use_flash_attn else None,
-                torch_dtype=bfloat16 if use_flash_attn else None,
-            )
-
-            # since the peft library (PEFTModelForCausalLM) does not handle cases
-            # where the model's layers are modified, in our case the embedding layer
-            # is modified, so we resize the backbone model's embedding layer with our own
-            # utility before passing it along to load the PEFT model.
-            tokenizer_data_utils.tokenizer_and_embedding_resize(
-                {}, tokenizer=tokenizer, model=base_model
-            )
-            model = PeftModel.from_pretrained(
-                base_model,
-                checkpoint_dir,
-                attn_implementation="flash_attention_2" if use_flash_attn else None,
-                torch_dtype=bfloat16 if use_flash_attn else None,
-            )
-        else:
-            model = AutoModelForCausalLM.from_pretrained(
-                checkpoint_dir,
-                attn_implementation="flash_attention_2" if use_flash_attn else None,
-                torch_dtype=bfloat16 if use_flash_attn else None,
-            )
-
-        model_arch = model.config.model_type
-        # check that it is a granite model with llama architecture with tied weights
-        # ie. lm_head is duplicate of embeddings
-
-        # a fine tuned model will have params_dict.get("model.embed_tokens.weight")
-        # a prompt adapter has params_dict.get("base_model.model.embed_tokens.weight")
-        # a lora adapter has params_dict.get("base_model.model.model.embed_tokens.weight")
-        if model_arch == "llama" and hasattr(model, "lm_head"):
-            if (
-                # lora tuned model has an addt model layer
-                (
-                    hasattr(model.model, "model")
-                    and model.lm_head.weight.untyped_storage().data_ptr()
-                    == model.model.model.embed_tokens.weight.untyped_storage().data_ptr()
-                )
-                # prompt tuned model or fine tuned model
-                or (
-                    hasattr(model.model, "embed_tokens")
-                    and model.lm_head.weight.untyped_storage().data_ptr()
-                    == model.model.embed_tokens.weight.untyped_storage().data_ptr()
-                )
-            ):
-
-                logging.info("Removing lm_head from checkpoint")
-                del model.lm_head.weight
-
-                if hasattr(model, "lm_head.weight"):
-                    logging.warning("Failed to delete lm_head.weight from model")
-
-                logging.info("Saving checkpoint to %s", output_dir)
-                model.save_pretrained(checkpoint_dir)
-                # save tokenizer with model
-                tokenizer.save_pretrained(checkpoint_dir)
-
-    except Exception as e:  # pylint: disable=broad-except
-        logging.error(traceback.format_exc())
-        write_termination_log(f"Exception encountered removing lm_head from model: {e}")
-        sys.exit(INTERNAL_ERROR_EXIT_CODE)
-
     # The .complete file will signal to users that we are finished copying
     # files over
     if os.path.exists(output_dir):

From 673a79c33546d81593260992fffc9f5e6092bca4 Mon Sep 17 00:00:00 2001
From: Angel Luu <angel.luu@us.ibm.com>
Date: Fri, 30 Aug 2024 08:55:37 -0600
Subject: [PATCH 03/16] Add README

Signed-off-by: Angel Luu <angel.luu@us.ibm.com>
---
 README.md | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 69 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 1859b1b7e..2a1f11deb 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,7 @@
   - [Tips on Parameters to Set](#tips-on-parameters-to-set)
 - [Tuning Techniques](#tuning-techniques)
   - [LoRA Tuning Example](#lora-tuning-example)
+  - [qLoRA Tuning Example](#lora-tuning-example)
   - [Prompt Tuning](#prompt-tuning)
   - [Fine Tuning](#fine-tuning)
   - [FMS Acceleration](#fms-acceleration)
@@ -432,6 +433,73 @@ Example 3:
 
 _________________________
 
+
+### qLoRA Tuning Example
+
+This method is similar to LoRA Tuning, but the base model is a quantized model.
+Set `peft_method` to `"lora"`. You can pass any of LoraConfig, see section on [LoRA Example](#lora-tuning-example).
+In addition, you can pass [LoRA quantization config](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/tuning/config/acceleration_configs/quantized_lora_config.py#L62).
+```py
+# to use auto_gptq 4bit lora base layers
+auto_gptq: AutoGPTQLoraConfig = None
+
+# to use auto_gptq 4bit lora base layers
+bnb_qlora: BNBQLoraConfig = None
+```
+
+```py
+class AutoGPTQLoraConfig:
+
+  # auto_gptq supports various kernels, to select the kernel to use.
+  kernel: str = "triton_v2"
+
+  # allow auto_gptq to quantize a model before training commences.
+  # NOTE: currently this is not allowed.
+  from_quantized: bool = True
+
+```
+
+Example command to run:
+
+```bash
+python tuning/sft_trainer.py \
+--model_name_or_path $MODEL_PATH \
+--tokenizer_name_or_path $MODEL_PATH \ # This field is optional and if not specified, tokenizer from model_name_or_path will be used
+--training_data_path $TRAIN_DATA_PATH \
+--output_dir $OUTPUT_PATH \
+--num_train_epochs 40 \
+--per_device_train_batch_size 4 \
+---learning_rate 1e-4 \
+--response_template "\n### Label:" \
+--dataset_text_field "output" \
+--peft_method "lora" \
+--r 8 \
+--lora_dropout 0.05 \
+--lora_alpha 16 \
+--target_modules c_attn c_proj
+--auto_gptq triton_v2
+
+Equally you can pass in a JSON configuration for running tuning. See [build doc](./build/README.md) for more details. The above can also be passed in as JSON:
+```json
+{
+    "model_name_or_path": $MODEL_PATH,
+    "training_data_path": $TRAIN_DATA_PATH,
+    "output_dir": $OUTPUT_PATH,
+    "num_train_epochs": 40.0,
+    "per_device_train_batch_size": 4,
+    "learning_rate": 1e-4,
+    "response_template": "\n### Label:",
+    "dataset_text_field": "output",
+    "peft_method": "lora",
+    "r": 8,
+    "lora_dropout": 0.05,
+    "lora_alpha": 16,
+    "target_modules": ["c_attn", "c_proj"]
+    "auto_gptq": ["triton_v2"]
+}
+```
+_________________________
+
 ### Prompt Tuning:
 
 Specify `peft_method` to `'pt'` . You can additionally pass any arguments from [PromptTuningConfig](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/tuning/config/peft_config.py#L63).
@@ -676,4 +744,4 @@ Further details on enabling and using the trackers mentioned above can be found
 
 [Prompt Tuning on Twitter Complaints](examples/prompt_tuning_twitter_complaints/README.md)
 
-A good simple example can be found [here](examples/kfto-kueue-sft-trainer.yaml) which launches a Kubernetes-native `PyTorchJob` using the [Kubeflow Training Operator](https://github.com/kubeflow/training-operator/) with [Kueue](https://github.com/kubernetes-sigs/kueue) for the queue management of tuning jobs.
+A good simple example can be found [here](examples/kfto-kueue-sft-trainer.yaml) which launches a Kubernetes-native `PyTorchJob` using the [Kubeflow Training Operator](https://github.com/kubeflow/training-operator/) with [Kueue](https://github.com/kubernetes-sigs/kueue) for the queue management of tuning jobs.
\ No newline at end of file

From 1a249408022528e97e6abce885f7a57af2138e5c Mon Sep 17 00:00:00 2001
From: Hari <harikrishmenon@gmail.com>
Date: Tue, 3 Sep 2024 23:21:45 +0530
Subject: [PATCH 04/16] fix: need to pass skip_prepare_dataset for pretokenized
 dataset due to breaking change in HF SFTTrainer (#326)

* fix: need to pass skip_prepare_dataset for pretokenized dataset due to breaking change in HF SFTTrainer

Signed-off-by: Harikrishnan Balagopal <harikrishmenon@gmail.com>

* fix: wrong dataset paths, was using non-tokenized data in pre-tokenized dataset tests

Signed-off-by: Harikrishnan Balagopal <harikrishmenon@gmail.com>

---------

Signed-off-by: Harikrishnan Balagopal <harikrishmenon@gmail.com>
Signed-off-by: Angel Luu <angel.luu@us.ibm.com>
---
 tests/test_sft_trainer.py | 5 ++---
 tuning/sft_trainer.py     | 7 +++++++
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py
index fd36785ce..251f6d6b9 100644
--- a/tests/test_sft_trainer.py
+++ b/tests/test_sft_trainer.py
@@ -35,7 +35,6 @@
     EMPTY_DATA,
     MALFORMATTED_DATA,
     MODEL_NAME,
-    TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSON,
     TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSONL,
     TWITTER_COMPLAINTS_DATA_JSON,
     TWITTER_COMPLAINTS_DATA_JSONL,
@@ -850,8 +849,8 @@ def test_run_with_good_experimental_metadata():
 @pytest.mark.parametrize(
     "dataset_path",
     [
-        TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSONL,
-        TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSON,
+        TWITTER_COMPLAINTS_TOKENIZED_JSONL,
+        TWITTER_COMPLAINTS_TOKENIZED_JSON,
     ],
 )
 ### Tests for pretokenized data
diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py
index b5e6cb62e..bc1937c32 100644
--- a/tuning/sft_trainer.py
+++ b/tuning/sft_trainer.py
@@ -66,6 +66,7 @@
 from tuning.utils.preprocessing_utils import (
     format_dataset,
     get_data_collator,
+    is_pretokenized_dataset,
     validate_data_args,
 )
 
@@ -318,6 +319,11 @@ def train(
     }
     training_args = SFTConfig(**transformer_kwargs)
 
+    dataset_kwargs = {}
+    if is_pretokenized_dataset(
+        data_args.training_data_path or data_args.validation_data_path
+    ):
+        dataset_kwargs["skip_prepare_dataset"] = True
     trainer = SFTTrainer(
         model=model,
         tokenizer=tokenizer,
@@ -330,6 +336,7 @@ def train(
         max_seq_length=max_seq_length,
         callbacks=trainer_callbacks,
         peft_config=peft_config,
+        dataset_kwargs=dataset_kwargs,
     )
 
     # We track additional metrics and experiment metadata after trainer object creation

From c5f471afe5fb8e7b7b971c2567c6d2bf54fe9140 Mon Sep 17 00:00:00 2001
From: Anh Uong <anh.uong@ibm.com>
Date: Wed, 4 Sep 2024 10:05:10 -0600
Subject: [PATCH 05/16] feat: install fms-acceleration to enable qlora (#284)

* add fms-acceleration deps and pytorch layer with cuda

Signed-off-by: Anh-Uong <anh.uong@ibm.com>

* add build args needed

Signed-off-by: Anh-Uong <anh.uong@ibm.com>

* allow transformers v4.40 for fms-acceleration

Signed-off-by: Anh-Uong <anh.uong@ibm.com>

* set wider transformers version

Signed-off-by: Anh-Uong <anh.uong@ibm.com>

* remove nvidia stage

Signed-off-by: Anh-Uong <anh.uong@ibm.com>

* add gcc and dev tools

Signed-off-by: Anh-Uong <anh.uong@ibm.com>

* install c compiler and python deps

Signed-off-by: Anh-Uong <anh.uong@ibm.com>

* remove transformers lower bound and dev deps

Signed-off-by: Anh-Uong <anh.uong@ibm.com>

* install python-devel by version

Signed-off-by: Anh Uong <anh.uong@ibm.com>

* update python installations

Signed-off-by: Anh Uong <anh.uong@ibm.com>

---------

Signed-off-by: Anh-Uong <anh.uong@ibm.com>
Signed-off-by: Anh Uong <anh.uong@ibm.com>
Signed-off-by: Angel Luu <angel.luu@us.ibm.com>
---
 build/Dockerfile | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/build/Dockerfile b/build/Dockerfile
index 281f80dc6..116044f33 100644
--- a/build/Dockerfile
+++ b/build/Dockerfile
@@ -27,9 +27,10 @@ ARG PYTHON_VERSION
 ARG USER
 ARG USER_UID
 
+# Note this works for 3.9, 3.11, 3.12
 RUN dnf remove -y --disableplugin=subscription-manager \
         subscription-manager \
-    && dnf install -y python${PYTHON_VERSION} procps \
+    && dnf install -y python${PYTHON_VERSION} procps g++ python${PYTHON_VERSION}-devel \
     && ln -s /usr/bin/python${PYTHON_VERSION} /bin/python \
     && python -m ensurepip --upgrade \
     && python -m pip install --upgrade pip \
@@ -104,6 +105,7 @@ FROM cuda-devel AS python-installations
 ARG WHEEL_VERSION
 ARG USER
 ARG USER_UID
+ARG ENABLE_FMS_ACCELERATION=false
 
 ## Enable Aimstack if requested via ENABLE_AIM set to "true"
 ARG ENABLE_AIM=false
@@ -132,12 +134,19 @@ RUN if [[ -z "${WHEEL_VERSION}" ]]; \
 RUN --mount=type=cache,target=/home/${USER}/.cache/pip,uid=${USER_UID} \
     python -m pip install --user wheel && \
     python -m pip install --user "$(head bdist_name)" && \
-    python -m pip install --user "$(head bdist_name)[flash-attn]" && \
-    if [[ "${ENABLE_AIM}" == "true" ]]; then \
+    python -m pip install --user "$(head bdist_name)[flash-attn]"
+
+RUN if [[ "${ENABLE_FMS_ACCELERATION}" == "true" ]]; then \
+        python -m pip install --user "$(head bdist_name)[fms-accel]"; \
+        python -m fms_acceleration.cli install fms_acceleration_peft; \
+    fi
+
+RUN if [[ "${ENABLE_AIM}" == "true" ]]; then \
         python -m pip install --user "$(head bdist_name)[aim]"; \
-    fi && \
+    fi
+
     # Clean up the wheel module. It's only needed by flash-attn install
-    python -m pip uninstall wheel build -y && \
+RUN python -m pip uninstall wheel build -y && \
     # Cleanup the bdist whl file
     rm $(head bdist_name) /tmp/bdist_name
 

From 9f23d9e22e74f85c57201f22e213f7733e63710c Mon Sep 17 00:00:00 2001
From: Angel Luu <angel.luu@us.ibm.com>
Date: Wed, 4 Sep 2024 17:06:42 -0600
Subject: [PATCH 06/16] Update for target modules

Signed-off-by: Angel Luu <angel.luu@us.ibm.com>
---
 README.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 2a1f11deb..c164b8dc9 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@
   - [Tips on Parameters to Set](#tips-on-parameters-to-set)
 - [Tuning Techniques](#tuning-techniques)
   - [LoRA Tuning Example](#lora-tuning-example)
-  - [qLoRA Tuning Example](#lora-tuning-example)
+  - [qLoRA Tuning Example](#qlora-tuning-example)
   - [Prompt Tuning](#prompt-tuning)
   - [Fine Tuning](#fine-tuning)
   - [FMS Acceleration](#fms-acceleration)
@@ -478,6 +478,7 @@ python tuning/sft_trainer.py \
 --lora_alpha 16 \
 --target_modules c_attn c_proj
 --auto_gptq triton_v2
+```
 
 Equally you can pass in a JSON configuration for running tuning. See [build doc](./build/README.md) for more details. The above can also be passed in as JSON:
 ```json
@@ -498,6 +499,11 @@ Equally you can pass in a JSON configuration for running tuning. See [build doc]
     "auto_gptq": ["triton_v2"]
 }
 ```
+
+Similarly to LoRA, the `target_modules` are the names of the modules to apply the adapter to. See the LoRA [section](#lora-tuning-example) on `target_modules` for more info.
+
+Note that with LoRA tuning technique, setting `all-linear` on `target_modules` returns linear modules. And with qLoRA tuning technique, `all-linear` returns all quant linear modules, excluding `lm_head`.
+
 _________________________
 
 ### Prompt Tuning:

From 82005048887389212b1411178922bfb223ed3a18 Mon Sep 17 00:00:00 2001
From: Padmanabha V Seshadri <seshapad@in.ibm.com>
Date: Fri, 6 Sep 2024 02:39:28 +0530
Subject: [PATCH 07/16] feat: Migrating the trainer controller to python logger
 (#309)

* fix: Migrate transformer logging to python logging

Signed-off-by: Padmanabha V Seshadri <seshapad@in.ibm.com>

* fix: Migrate transformer logging to python logging

Signed-off-by: Padmanabha V Seshadri <seshapad@in.ibm.com>

* fix: Removed unwanted file

Signed-off-by: Padmanabha V Seshadri <seshapad@in.ibm.com>

* fix: Log levels obtained from reversing the dictionary

Signed-off-by: Padmanabha V Seshadri <seshapad@in.ibm.com>

* fix: Format issues

Signed-off-by: Padmanabha V Seshadri <seshapad@in.ibm.com>

* fix: Variable names made meaningful

Signed-off-by: Padmanabha V Seshadri <seshapad@in.ibm.com>

* fix: Removed unwanted log line

Signed-off-by: Padmanabha V Seshadri <seshapad@in.ibm.com>

* fix: Added name to getLogger

Signed-off-by: Padmanabha V Seshadri <seshapad@in.ibm.com>

* fix: Added default logging level to DEBUG

Signed-off-by: Padmanabha V Seshadri <seshapad@in.ibm.com>

* fix: Added default logging level to DEBUG

Signed-off-by: Padmanabha V Seshadri <seshapad@in.ibm.com>

* fix: Added default logging level to DEBUG

Signed-off-by: Padmanabha V Seshadri <seshapad@in.ibm.com>

* fix: Removed setLevel() calls from the packages

Signed-off-by: Padmanabha V Seshadri <seshapad@in.ibm.com>

* fix: Format issues resolved

Signed-off-by: Padmanabha V Seshadri <seshapad@in.ibm.com>

---------

Signed-off-by: Padmanabha V Seshadri <seshapad@in.ibm.com>
Signed-off-by: Angel Luu <angel.luu@us.ibm.com>
---
 tuning/trainercontroller/callback.py                 | 10 +++++-----
 .../controllermetrics/trainingstate.py               |  3 ---
 tuning/trainercontroller/operations/hfcontrols.py    |  4 +++-
 tuning/trainercontroller/operations/logcontrol.py    | 12 ++++++------
 tuning/trainercontroller/operations/operation.py     |  6 ++----
 tuning/trainercontroller/patience.py                 |  6 ++++--
 6 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/tuning/trainercontroller/callback.py b/tuning/trainercontroller/callback.py
index fad1bbf70..a1b3397d7 100644
--- a/tuning/trainercontroller/callback.py
+++ b/tuning/trainercontroller/callback.py
@@ -18,6 +18,7 @@
 # Standard
 from typing import Dict, List, Union
 import inspect
+import logging
 import os
 import re
 
@@ -29,7 +30,6 @@
     TrainerState,
     TrainingArguments,
 )
-from transformers.utils import logging
 import yaml
 
 # Local
@@ -45,7 +45,7 @@
 from tuning.trainercontroller.patience import PatienceControl
 from tuning.utils.evaluator import MetricUnavailableError, RuleEvaluator
 
-logger = logging.get_logger(__name__)
+logger = logging.getLogger(__name__)
 
 # Configuration keys
 CONTROLLER_METRICS_KEY = "controller_metrics"
@@ -66,7 +66,7 @@
 DEFAULT_OPERATIONS = {"operations": [{"name": "hfcontrols", "class": "HFControls"}]}
 DEFAULT_METRICS = {}
 DEFAULT_CONFIG = {}
-DEFAULT_TRIGGER_LOG_LEVEL = "debug"
+DEFAULT_TRIGGER_LOG_LEVEL = "DEBUG"
 
 # pylint: disable=too-many-instance-attributes
 class TrainerControllerCallback(TrainerCallback):
@@ -305,7 +305,7 @@ def on_init_end(
         kwargs["state"] = state
         kwargs["control"] = control
 
-        log_levels = logging.get_log_levels_dict()
+        log_levels = dict((value, key) for key, value in logging._levelToName.items())
         # Check if there any metrics listed in the configuration
         if (
             CONTROLLER_METRICS_KEY not in self.trainer_controller_config
@@ -407,7 +407,7 @@ def on_init_end(
                         control.config = controller[CONTROLLER_CONFIG_KEY]
                         config_log_level_str = control.config.get(
                             CONTROLLER_CONFIG_TRIGGER_LOG_LEVEL, config_log_level_str
-                        )
+                        ).upper()
                         if config_log_level_str not in log_levels:
                             logger.warning(
                                 "Incorrect trigger log-level [%s] specified in the config."
diff --git a/tuning/trainercontroller/controllermetrics/trainingstate.py b/tuning/trainercontroller/controllermetrics/trainingstate.py
index 8dc276339..06da4035a 100644
--- a/tuning/trainercontroller/controllermetrics/trainingstate.py
+++ b/tuning/trainercontroller/controllermetrics/trainingstate.py
@@ -21,13 +21,10 @@
 
 # Third Party
 from transformers import TrainerState
-from transformers.utils import logging
 
 # Local
 from tuning.trainercontroller.controllermetrics.metricshandler import MetricHandler
 
-logger = logging.get_logger(__name__)
-
 
 class TrainingState(MetricHandler):
     """Implements the controller metric which exposes the trainer state"""
diff --git a/tuning/trainercontroller/operations/hfcontrols.py b/tuning/trainercontroller/operations/hfcontrols.py
index 0548b4c12..90988c16a 100644
--- a/tuning/trainercontroller/operations/hfcontrols.py
+++ b/tuning/trainercontroller/operations/hfcontrols.py
@@ -10,6 +10,8 @@
 # Local
 from .operation import Operation
 
+logger = logging.getLogger(__name__)
+
 
 class HFControls(Operation):
     """Implements the control actions for the HuggingFace controls in
@@ -37,7 +39,7 @@ def control_action(self, control: TrainerControl, **kwargs):
             control: TrainerControl. Data class for controls.
             kwargs: List of arguments (key, value)-pairs
         """
-        logging.debug("Arguments passed to control_action: %s", repr(kwargs))
+        logger.debug("Arguments passed to control_action: %s", repr(kwargs))
         frame_info = inspect.currentframe().f_back
         arg_values = inspect.getargvalues(frame_info)
         setattr(control, arg_values.locals["action"], True)
diff --git a/tuning/trainercontroller/operations/logcontrol.py b/tuning/trainercontroller/operations/logcontrol.py
index 385de3b4d..eabb420c9 100644
--- a/tuning/trainercontroller/operations/logcontrol.py
+++ b/tuning/trainercontroller/operations/logcontrol.py
@@ -1,12 +1,13 @@
+# Standard
+import logging
+
 # Third Party
 from transformers import TrainingArguments
-from transformers.utils import logging
 
 # Local
 from .operation import Operation
 
-logger = logging.get_logger(__name__)
-logger.setLevel(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
 
 
 class LogControl(Operation):
@@ -20,12 +21,11 @@ def __init__(self, log_format: str, log_level: str, **kwargs):
         Args:
             kwargs: List of arguments (key, value)-pairs
         """
-        log_levels = logging.get_log_levels_dict()
-        if log_level not in log_levels:
+        self.log_level = getattr(logging, log_level.upper(), None)
+        if not isinstance(self.log_level, int):
             raise ValueError(
                 "Specified log_level [%s] is invalid for LogControl" % (log_level)
             )
-        self.log_level = log_levels[log_level]
         self.log_format = log_format
         super().__init__(**kwargs)
 
diff --git a/tuning/trainercontroller/operations/operation.py b/tuning/trainercontroller/operations/operation.py
index 70805a015..f6b4884fc 100644
--- a/tuning/trainercontroller/operations/operation.py
+++ b/tuning/trainercontroller/operations/operation.py
@@ -1,12 +1,10 @@
 # Standard
 import abc
 import inspect
+import logging
 import re
 
-# Third Party
-from transformers.utils import logging
-
-logger = logging.get_logger(__name__)
+logger = logging.getLogger(__name__)
 
 
 class Operation(metaclass=abc.ABCMeta):
diff --git a/tuning/trainercontroller/patience.py b/tuning/trainercontroller/patience.py
index ecdb0699a..bda91363c 100644
--- a/tuning/trainercontroller/patience.py
+++ b/tuning/trainercontroller/patience.py
@@ -31,6 +31,8 @@
 # will be exceeded afer the fifth event.
 MODE_NO_RESET_ON_FAILURE = "no_reset_on_failure"
 
+logger = logging.getLogger(__name__)
+
 
 class PatienceControl:
     """Implements the patience control for every rule"""
@@ -49,7 +51,7 @@ def should_tolerate(
         elif self._mode == MODE_RESET_ON_FAILURE:
             self._patience_counter = 0
         if self._patience_counter <= self._patience_threshold:
-            logging.debug(
+            logger.debug(
                 "Control {} triggered on event {}: "
                 "Enforcing patience [patience_counter = {:.2f}, "
                 "patience_threshold = {:.2f}]".format(
@@ -60,7 +62,7 @@ def should_tolerate(
                 )
             )
             return True
-        logging.debug(
+        logger.debug(
             "Control {} triggered on event {}: "
             "Exceeded patience [patience_counter = {:.2f}, "
             "patience_threshold = {:.2f}]".format(

From 761cde41d4d5331cbfab9f1e64b310b19b5c0c98 Mon Sep 17 00:00:00 2001
From: Hari <harikrishmenon@gmail.com>
Date: Tue, 10 Sep 2024 20:54:34 +0530
Subject: [PATCH 08/16] fix: remove fire for handling CLI args (#324)

Signed-off-by: Mehant Kammakomati <mehant.kammakomati2@ibm.com>
Signed-off-by: Harikrishnan Balagopal <harikrishmenon@gmail.com>
Signed-off-by: Anh Uong <anh.uong@ibm.com>
Co-authored-by: Mehant Kammakomati <mehant.kammakomati2@ibm.com>
Signed-off-by: Angel Luu <angel.luu@us.ibm.com>
---
 pyproject.toml        | 1 -
 tuning/sft_trainer.py | 5 ++---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index e31192470..2675f49b4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,7 +36,6 @@ dependencies = [
 "trl>=0.9.3,<1.0",
 "peft>=0.8.0,<0.13",
 "datasets>=2.15.0,<3.0",
-"fire>=0.5.0,<1.0",
 "simpleeval>=0.9.13,<1.0",
 ]
 
diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py
index bc1937c32..2ab8f7de0 100644
--- a/tuning/sft_trainer.py
+++ b/tuning/sft_trainer.py
@@ -37,7 +37,6 @@
 )
 from transformers.utils import is_accelerate_available
 from trl import SFTConfig, SFTTrainer
-import fire
 import transformers
 
 # Local
@@ -515,7 +514,7 @@ def parse_arguments(parser, json_config=None):
     )
 
 
-def main(**kwargs):  # pylint: disable=unused-argument
+def main():
     parser = get_parser()
     logger = logging.getLogger()
     job_config = get_json_config()
@@ -636,4 +635,4 @@ def main(**kwargs):  # pylint: disable=unused-argument
 
 
 if __name__ == "__main__":
-    fire.Fire(main)
+    main()

From 25abbca7e9f7c2fd837bbeeef85515e1a7ae7d25 Mon Sep 17 00:00:00 2001
From: Anh Uong <anh.uong@ibm.com>
Date: Tue, 10 Sep 2024 15:07:07 -0600
Subject: [PATCH 09/16] dep: cap accelerate version (#335)

- FSDP bug in accelerate v0.34

Signed-off-by: Anh Uong <anh.uong@ibm.com>
Signed-off-by: Angel Luu <angel.luu@us.ibm.com>
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 2675f49b4..332135020 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,7 +27,7 @@ classifiers=[
 ]
 dependencies = [
 "numpy>=1.26.4,<2.0",
-"accelerate>=0.20.3,<0.40",
+"accelerate>=0.20.3,<0.34",
 "transformers>4.41,<5.0",
 "torch>=2.2.0,<3.0",
 "sentencepiece>=0.1.99,<0.3",

From e1543348bb42646665cbef3824f4dc1b5fd39938 Mon Sep 17 00:00:00 2001
From: Will <mwjohnson728@gmail.com>
Date: Tue, 10 Sep 2024 17:19:22 -0400
Subject: [PATCH 10/16] deps: Add protobuf to enable compatibility with certain
 models (#336)

Signed-off-by: Will Johnson <mwjohnson728@gmail.com>
Signed-off-by: Angel Luu <angel.luu@us.ibm.com>
---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 332135020..aae1a9dd7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,6 +35,7 @@ dependencies = [
 "tqdm>=4.66.2,<5.0",
 "trl>=0.9.3,<1.0",
 "peft>=0.8.0,<0.13",
+"protobuf>=5.28.0,<6.0.0",
 "datasets>=2.15.0,<3.0",
 "simpleeval>=0.9.13,<1.0",
 ]

From 0c6c47659775bf6f00872cba512368ad774f94d6 Mon Sep 17 00:00:00 2001
From: Angel Luu <angel.luu@us.ibm.com>
Date: Wed, 11 Sep 2024 10:23:44 -0600
Subject: [PATCH 11/16] Add more details on qLORA

Signed-off-by: Angel Luu <angel.luu@us.ibm.com>
---
 README.md | 43 +++++++++++++++++--------------------------
 1 file changed, 17 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index c164b8dc9..26c1da347 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@
   - [Tips on Parameters to Set](#tips-on-parameters-to-set)
 - [Tuning Techniques](#tuning-techniques)
   - [LoRA Tuning Example](#lora-tuning-example)
-  - [qLoRA Tuning Example](#qlora-tuning-example)
+  - [GPTQ-LoRA with AutoGPTQ Tuning Example](#gptq-lora-with-autogptq-tuning-example)
   - [Prompt Tuning](#prompt-tuning)
   - [Fine Tuning](#fine-tuning)
   - [FMS Acceleration](#fms-acceleration)
@@ -434,30 +434,16 @@ Example 3:
 _________________________
 
 
-### qLoRA Tuning Example
+### GPTQ-LoRA with AutoGPTQ Tuning Example
 
-This method is similar to LoRA Tuning, but the base model is a quantized model.
-Set `peft_method` to `"lora"`. You can pass any of LoraConfig, see section on [LoRA Example](#lora-tuning-example).
-In addition, you can pass [LoRA quantization config](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/tuning/config/acceleration_configs/quantized_lora_config.py#L62).
-```py
-# to use auto_gptq 4bit lora base layers
-auto_gptq: AutoGPTQLoraConfig = None
-
-# to use auto_gptq 4bit lora base layers
-bnb_qlora: BNBQLoraConfig = None
-```
-
-```py
-class AutoGPTQLoraConfig:
+This method is similar to LoRA Tuning, but the base model is a quantized model. We currently only support GPTQ-LoRA with models that have been quantized using the 4-bit AutoGPTQ technique. Bits-and-Bytes (BNB) quantized LoRA is not yet enabled.
+The qLoRA tuning technique is enabled via the [fms-acceleration](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/README.md#fms-acceleration) package.
+You can see details on a sample configuration of Accelerated GPTQ-LoRA [here](https://github.com/foundation-model-stack/fms-acceleration/blob/main/sample-configurations/accelerated-peft-autogptq-sample-configuration.yaml).
 
-  # auto_gptq supports various kernels, to select the kernel to use.
-  kernel: str = "triton_v2"
 
-  # allow auto_gptq to quantize a model before training commences.
-  # NOTE: currently this is not allowed.
-  from_quantized: bool = True
+To use the GPTQ-LoRA technique, you can set the `quantized_lora_config` defined [here](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/tuning/config/acceleration_configs/quantized_lora_config.py). See the Notes section of the FMS Acceleration doc [below](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/README.md#fms-acceleration) for usage. The only kernel currently supported is `triton_v2`.
 
-```
+In addition, the LoRA tuning technique must be used: set `peft_method` to `"lora"` and pass any arguments from [LoraConfig](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/tuning/config/peft_config.py#L21).
 
 Example command to run:
 
@@ -469,18 +455,21 @@ python tuning/sft_trainer.py \
 --output_dir $OUTPUT_PATH \
 --num_train_epochs 40 \
 --per_device_train_batch_size 4 \
----learning_rate 1e-4 \
+--learning_rate 1e-4 \
 --response_template "\n### Label:" \
 --dataset_text_field "output" \
 --peft_method "lora" \
 --r 8 \
 --lora_dropout 0.05 \
 --lora_alpha 16 \
---target_modules c_attn c_proj
---auto_gptq triton_v2
+--target_modules c_attn c_proj \
+--auto_gptq triton_v2 \ # setting quantized_lora_config 
+--torch_dtype float16 \ # need this for triton_v2
+--fp16 \ # need this for triton_v2
 ```
 
 Equally you can pass in a JSON configuration for running tuning. See [build doc](./build/README.md) for more details. The above can also be passed in as JSON:
+
 ```json
 {
     "model_name_or_path": $MODEL_PATH,
@@ -495,8 +484,10 @@ Equally you can pass in a JSON configuration for running tuning. See [build doc]
     "r": 8,
     "lora_dropout": 0.05,
     "lora_alpha": 16,
-    "target_modules": ["c_attn", "c_proj"]
-    "auto_gptq": ["triton_v2"]
+    "target_modules": ["c_attn", "c_proj"],
+    "auto_gptq": ["triton_v2"], // setting quantized_lora_config
+    "torch_dtype": "float16", // need this for triton_v2
+    "fp16": true // need this for triton_v2
 }
 ```
 

From 427202f65434f1447a6832f801f4ead7fffa26da Mon Sep 17 00:00:00 2001
From: Anh Uong <anh.uong@ibm.com>
Date: Wed, 4 Sep 2024 14:48:41 -0600
Subject: [PATCH 12/16] add enable_aim build args in all stages

Signed-off-by: Anh Uong <anh.uong@ibm.com>
Signed-off-by: Angel Luu <angel.luu@us.ibm.com>
---
 build/Dockerfile | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/build/Dockerfile b/build/Dockerfile
index 116044f33..4bd9cab6a 100644
--- a/build/Dockerfile
+++ b/build/Dockerfile
@@ -19,6 +19,9 @@ ARG USER=tuning
 ARG USER_UID=1000
 ARG PYTHON_VERSION=3.11
 ARG WHEEL_VERSION=""
+## Enable Aimstack if requested via ENABLE_AIM set to "true"
+ARG ENABLE_AIM=false
+ARG ENABLE_FMS_ACCELERATION=false
 
 ## Base Layer ##################################################################
 FROM registry.access.redhat.com/ubi9/ubi:${BASE_UBI_IMAGE_TAG} AS base
@@ -105,10 +108,8 @@ FROM cuda-devel AS python-installations
 ARG WHEEL_VERSION
 ARG USER
 ARG USER_UID
-ARG ENABLE_FMS_ACCELERATION=false
-
-## Enable Aimstack if requested via ENABLE_AIM set to "true"
-ARG ENABLE_AIM=false
+ARG ENABLE_FMS_ACCELERATION
+ARG ENABLE_AIM
 
 RUN dnf install -y git && \
     # perl-Net-SSLeay.x86_64 and server_key.pem are installed with git as dependencies
@@ -154,6 +155,7 @@ RUN python -m pip uninstall wheel build -y && \
 FROM release-base AS release
 ARG USER
 ARG PYTHON_VERSION
+ARG ENABLE_AIM
 
 RUN mkdir -p /licenses
 COPY LICENSE /licenses/

From b15a07b5e67be7499c51716ff5132b63193d209f Mon Sep 17 00:00:00 2001
From: Abhishek Maurya <124327945+Abhishek-TAMU@users.noreply.github.com>
Date: Thu, 12 Sep 2024 21:15:01 -0400
Subject: [PATCH 13/16] fix: remove lm_head post processing (#333)

* fix: Removal of lm head hack

Signed-off-by: Abhishek <maurya.abhishek@ibm.com>

* set fms_accelerate to true by default

Signed-off-by: Anh Uong <anh.uong@ibm.com>

---------

Signed-off-by: Abhishek <maurya.abhishek@ibm.com>
Signed-off-by: Anh Uong <anh.uong@ibm.com>
Co-authored-by: Anh Uong <anh.uong@ibm.com>
Signed-off-by: Angel Luu <angel.luu@us.ibm.com>
---
 build/Dockerfile           |  2 +-
 build/accelerate_launch.py | 92 --------------------------------------
 2 files changed, 1 insertion(+), 93 deletions(-)

diff --git a/build/Dockerfile b/build/Dockerfile
index 4bd9cab6a..ffae818da 100644
--- a/build/Dockerfile
+++ b/build/Dockerfile
@@ -21,7 +21,7 @@ ARG PYTHON_VERSION=3.11
 ARG WHEEL_VERSION=""
 ## Enable Aimstack if requested via ENABLE_AIM set to "true"
 ARG ENABLE_AIM=false
-ARG ENABLE_FMS_ACCELERATION=false
+ARG ENABLE_FMS_ACCELERATION=true
 
 ## Base Layer ##################################################################
 FROM registry.access.redhat.com/ubi9/ubi:${BASE_UBI_IMAGE_TAG} AS base
diff --git a/build/accelerate_launch.py b/build/accelerate_launch.py
index d7753728c..50d8eef0c 100644
--- a/build/accelerate_launch.py
+++ b/build/accelerate_launch.py
@@ -24,18 +24,13 @@
 import sys
 import traceback
 from pathlib import Path
-import json
 
 # Third Party
 from accelerate.commands.launch import launch_command
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from peft import PeftModel
-from torch import bfloat16
 
 # Local
 from build.utils import (
     process_accelerate_launch_args,
-    get_highest_checkpoint,
 )
 from tuning.utils.config_utils import get_json_config
 from tuning.utils.error_logging import (
@@ -43,18 +38,10 @@
     USER_ERROR_EXIT_CODE,
     INTERNAL_ERROR_EXIT_CODE,
 )
-from tuning.data import tokenizer_data_utils
 
 ERROR_LOG = "/dev/termination-log"
 
 
-def get_base_model_from_adapter_config(adapter_config):
-    """Given path to adapter_config.json file, returns the base model name"""
-    with open(adapter_config, "r", encoding="utf-8") as config_file:
-        adapter_config = json.load(config_file)
-        return adapter_config.get("base_model_name_or_path")
-
-
 def main():
     if not os.getenv("TERMINATION_LOG_FILE"):
         os.environ["TERMINATION_LOG_FILE"] = ERROR_LOG
@@ -128,85 +115,6 @@ def main():
         write_termination_log(f"Unhandled exception during training. {e}")
         sys.exit(INTERNAL_ERROR_EXIT_CODE)
 
-    # remove lm_head from granite with llama arch models
-    try:
-        checkpoint_dir = job_config.get("save_model_dir")
-        if not checkpoint_dir:
-            checkpoint_dir = os.path.join(
-                output_dir, get_highest_checkpoint(output_dir)
-            )
-
-        use_flash_attn = job_config.get("use_flash_attn", True)
-        adapter_config_path = os.path.join(checkpoint_dir, "adapter_config.json")
-        tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
-
-        if os.path.exists(adapter_config_path):
-            base_model_path = get_base_model_from_adapter_config(adapter_config_path)
-            base_model = AutoModelForCausalLM.from_pretrained(
-                base_model_path,
-                attn_implementation="flash_attention_2" if use_flash_attn else None,
-                torch_dtype=bfloat16 if use_flash_attn else None,
-            )
-
-            # since the peft library (PEFTModelForCausalLM) does not handle cases
-            # where the model's layers are modified, in our case the embedding layer
-            # is modified, so we resize the backbone model's embedding layer with our own
-            # utility before passing it along to load the PEFT model.
-            tokenizer_data_utils.tokenizer_and_embedding_resize(
-                {}, tokenizer=tokenizer, model=base_model
-            )
-            model = PeftModel.from_pretrained(
-                base_model,
-                checkpoint_dir,
-                attn_implementation="flash_attention_2" if use_flash_attn else None,
-                torch_dtype=bfloat16 if use_flash_attn else None,
-            )
-        else:
-            model = AutoModelForCausalLM.from_pretrained(
-                checkpoint_dir,
-                attn_implementation="flash_attention_2" if use_flash_attn else None,
-                torch_dtype=bfloat16 if use_flash_attn else None,
-            )
-
-        model_arch = model.config.model_type
-        # check that it is a granite model with llama architecture with tied weights
-        # ie. lm_head is duplicate of embeddings
-
-        # a fine tuned model will have params_dict.get("model.embed_tokens.weight")
-        # a prompt adapter has params_dict.get("base_model.model.embed_tokens.weight")
-        # a lora adapter has params_dict.get("base_model.model.model.embed_tokens.weight")
-        if model_arch == "llama" and hasattr(model, "lm_head"):
-            if (
-                # lora tuned model has an addt model layer
-                (
-                    hasattr(model.model, "model")
-                    and model.lm_head.weight.untyped_storage().data_ptr()
-                    == model.model.model.embed_tokens.weight.untyped_storage().data_ptr()
-                )
-                # prompt tuned model or fine tuned model
-                or (
-                    hasattr(model.model, "embed_tokens")
-                    and model.lm_head.weight.untyped_storage().data_ptr()
-                    == model.model.embed_tokens.weight.untyped_storage().data_ptr()
-                )
-            ):
-
-                logging.info("Removing lm_head from checkpoint")
-                del model.lm_head.weight
-
-                if hasattr(model, "lm_head.weight"):
-                    logging.warning("Failed to delete lm_head.weight from model")
-
-                logging.info("Saving checkpoint to %s", output_dir)
-                model.save_pretrained(checkpoint_dir)
-                # save tokenizer with model
-                tokenizer.save_pretrained(checkpoint_dir)
-
-    except Exception as e:  # pylint: disable=broad-except
-        logging.error(traceback.format_exc())
-        write_termination_log(f"Exception encountered removing lm_head from model: {e}")
-        sys.exit(INTERNAL_ERROR_EXIT_CODE)
-
     # The .complete file will signal to users that we are finished copying
     # files over
     if os.path.exists(output_dir):

From 5dd5494a4e0fae9bd568bb868925b80dac6653ae Mon Sep 17 00:00:00 2001
From: Angel Luu <an317gel@gmail.com>
Date: Mon, 16 Sep 2024 09:45:40 -0600
Subject: [PATCH 14/16] feat: Add deps to evaluate qLora tuned model (#312)

* Add support to load qLora tuned model in run_inference.py script

Signed-off-by: Angel Luu <angel.luu@us.ibm.com>

* Remove comment

Signed-off-by: Angel Luu <angel.luu@us.ibm.com>

* Disable gptq by default

Signed-off-by: Angel Luu <angel.luu@us.ibm.com>

* Remove the gptq-dev install in Dockerfile

Signed-off-by: Angel Luu <angel.luu@us.ibm.com>

* Rename gptq-dev package from gptq

Signed-off-by: Angel Luu <angel.luu@us.ibm.com>

* Add comments in run_inference.py

Signed-off-by: Angel Luu <angel.luu@us.ibm.com>

* Update device to cuda

Signed-off-by: Angel Luu <angel.luu@us.ibm.com>

* Add in the case that there's no adapter found

Signed-off-by: Angel Luu <angel.luu@us.ibm.com>

* Use torch.float16 for quantized

Signed-off-by: Angel Luu <angel.luu@us.ibm.com>

---------

Signed-off-by: Angel Luu <angel.luu@us.ibm.com>
---
 pyproject.toml           |  1 +
 scripts/run_inference.py | 73 +++++++++++++++++++++++++++++++---------
 2 files changed, 59 insertions(+), 15 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index aae1a9dd7..fcb049821 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,6 +45,7 @@ dev = ["wheel>=0.42.0,<1.0", "packaging>=23.2,<25", "ninja>=1.11.1.1,<2.0", "sci
 flash-attn = ["flash-attn>=2.5.3,<3.0"]
 aim = ["aim>=3.19.0,<4.0"]
 fms-accel = ["fms-acceleration>=0.1"]
+gptq-dev = ["auto_gptq>0.4.2", "optimum>=1.15.0"]
 
 
 [tool.setuptools.packages.find]
diff --git a/scripts/run_inference.py b/scripts/run_inference.py
index d64bf926b..7e4465cac 100644
--- a/scripts/run_inference.py
+++ b/scripts/run_inference.py
@@ -30,7 +30,7 @@
 # Third Party
 from peft import PeftModel
 from tqdm import tqdm
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
 import torch
 
 # Local
@@ -176,6 +176,8 @@ def load(
             else {}
         )
         tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
+        device = "cuda" if torch.cuda.is_available() else None
+        print(f"Inferred device: {device}")
         # Apply the configs to the adapter config of this model; if no overrides
         # are provided, then the context manager doesn't have any effect.
         try:
@@ -183,13 +185,36 @@ def load(
                 try:
                     if base_model_name_or_path is None:
                         raise ValueError("base_model_name_or_path has to be passed")
-                    base_model = AutoModelForCausalLM.from_pretrained(
-                        base_model_name_or_path,
-                        attn_implementation="flash_attention_2"
-                        if use_flash_attn
-                        else None,
-                        torch_dtype=torch.bfloat16 if use_flash_attn else None,
-                    )
+
+                    if (
+                        has_quantized_config(base_model_name_or_path)
+                        and device == "cuda"
+                    ):
+                        # Using GPTQConfig from HF, avail params are here
+                        # https://huggingface.co/docs/transformers/en/main_classes/quantization#transformers.GPTQConfig
+                        # We only support 4-bit AutoGPTQ, so setting bits to 4
+                        # setting exllama kernel to version 2 as it's a faster kernel
+                        gptq_config = GPTQConfig(bits=4, exllama_config={"version": 2})
+
+                        # Since we are using exllama kernel, we need torch.float16 as torch_dtype
+                        base_model = AutoModelForCausalLM.from_pretrained(
+                            base_model_name_or_path,
+                            attn_implementation="flash_attention_2"
+                            if use_flash_attn
+                            else None,
+                            device_map=device,
+                            torch_dtype=torch.float16,
+                            quantization_config=gptq_config,
+                        )
+                    else:
+                        base_model = AutoModelForCausalLM.from_pretrained(
+                            base_model_name_or_path,
+                            attn_implementation="flash_attention_2"
+                            if use_flash_attn
+                            else None,
+                            torch_dtype=torch.bfloat16 if use_flash_attn else None,
+                        )
+
                     # since the peft library (PEFTModelForCausalLM) does not handle cases
                     # where the model's layers are modified, in our case the embedding layer
                     # is modified, so we resize the backbone model's embedding layer with our own
@@ -211,14 +236,28 @@ def load(
         except FileNotFoundError:
             print("No adapter config found! Loading as a merged model...")
             # Unable to find the adapter config; fall back to loading as a merged model
-            model = AutoModelForCausalLM.from_pretrained(
-                checkpoint_path,
-                attn_implementation="flash_attention_2" if use_flash_attn else None,
-                torch_dtype=torch.bfloat16 if use_flash_attn else None,
-            )
+            if has_quantized_config(checkpoint_path) and device == "cuda":
+                # Using GPTQConfig from HF, avail params are here
+                # https://huggingface.co/docs/transformers/en/main_classes/quantization#transformers.GPTQConfig
+                # We only support 4-bit AutoGPTQ, so setting bits to 4
+                # setting exllama kernel to version 2 as it's a faster kernel
+                gptq_config = GPTQConfig(bits=4, exllama_config={"version": 2})
+
+                # Since we are using exllama kernel, we need torch.float16 as torch_dtype
+                model = AutoModelForCausalLM.from_pretrained(
+                    checkpoint_path,
+                    attn_implementation="flash_attention_2" if use_flash_attn else None,
+                    device_map=device,
+                    torch_dtype=torch.float16,
+                    quantization_config=gptq_config,
+                )
+            else:
+                model = AutoModelForCausalLM.from_pretrained(
+                    checkpoint_path,
+                    attn_implementation="flash_attention_2" if use_flash_attn else None,
+                    torch_dtype=torch.bfloat16 if use_flash_attn else None,
+                )
 
-        device = "cuda" if torch.cuda.is_available() else None
-        print(f"Inferred device: {device}")
         model.to(device)
         return cls(model, tokenizer, device)
 
@@ -327,5 +366,9 @@ def main():
     print(f"Exported results to: {args.out_file}")
 
 
+def has_quantized_config(model_path: str):
+    return os.path.exists(os.path.join(model_path, "quantize_config.json"))
+
+
 if __name__ == "__main__":
     main()

From cd6ba00623c2b1598f2082b775161911970c5eba Mon Sep 17 00:00:00 2001
From: Abhishek Maurya <124327945+Abhishek-TAMU@users.noreply.github.com>
Date: Mon, 16 Sep 2024 12:27:01 -0400
Subject: [PATCH 15/16] feat: Add support for smoothly resuming training from a
 saved checkpoint (#300)

* Add feature of resume training

Signed-off-by: Abhishek <maurya.abhishek@ibm.com>

* Remove lastcheckpoints conditions

Signed-off-by: Abhishek <maurya.abhishek@ibm.com>

* PR Changes

Signed-off-by: Abhishek <maurya.abhishek@ibm.com>

* feat:resume tuning based on value from user's flag

Signed-off-by: Abhishek <maurya.abhishek@ibm.com>

* test:added unit tests for resume tuning feature

Signed-off-by: Abhishek <maurya.abhishek@ibm.com>

* test: PR changes of resume from checkpoint feature

Signed-off-by: Abhishek <maurya.abhishek@ibm.com>

* fix: Modified test fn descripts, added readme

Signed-off-by: Abhishek <maurya.abhishek@ibm.com>

---------

Signed-off-by: Abhishek <maurya.abhishek@ibm.com>
Co-authored-by: Anh Uong <anh.uong@ibm.com>
---
 .pylintrc                 |   2 +-
 README.md                 |   5 +
 tests/test_sft_trainer.py | 208 ++++++++++++++++++++++++++++++++++++++
 tuning/sft_trainer.py     |  22 +++-
 4 files changed, 234 insertions(+), 3 deletions(-)

diff --git a/.pylintrc b/.pylintrc
index e94869511..d6f8a5d6c 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -333,7 +333,7 @@ indent-string='    '
 max-line-length=100
 
 # Maximum number of lines in a module.
-max-module-lines=1100
+max-module-lines=1200
 
 # Allow the body of a class to be on the same line as the declaration if body
 # contains single statement.
diff --git a/README.md b/README.md
index 26c1da347..7fd8fd5d7 100644
--- a/README.md
+++ b/README.md
@@ -278,6 +278,11 @@ You can set `output_dir` to a local directory and set `save_model_dir` to COS to
 
 In order to achieve the fastest train time, set `save_strategy="no"`, as saving no checkpoints except for the final model will remove intermediate write operations all together.
 
+#### Resuming tuning from checkpoints
+If the output directory already contains checkpoints, tuning will automatically resume from the latest checkpoint in the directory specified by the `output_dir` flag. To start tuning from scratch and ignore existing checkpoints, set the `resume_from_checkpoint` flag to `False`.
+
+You can also use the `resume_from_checkpoint` flag to resume tuning from a specific checkpoint by providing the full path to the desired checkpoint as a string. This flag is passed as an argument to the [trainer.train()](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/src/transformers/trainer.py#L1901) function of the SFTTrainer.
+
 ## Tuning Techniques:
 
 ### LoRA Tuning Example
diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py
index 251f6d6b9..2d55b7de4 100644
--- a/tests/test_sft_trainer.py
+++ b/tests/test_sft_trainer.py
@@ -80,6 +80,214 @@
 PEFT_LORA_ARGS = peft_config.LoraConfig(r=8, lora_alpha=32, lora_dropout=0.05)
 
 
+def test_resume_training_from_checkpoint():
+    """
+    Test that tuning resumes from the latest checkpoint, creating new checkpoints, and that
+    checkpoints created before resuming tuning are not affected.
+    """
+    with tempfile.TemporaryDirectory() as tempdir:
+        train_args = copy.deepcopy(TRAIN_ARGS)
+        train_args.output_dir = tempdir
+
+        sft_trainer.train(MODEL_ARGS, DATA_ARGS, train_args, None)
+        _validate_training(tempdir)
+
+        # Get trainer state of latest checkpoint
+        init_trainer_state, _ = _get_latest_checkpoint_trainer_state(tempdir)
+        assert init_trainer_state is not None
+
+        # Resume training with higher epoch and same output dir
+        train_args.num_train_epochs += 5
+        sft_trainer.train(MODEL_ARGS, DATA_ARGS, train_args, None)
+        _validate_training(tempdir)
+
+        # Get trainer state of latest checkpoint
+        final_trainer_state, _ = _get_latest_checkpoint_trainer_state(tempdir)
+        assert final_trainer_state is not None
+
+        assert final_trainer_state["epoch"] == init_trainer_state["epoch"] + 5
+        assert final_trainer_state["global_step"] > init_trainer_state["global_step"]
+
+        # Check if loss of 1st epoch after first tuning is same after
+        # resuming tuning and not overwritten
+        assert len(init_trainer_state["log_history"]) > 0
+
+        init_log_history = init_trainer_state["log_history"][0]
+        assert init_log_history["epoch"] == 1
+
+        final_log_history = final_trainer_state["log_history"][0]
+        assert final_log_history["epoch"] == 1
+
+        assert init_log_history["loss"] == final_log_history["loss"]
+
+
+def test_resume_training_from_checkpoint_with_flag_true():
+    """
+    Test that tuning resumes from the latest checkpoint when the flag is true,
+    creating new checkpoints, and that checkpoints created before resuming
+    tuning are not affected.
+    """
+    with tempfile.TemporaryDirectory() as tempdir:
+        train_args = copy.deepcopy(TRAIN_ARGS)
+        train_args.output_dir = tempdir
+        train_args.resume_from_checkpoint = "True"
+
+        sft_trainer.train(MODEL_ARGS, DATA_ARGS, train_args, None)
+        _validate_training(tempdir)
+
+        # Get trainer state of latest checkpoint
+        init_trainer_state, _ = _get_latest_checkpoint_trainer_state(tempdir)
+        assert init_trainer_state is not None
+
+        # Get Training logs
+        init_training_logs = _get_training_logs_by_epoch(tempdir)
+
+        # Resume training with higher epoch and same output dir
+        train_args.num_train_epochs += 5
+        sft_trainer.train(MODEL_ARGS, DATA_ARGS, train_args, None)
+        _validate_training(tempdir)
+
+        # Get trainer state of latest checkpoint
+        final_trainer_state, _ = _get_latest_checkpoint_trainer_state(tempdir)
+        assert final_trainer_state is not None
+
+        assert final_trainer_state["epoch"] == init_trainer_state["epoch"] + 5
+        assert final_trainer_state["global_step"] > init_trainer_state["global_step"]
+
+        final_training_logs = _get_training_logs_by_epoch(tempdir)
+
+        assert (
+            init_training_logs[0]["data"]["timestamp"]
+            == final_training_logs[0]["data"]["timestamp"]
+        )
+
+
+def test_resume_training_from_checkpoint_with_flag_false():
+    """
+    Test when setting resume_from_checkpoint=False that tuning will start from scratch.
+    """
+    with tempfile.TemporaryDirectory() as tempdir:
+        train_args = copy.deepcopy(TRAIN_ARGS)
+        train_args.output_dir = tempdir
+        train_args.resume_from_checkpoint = "False"
+
+        sft_trainer.train(MODEL_ARGS, DATA_ARGS, train_args, None)
+        _validate_training(tempdir)
+
+        # Get trainer state of latest checkpoint
+        init_trainer_state, _ = _get_latest_checkpoint_trainer_state(tempdir)
+        assert init_trainer_state is not None
+
+        # Get Training log entry for epoch 1
+        init_training_logs = _get_training_logs_by_epoch(tempdir, epoch=1)
+        assert len(init_training_logs) == 1
+
+        # Training again with higher epoch and same output dir
+        train_args.num_train_epochs += 5
+        sft_trainer.train(MODEL_ARGS, DATA_ARGS, train_args, None)
+        _validate_training(tempdir)
+
+        # Get Training log entry for epoch 1
+        final_training_logs = _get_training_logs_by_epoch(tempdir, epoch=1)
+        assert len(final_training_logs) == 2
+
+
+def test_resume_training_from_checkpoint_with_flag_checkpoint_path_lora():
+    """
+    Test resume checkpoint from a specified checkpoint path for LoRA tuning.
+    """
+    with tempfile.TemporaryDirectory() as tempdir:
+        train_args = copy.deepcopy(TRAIN_ARGS)
+        lora_config = copy.deepcopy(PEFT_LORA_ARGS)
+        train_args.output_dir = tempdir
+
+        sft_trainer.train(MODEL_ARGS, DATA_ARGS, train_args, lora_config)
+        _validate_training(tempdir)
+
+        # Get trainer state and checkpoint_path of second last checkpoint
+        init_trainer_state, checkpoint_path = _get_latest_checkpoint_trainer_state(
+            tempdir, checkpoint_index=-2
+        )
+        assert init_trainer_state is not None
+
+        # Resume training with higher epoch and same output dir
+        train_args.num_train_epochs += 5
+        train_args.resume_from_checkpoint = checkpoint_path
+        sft_trainer.train(MODEL_ARGS, DATA_ARGS, train_args, lora_config)
+        _validate_training(tempdir)
+
+        # Get total_flos from trainer state of checkpoint_path and check that it is unchanged
+        final_trainer_state = None
+        trainer_state_file = os.path.join(checkpoint_path, "trainer_state.json")
+        with open(trainer_state_file, "r", encoding="utf-8") as f:
+            final_trainer_state = json.load(f)
+
+        assert final_trainer_state["total_flos"] == init_trainer_state["total_flos"]
+
+
+def _get_latest_checkpoint_trainer_state(dir_path: str, checkpoint_index: int = -1):
+    """
+    Get the trainer state from the latest or specified checkpoint directory.
+    The trainer state is returned along with the path to the checkpoint.
+
+    Args:
+        dir_path (str): The directory path where checkpoint folders are located.
+        checkpoint_index (int, optional): The index of the checkpoint to retrieve,
+                                          based on the checkpoint number. The default
+                                          is -1, which returns the latest checkpoint.
+
+    Returns:
+        trainer_state: The trainer state loaded from `trainer_state.json` in the
+                            checkpoint directory.
+        last_checkpoint: The path to the checkpoint directory.
+    """
+    trainer_state = None
+    last_checkpoint = None
+    checkpoints = [
+        os.path.join(dir_path, d)
+        for d in os.listdir(dir_path)
+        if d.startswith("checkpoint")
+    ]
+    if checkpoints:
+        last_checkpoint = sorted(checkpoints, key=lambda x: int(x.split("-")[-1]))[
+            checkpoint_index
+        ]
+        trainer_state_file = os.path.join(last_checkpoint, "trainer_state.json")
+        with open(trainer_state_file, "r", encoding="utf-8") as f:
+            trainer_state = json.load(f)
+    return trainer_state, last_checkpoint
+
+
+def _get_training_logs_by_epoch(dir_path: str, epoch: int = None):
+    """
+    Load and optionally filter training_logs.jsonl file.
+    If an epoch number is specified, the function filters the logs
+    and returns only the entries corresponding to the specified epoch.
+
+    Args:
+        dir_path (str): The directory path where the `training_logs.jsonl` file is located.
+        epoch (int, optional): The epoch number to filter logs by. If not specified,
+                               all logs are returned.
+
+    Returns:
+        list: A list containing the training logs. If `epoch` is specified,
+              only logs from the specified epoch are returned; otherwise, all logs are returned.
+    """
+    data_list = []
+    with open(f"{dir_path}/training_logs.jsonl", "r", encoding="utf-8") as file:
+        for line in file:
+            json_data = json.loads(line)
+            data_list.append(json_data)
+
+    if epoch:
+        mod_data_list = []
+        for value in data_list:
+            if value["data"]["epoch"] == epoch:
+                mod_data_list.append(value)
+        return mod_data_list
+    return data_list
+
+
 def test_run_train_requires_output_dir():
     """Check fails when output dir not provided."""
     updated_output_dir_train_args = copy.deepcopy(TRAIN_ARGS)
diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py
index 2ab8f7de0..da8fa5172 100644
--- a/tuning/sft_trainer.py
+++ b/tuning/sft_trainer.py
@@ -35,6 +35,7 @@
     LlamaTokenizerFast,
     TrainerCallback,
 )
+from transformers.trainer_utils import get_last_checkpoint
 from transformers.utils import is_accelerate_available
 from trl import SFTConfig, SFTTrainer
 import transformers
@@ -215,7 +216,7 @@ def train(
         ),
     )
 
-    # add special tokens only when a custom tokenizer is not passed
+    # Add special tokens only when a custom tokenizer is not passed
     if not model_args.tokenizer_name_or_path:
         # TODO: understand if we need to hardcode these here or just use defaults in model
         if isinstance(tokenizer, (LlamaTokenizer, LlamaTokenizerFast)):
@@ -366,7 +367,24 @@ def train(
         for x in framework.get_callbacks_and_ready_for_train(model, accelerator):
             trainer.add_callback(x)
 
-    trainer.train()
+    resume_from_checkpoint = None
+    # If the resume flag is not passed (None) or is "true", resume from the
+    # last checkpoint found in output_dir, if there is one
+    if (
+        training_args.resume_from_checkpoint is None
+        or training_args.resume_from_checkpoint.lower() == "true"
+    ):
+        resume_from_checkpoint = get_last_checkpoint(training_args.output_dir)
+    else:
+        # `training_args.resume_from_checkpoint` is a string here: "false" disables
+        # resuming, any other value is used as the checkpoint to resume from
+        resume_from_checkpoint = (
+            training_args.resume_from_checkpoint
+            if training_args.resume_from_checkpoint.lower() != "false"
+            else False
+        )
+
+    trainer.train(resume_from_checkpoint)
 
     return trainer
 

From 229e230b1ed4dfaea7a88d7002e4c4f098b5c109 Mon Sep 17 00:00:00 2001
From: Hari <harikrishmenon@gmail.com>
Date: Mon, 16 Sep 2024 21:59:33 +0530
Subject: [PATCH 16/16] ci: add a github workflow to label pull requests based
 on their title (#298)

Signed-off-by: Harikrishnan Balagopal <harikrishmenon@gmail.com>
Signed-off-by: Anh Uong <anh.uong@ibm.com>
---
 .github/workflows/labelpr.yaml | 35 ++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 .github/workflows/labelpr.yaml

diff --git a/.github/workflows/labelpr.yaml b/.github/workflows/labelpr.yaml
new file mode 100644
index 000000000..c14131cbf
--- /dev/null
+++ b/.github/workflows/labelpr.yaml
@@ -0,0 +1,35 @@
+name: Label PRs
+
+on:
+  pull_request_target:
+    types: [opened, edited, synchronize, reopened] # 'edited' covers title changes
+
+jobs:
+  label_pr:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/github-script@v3
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          script: |
+            const pr_welcome_msg = `Thanks for making a pull request! 😃\nOne of the maintainers will review and advise on the next steps.`;
+            // Allowed PR types, see https://github.com/commitizen/conventional-commit-types
+            const valid_pr_types = ['feat', 'fix', 'docs', 'style', 'refactor', 'perf', 'test', 'build', 'ci', 'chore', 'revert'];
+
+            if(context.payload.pull_request.comments === 0) { // greet only while the PR has no comments yet
+              await github.issues.createComment({ ...context.repo, issue_number: context.payload.number, body: pr_welcome_msg});
+            }
+
+            const title = context.payload.pull_request.title;
+            const results = /^(\w+)(\([\w.\/-]+\))?!?:/.exec(title); // type, optional "(scope)" (may contain . / -), optional "!", then ":"
+            if (results === null) return core.setFailed(`The title does not follow conventional commits spec: https://www.conventionalcommits.org/en/v1.0.0/#summary Title: ${title}`);
+
+            const pr_type = results[1]; // first capture group: the commit type before the optional scope
+            core.info(`pr_type: ${pr_type}`);
+
+            if (!valid_pr_types.includes(pr_type)) return core.setFailed(`Unknown pull request type: ${pr_type}`);
+
+            const labels = context.payload.pull_request.labels;
+            const new_labels = labels.filter(label => !valid_pr_types.includes(label.name)); // keep all labels that are not in valid_pr_types
+            new_labels.push({name: pr_type}); // then add the type parsed from the title, replacing any previous type label
+            await github.issues.update({ ...context.repo, issue_number: context.payload.number, labels: new_labels });
\ No newline at end of file