From 3ac941d9d84bb75870788734b0ea97ce5e2f3b07 Mon Sep 17 00:00:00 2001 From: Kelly A Date: Fri, 8 Mar 2024 13:16:50 -0500 Subject: [PATCH 1/3] Allow SFT_TRAINER_CONFIG_JSON_ENV_VAR to be encoded json string (#82) * allow SFT_TRAINER_CONFIG_JSON_ENV_VAR to be encoded json string, not just pickled python Signed-off-by: Kelly A * linting fix Signed-off-by: Kelly A * fix error catching Signed-off-by: Kelly A --------- Signed-off-by: Kelly A --- build/launch_training.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/build/launch_training.py b/build/launch_training.py index 57ede1f6b..5592d5888 100644 --- a/build/launch_training.py +++ b/build/launch_training.py @@ -39,8 +39,12 @@ def txt_to_obj(txt): base64_bytes = txt.encode("ascii") message_bytes = base64.b64decode(base64_bytes) - obj = pickle.loads(message_bytes) - return obj + try: + # If the bytes represent JSON string + return json.loads(message_bytes) + except UnicodeDecodeError: + # Otherwise the bytes are a pickled python dictionary + return pickle.loads(message_bytes) def get_highest_checkpoint(dir_path): From 3f83a3ddba4a4f983347f67dbf1474f292fd452d Mon Sep 17 00:00:00 2001 From: ted chang Date: Fri, 8 Mar 2024 11:52:50 -0800 Subject: [PATCH 2/3] Document lint (#84) * document lint Signed-off-by: ted chang * add drop down Signed-off-by: ted chang --------- Signed-off-by: ted chang Co-authored-by: Sukriti Sharma --- CONTRIBUTING.md | 52 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4ec6a2273..375096ed7 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -80,10 +80,58 @@ pip install -r requirements.txt pip install -U datasets pip install -e . ``` +
+<details><summary>Linting</summary>

-### Unit tests

+To lint your code:
+```shell
+tox -e lint
+```
+
+We use Pylint to check your Python code for errors, coding standards, code conventions, and refactoring suggestions.
+
+Pylint emits [messages](https://pylint.pycqa.org/en/latest/user_guide/messages/index.html) that provide explanations of the failed checks.
+
+You should fix all messages in the following order:
+1. Fix each message provided. Select a message [description](https://pylint.pycqa.org/en/latest/user_guide/messages/messages_overview.html#messages-overview) to learn how to fix it.
+2. Disable a message (e.g. `unbalanced-tuple-unpacking`) caused by a particular line of code:
+    ```python
+    a, b = ...  # pylint: disable=unbalanced-tuple-unpacking
+    ```
+    Please see [here](https://pylint.pycqa.org/en/latest/user_guide/messages/message_control.html#block-disables) for the pragma syntax.
+
+3. Disable a checker globally. Please extend the `disable=` list in the [pylintrc](.pylintrc) file.
+   > Note: Disable checkers only if there is a good reason.
+</details>
+
+<details><summary>Formatting</summary>
+
+To format your code:
+```shell
+tox -e fmt
+```
+We use the [black](https://github.com/psf/black) formatter to format the code.
+
+You can optionally install the git pre-commit hooks if you would like to format the code automatically for each commit:
+```shell
+brew install pre-commit
+pre-commit install
+```
+</details>
+
+<details><summary>Unit tests</summary>
+
+To run unit tests:
+```shell
+tox -e py
+```
+Running unit tests ensures your contributions do not break existing code.
+We use the [pytest](https://docs.pytest.org/) framework to run unit tests. The framework is set up to run all `test_*.py` or `*_test.py` files in the [tests](./tests) directory.

-Work in process, to be completed soon.
+> Optionally, run the `make test` command to do formatting, linting, and testing at once.
+</details>
## Your First Code Contribution From 07298204148ef5014c57a8dbe6fbe5d691e2b79c Mon Sep 17 00:00:00 2001 From: Yu Chin Fabian Lim Date: Sat, 9 Mar 2024 08:43:40 +0800 Subject: [PATCH 3/3] Let Huggingface Properly Initialize Arguments, and Fix FSDP-LORA Checkpoint-Saves and Resumption (#53) * training args should call post init to intialize all HF flags Signed-off-by: Yu Chin Fabian Lim * remove run_distribtued flag and peft_saving callback Signed-off-by: Yu Chin Fabian Lim * revert deletion of validation checks on some train args Signed-off-by: Yu Chin Fabian Lim * revert the addition of __post_init__ as it is actually not needed Signed-off-by: Yu Chin Fabian Lim --------- Signed-off-by: Yu Chin Fabian Lim Co-authored-by: Yu Chin Fabian Lim Co-authored-by: Sukriti Sharma --- tuning/sft_trainer.py | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py index 59943a750..29c5fd299 100644 --- a/tuning/sft_trainer.py +++ b/tuning/sft_trainer.py @@ -44,17 +44,6 @@ from tuning.utils.data_type_utils import get_torch_dtype -class PeftSavingCallback(TrainerCallback): - def on_save(self, args, state, control, **kwargs): - checkpoint_path = os.path.join( - args.output_dir, f"checkpoint-{state.global_step}" - ) - kwargs["model"].save_pretrained(checkpoint_path) - - if "pytorch_model.bin" in os.listdir(checkpoint_path): - os.remove(os.path.join(checkpoint_path, "pytorch_model.bin")) - - class FileLoggingCallback(TrainerCallback): """Exports metrics, e.g., training loss to a file in the checkpoint directory.""" @@ -118,7 +107,6 @@ def train( None for fine tuning The peft configuration to pass to trainer """ - run_distributed = int(os.environ.get("WORLD_SIZE", "1")) > 1 logger = logging.get_logger("sft_trainer") @@ -132,11 +120,6 @@ def train( ): raise ValueError("gradient_accumulation_steps has to be an integer >= 1") - # make sure to unset FSDP args when running on single gpu - if not run_distributed: - train_args.fsdp = "" - train_args.fsdp_config = {"xla": False} - task_type = "CAUSAL_LM" model = AutoModelForCausalLM.from_pretrained( model_args.model_name_or_path, @@ -147,8 +130,6 @@ def train( peft_config = get_hf_peft_config(task_type, peft_config) - model.gradient_checkpointing_enable() - # TODO: Move these to a config as well tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, cache_dir=train_args.cache_dir, use_fast=True @@ -239,8 +220,7 @@ def train( aim_callback = get_aimstack_callback() file_logger_callback = FileLoggingCallback(logger) - peft_saving_callback = PeftSavingCallback() - callbacks = [aim_callback, peft_saving_callback, file_logger_callback] + callbacks = [aim_callback, file_logger_callback] if train_args.packing: logger.info("Packing is set to True") @@ -281,7 +261,7 @@ def train( peft_config=peft_config, ) - if run_distributed and peft_config is not None: + if trainer.is_fsdp_enabled and peft_config is not None: trainer.accelerator.state.fsdp_plugin.auto_wrap_policy = fsdp_auto_wrap_policy( model )
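For reference, a minimal sketch of the round trip that PATCH 1/3 enables, assuming only the standard `base64`, `json`, and `pickle` modules; the helper below mirrors the patched `txt_to_obj` in `build/launch_training.py`, and the config keys used here are illustrative only:

```python
import base64
import json
import pickle


def txt_to_obj(txt):
    # Same decoding logic as build/launch_training.py after PATCH 1/3.
    base64_bytes = txt.encode("ascii")
    message_bytes = base64.b64decode(base64_bytes)
    try:
        # If the bytes represent a JSON string
        return json.loads(message_bytes)
    except UnicodeDecodeError:
        # Otherwise the bytes are a pickled python dictionary
        return pickle.loads(message_bytes)


# Hypothetical config; keys are illustrative, not a documented schema.
config = {"model_name_or_path": "/path/to/model", "num_train_epochs": 1}

# New: the env var value may now be a base64-encoded JSON string ...
encoded_json = base64.b64encode(json.dumps(config).encode("utf-8")).decode("ascii")
assert txt_to_obj(encoded_json) == config

# ... while the previous pickled form still works via the fallback branch.
encoded_pickle = base64.b64encode(pickle.dumps(config)).decode("ascii")
assert txt_to_obj(encoded_pickle) == config
```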