From 3ac941d9d84bb75870788734b0ea97ce5e2f3b07 Mon Sep 17 00:00:00 2001 From: Kelly A Date: Fri, 8 Mar 2024 13:16:50 -0500 Subject: [PATCH 1/3] Allow SFT_TRAINER_CONFIG_JSON_ENV_VAR to be encoded json string (#82) * allow SFT_TRAINER_CONFIG_JSON_ENV_VAR to be encoded json string, not just pickled python Signed-off-by: Kelly A * linting fix Signed-off-by: Kelly A * fix error catching Signed-off-by: Kelly A --------- Signed-off-by: Kelly A --- build/launch_training.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/build/launch_training.py b/build/launch_training.py index 57ede1f6b..5592d5888 100644 --- a/build/launch_training.py +++ b/build/launch_training.py @@ -39,8 +39,12 @@ def txt_to_obj(txt): base64_bytes = txt.encode("ascii") message_bytes = base64.b64decode(base64_bytes) - obj = pickle.loads(message_bytes) - return obj + try: + # If the bytes represent JSON string + return json.loads(message_bytes) + except UnicodeDecodeError: + # Otherwise the bytes are a pickled python dictionary + return pickle.loads(message_bytes) def get_highest_checkpoint(dir_path): From 3f83a3ddba4a4f983347f67dbf1474f292fd452d Mon Sep 17 00:00:00 2001 From: ted chang Date: Fri, 8 Mar 2024 11:52:50 -0800 Subject: [PATCH 2/3] Document lint (#84) * document lint Signed-off-by: ted chang * add drop down Signed-off-by: ted chang --------- Signed-off-by: ted chang Co-authored-by: Sukriti Sharma --- CONTRIBUTING.md | 52 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4ec6a2273..375096ed7 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -80,10 +80,58 @@ pip install -r requirements.txt pip install -U datasets pip install -e . ``` +
+<details><summary>Linting</summary>

-### Unit tests

+To lint your code:
+```shell
+tox -e lint
+```
+
+We use Pylint to check your Python code for errors, coding standards, code conventions, and refactoring suggestions.
+
+Pylint emits [messages](https://pylint.pycqa.org/en/latest/user_guide/messages/index.html) that provide explanations of the failed checks.
+
+You should fix all messages in the following order:
+1. Fix each message provided. Select a message [description](https://pylint.pycqa.org/en/latest/user_guide/messages/messages_overview.html#messages-overview) to learn how to fix it.
+2. Disable a message (e.g. `unbalanced-tuple-unpacking`) caused by a particular line of code:
+    ```python
+    a, b = ...  # pylint: disable=unbalanced-tuple-unpacking
+    ```
+    Please see [here](https://pylint.pycqa.org/en/latest/user_guide/messages/message_control.html#block-disables) for the pragma syntax.
+
+3. Disable a checker globally. Please extend the `disable=` list in the [pylintrc](.pylintrc) file.
+   > Note: Disable checkers only if there is a good reason.
+</details>
+
+<details><summary>Formatting</summary>
+
+To format your code:
+```shell
+tox -e fmt
+```
+We use the [black](https://github.com/psf/black) formatter to format the code.
+
+You can optionally install the git pre-commit hooks if you would like to format the code automatically for each commit:
+```shell
+brew install pre-commit
+pre-commit install
+```
+</details>
+
+<details><summary>Unit tests</summary>
+
+To run unit tests:
+```shell
+tox -e py
+```
+Running unit tests ensures your contributions do not break existing code.
+We use the [pytest](https://docs.pytest.org/) framework to run unit tests. The framework is set up to run all `test_*.py` or `*_test.py` files in the [tests](./tests) directory.

-Work in process, to be completed soon.
+> Optionally, run the `make test` command to do formatting, linting, and testing at once.
+</details>
## Your First Code Contribution From 07298204148ef5014c57a8dbe6fbe5d691e2b79c Mon Sep 17 00:00:00 2001 From: Yu Chin Fabian Lim Date: Sat, 9 Mar 2024 08:43:40 +0800 Subject: [PATCH 3/3] Let Huggingface Properly Initialize Arguments, and Fix FSDP-LORA Checkpoint-Saves and Resumption (#53) * training args should call post init to intialize all HF flags Signed-off-by: Yu Chin Fabian Lim * remove run_distribtued flag and peft_saving callback Signed-off-by: Yu Chin Fabian Lim * revert deletion of validation checks on some train args Signed-off-by: Yu Chin Fabian Lim * revert the addition of __post_init__ as it is actually not needed Signed-off-by: Yu Chin Fabian Lim --------- Signed-off-by: Yu Chin Fabian Lim Co-authored-by: Yu Chin Fabian Lim Co-authored-by: Sukriti Sharma --- tuning/sft_trainer.py | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py index 59943a750..29c5fd299 100644 --- a/tuning/sft_trainer.py +++ b/tuning/sft_trainer.py @@ -44,17 +44,6 @@ from tuning.utils.data_type_utils import get_torch_dtype -class PeftSavingCallback(TrainerCallback): - def on_save(self, args, state, control, **kwargs): - checkpoint_path = os.path.join( - args.output_dir, f"checkpoint-{state.global_step}" - ) - kwargs["model"].save_pretrained(checkpoint_path) - - if "pytorch_model.bin" in os.listdir(checkpoint_path): - os.remove(os.path.join(checkpoint_path, "pytorch_model.bin")) - - class FileLoggingCallback(TrainerCallback): """Exports metrics, e.g., training loss to a file in the checkpoint directory.""" @@ -118,7 +107,6 @@ def train( None for fine tuning The peft configuration to pass to trainer """ - run_distributed = int(os.environ.get("WORLD_SIZE", "1")) > 1 logger = logging.get_logger("sft_trainer") @@ -132,11 +120,6 @@ def train( ): raise ValueError("gradient_accumulation_steps has to be an integer >= 1") - # make sure to unset FSDP args when running on single gpu - if not run_distributed: - train_args.fsdp = "" - train_args.fsdp_config = {"xla": False} - task_type = "CAUSAL_LM" model = AutoModelForCausalLM.from_pretrained( model_args.model_name_or_path, @@ -147,8 +130,6 @@ def train( peft_config = get_hf_peft_config(task_type, peft_config) - model.gradient_checkpointing_enable() - # TODO: Move these to a config as well tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, cache_dir=train_args.cache_dir, use_fast=True @@ -239,8 +220,7 @@ def train( aim_callback = get_aimstack_callback() file_logger_callback = FileLoggingCallback(logger) - peft_saving_callback = PeftSavingCallback() - callbacks = [aim_callback, peft_saving_callback, file_logger_callback] + callbacks = [aim_callback, file_logger_callback] if train_args.packing: logger.info("Packing is set to True") @@ -281,7 +261,7 @@ def train( peft_config=peft_config, ) - if run_distributed and peft_config is not None: + if trainer.is_fsdp_enabled and peft_config is not None: trainer.accelerator.state.fsdp_plugin.auto_wrap_policy = fsdp_auto_wrap_policy( model )
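For reference, a minimal sketch of the round trip that PATCH 1/3 enables, assuming only the standard `base64`, `json`, and `pickle` modules; the helper below mirrors the patched `txt_to_obj` in `build/launch_training.py`, and the config keys used here are illustrative only:

```python
import base64
import json
import pickle


def txt_to_obj(txt):
    # Same decoding logic as build/launch_training.py after PATCH 1/3.
    base64_bytes = txt.encode("ascii")
    message_bytes = base64.b64decode(base64_bytes)
    try:
        # If the bytes represent a JSON string
        return json.loads(message_bytes)
    except UnicodeDecodeError:
        # Otherwise the bytes are a pickled python dictionary
        return pickle.loads(message_bytes)


# Hypothetical config; keys are illustrative, not a documented schema.
config = {"model_name_or_path": "/path/to/model", "num_train_epochs": 1}

# New: the env var value may now be a base64-encoded JSON string ...
encoded_json = base64.b64encode(json.dumps(config).encode("utf-8")).decode("ascii")
assert txt_to_obj(encoded_json) == config

# ... while the previous pickled form still works via the fallback branch.
encoded_pickle = base64.b64encode(pickle.dumps(config)).decode("ascii")
assert txt_to_obj(encoded_pickle) == config
```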