fix: Modified test fn descripts, added readme
Signed-off-by: Abhishek <[email protected]>
Abhishek-TAMU committed Sep 12, 2024
1 parent e06b2d9 commit 854aec2
Showing 3 changed files with 18 additions and 13 deletions.
2 changes: 1 addition & 1 deletion .pylintrc
@@ -333,7 +333,7 @@ indent-string=' '
max-line-length=100

# Maximum number of lines in a module.
-max-module-lines=1100
+max-module-lines=1200

# Allow the body of a class to be on the same line as the declaration if body
# contains single statement.
5 changes: 5 additions & 0 deletions README.md
@@ -277,6 +277,11 @@ You can set `output_dir` to a local directory and set `save_model_dir` to COS to

To achieve the fastest training time, set `save_strategy="no"`; saving no checkpoints other than the final model removes intermediate write operations altogether.
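
As a minimal sketch of how that setting maps onto `transformers.TrainingArguments` (the output path and epoch count here are hypothetical):

```python
from transformers import TrainingArguments

# save_strategy="no" skips every intermediate checkpoint write;
# only an explicit final save touches disk.
training_args = TrainingArguments(
    output_dir="/tmp/tuning-output",  # hypothetical output path
    save_strategy="no",
    num_train_epochs=5,
)
```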

#### Resuming tuning from checkpoints
If the output directory already contains checkpoints, tuning will automatically resume from the latest checkpoint in the directory specified by the `output_dir` flag. To start tuning from scratch and ignore existing checkpoints, set the `resume_from_checkpoint` flag to `False`.

You can also use the `resume_from_checkpoint` flag to resume tuning from a specific checkpoint by providing the full path to the desired checkpoint as a string. This flag is passed as an argument to the [trainer.train()](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/src/transformers/trainer.py#L1901) function of the SFTTrainer.
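
As a minimal sketch of the three accepted values (assuming `trainer` is an already-configured SFTTrainer; the checkpoint path is hypothetical):

```python
# Resume from the latest checkpoint found under output_dir:
trainer.train(resume_from_checkpoint=True)

# Ignore any existing checkpoints and tune from scratch:
# trainer.train(resume_from_checkpoint=False)

# Resume from one specific checkpoint, passed as a full path string:
# trainer.train(resume_from_checkpoint="/tmp/tuning-output/checkpoint-500")
```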

## Tuning Techniques:

### LoRA Tuning Example
24 changes: 12 additions & 12 deletions tests/test_sft_trainer.py
@@ -122,6 +122,11 @@ def test_resume_training_from_checkpoint():


def test_resume_training_from_checkpoint_with_flag_true():
"""
Test tuning resumes from the latest checkpoint when flag is true,
creating new checkpoints and the checkpoints created before resuming
tuning is not affected.
"""
with tempfile.TemporaryDirectory() as tempdir:
train_args = copy.deepcopy(TRAIN_ARGS)
train_args.output_dir = tempdir
@@ -187,10 +192,9 @@ def test_resume_training_from_checkpoint_with_flag_false():
assert len(final_training_logs) == 2


-def test_resume_training_from_checkpoint_with_flag_checkpoint_path():
+def test_resume_training_from_checkpoint_with_flag_checkpoint_path_lora():
"""
-    Test when setting resume_from_checkpoint=path/to/checkpoint-x
-    that the tuning will resume from the checkpoint-x.
+    Test resuming tuning from a specified checkpoint path for LoRA tuning.
"""
with tempfile.TemporaryDirectory() as tempdir:
train_args = copy.deepcopy(TRAIN_ARGS)
@@ -223,11 +227,8 @@

def _get_latest_checkpoint_trainer_state(dir_path: str, checkpoint_index: int = -1):
"""
-    Get the trainer state from the specified checkpoint directory.
-    This function gets the latest or specific checkpoint based on the
-    provided checkpoint_index from the checkpoint directory, and loads
-    the `trainer_state.json` file from that checkpoint. The trainer
-    state is returned along with the path to the checkpoint.
+    Get the trainer state from the latest or specified checkpoint directory.
+    The trainer state is returned along with the path to the checkpoint.
Args:
dir_path (str): The directory path where checkpoint folders are located.
@@ -259,10 +260,9 @@ def _get_latest_checkpoint_trainer_state(dir_path: str, checkpoint_index: int =

def _get_training_logs_by_epoch(dir_path: str, epoch: int = None):
"""
-    Load and optionally filter training logs from a training_logs JSON Lines file.
-    This function reads a JSON Lines (`.jsonl`) file containing training logs and
-    returns the data as a list. If an epoch number is specified, the function filters
-    the logs and returns only the entries corresponding to the specified epoch.
+    Load and optionally filter the training_logs.jsonl file.
+    If an epoch number is specified, the function filters the logs
+    and returns only the entries corresponding to the specified epoch.
Args:
dir_path (str): The directory path where the `training_logs.jsonl` file is located.
