[torchtune][dcp] Do not expose checkpoint_client as a public API yet
saumishr committed Dec 13, 2024
1 parent e745717 commit 21b6ded
Showing 5 changed files with 5 additions and 17 deletions.
7 changes: 3 additions & 4 deletions recipes/full_finetune_distributed.py
@@ -24,13 +24,12 @@
 from torchtune.data import padded_collate_packed
 from torchtune.datasets import ConcatDataset
 from torchtune.recipe_interfaces import FTRecipeInterface
-from torchtune.training import (
+from torchtune.training import DummyProfiler, PROFILER_KEY
+from torchtune.training.activations import apply_selective_activation_checkpointing
+from torchtune.training.checkpointing._checkpoint_client import (
     CheckpointClient,
-    DummyProfiler,
-    PROFILER_KEY,
     TrainingProgress,
 )
-from torchtune.training.activations import apply_selective_activation_checkpointing
 from torchtune.training.lr_schedulers import get_lr
 
 from tqdm import tqdm
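
The import change above is the user-visible core of the commit: CheckpointClient and TrainingProgress are no longer re-exported from the public torchtune.training namespace, so the recipe now reaches for them through the private module path. A minimal before/after sketch of the import, with the surrounding recipe code elided:

    # Before this commit (public re-export, now removed):
    # from torchtune.training import CheckpointClient, TrainingProgress

    # After this commit (private module path, as the recipe does above):
    from torchtune.training.checkpointing._checkpoint_client import (
        CheckpointClient,
        TrainingProgress,
    )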
2 changes: 1 addition & 1 deletion recipes/qat_lora_finetune_distributed.py
@@ -213,7 +213,7 @@ def load_checkpoint(self, cfg_checkpointer: DictConfig) -> Dict[str, Any]:
         """
         self._checkpointer = config.instantiate(
             cfg_checkpointer,
-            resume_from_checkpoint=self._resume_from_checkpoint,
+            should_load_recipe_state=self._resume_from_checkpoint,
         )
         checkpoint_dict = self._checkpointer.load_checkpoint()
 
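The keyword rename above is worth a note: should_load_recipe_state presumably states more directly that the flag controls whether the checkpointer also restores saved recipe state (the seed, epoch, and step keys exported elsewhere in this commit), rather than overloading a generic resume_from_checkpoint flag. A hedged, standalone sketch of the same call; the checkpointer class and config fields are typical torchtune recipe settings and are assumptions here, only the should_load_recipe_state keyword comes from this diff:

    from omegaconf import OmegaConf
    from torchtune import config

    # Illustrative checkpointer config; fields mirror common recipe configs.
    cfg_checkpointer = OmegaConf.create(
        {
            "_component_": "torchtune.training.FullModelHFCheckpointer",
            "checkpoint_dir": "/tmp/Meta-Llama-3-8B-Instruct",
            "checkpoint_files": ["model-00001-of-00004.safetensors"],
            "output_dir": "/tmp/finetune-output",
            "model_type": "LLAMA3",
        }
    )

    # The kwarg augments the config at instantiation time; before this commit
    # the recipe passed resume_from_checkpoint here instead.
    checkpointer = config.instantiate(
        cfg_checkpointer,
        should_load_recipe_state=False,
    )
    checkpoint_dict = checkpointer.load_checkpoint()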
4 changes: 0 additions & 4 deletions torchtune/training/__init__.py
@@ -36,7 +36,6 @@
 from torchtune.training.checkpointing import (
     ADAPTER_CONFIG,
     ADAPTER_KEY,
-    CheckpointClient,
     Checkpointer,
     DistributedCheckpointer,
     EPOCHS_KEY,
@@ -52,7 +51,6 @@
     SEED_KEY,
     STEPS_KEY,
     TOTAL_EPOCHS_KEY,
-    TrainingProgress,
     update_state_dict_for_classifier,
 )
 from torchtune.training.lr_schedulers import get_cosine_schedule_with_warmup, get_lr
@@ -80,7 +78,6 @@
     "get_dtype",
     "set_default_dtype",
     "validate_expected_param_dtype",
-    "CheckpointClient",
     "FullModelHFCheckpointer",
     "FullModelMetaCheckpointer",
     "DistributedCheckpointer",
@@ -134,5 +131,4 @@
     "OffloadActivations",
     "FormattedCheckpointFiles",
     "scale_grads",
-    "TrainingProgress",
 ]
7 changes: 0 additions & 7 deletions torchtune/training/checkpointing/__init__.py
@@ -5,11 +5,6 @@
 # LICENSE file in the root directory of this source tree.
 from typing import Union
 
-from torchtune.training.checkpointing._checkpoint_client import (
-    CheckpointClient,
-    TrainingProgress,
-)
-
 from torchtune.training.checkpointing._checkpointer import (
     DistributedCheckpointer,
     FullModelHFCheckpointer,
@@ -41,7 +36,6 @@
 ]
 
 __all__ = [
-    "CheckpointClient",
     "FullModelHFCheckpointer",
     "FullModelMetaCheckpointer",
     "FullModelTorchTuneCheckpointer",
@@ -61,5 +55,4 @@
     "STEPS_KEY",
     "TOTAL_EPOCHS_KEY",
     "FormattedCheckpointFiles",
-    "TrainingProgress",
 ]
2 changes: 1 addition & 1 deletion torchtune/training/checkpointing/_checkpoint_client.py
@@ -298,7 +298,7 @@ def load_distributed_checkpoint(
     ) -> Dict[str, Any]:
         """
         This method is used to resume training from a distributed checkpoint state.
-        Due to being disributed, this mehod is called on every rank.
+        Due to being distributed, this method is called on every rank.
         """
         if self._is_rank_zero:
             dcp_load_start = time.perf_counter()
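
For context on what stays private: CheckpointClient wraps the distributed recipes' checkpoint load/save flow (including the distributed resume path whose docstring is fixed above), and TrainingProgress is the small record of recipe state saved alongside the model. A rough usage sketch under assumptions: the constructor and method names below are inferred from how the recipes use the client and may not match the private API exactly.

    from omegaconf import DictConfig

    from torchtune.training.checkpointing._checkpoint_client import (
        CheckpointClient,
        TrainingProgress,
    )

    def save_epoch_checkpoint(cfg: DictConfig, model, optimizer, epochs_run: int) -> None:
        # Illustrative only: these names are assumptions, not a documented public API.
        client = CheckpointClient(cfg)
        client.save_checkpoint(
            model=model,
            optimizer=optimizer,
            training_progress=TrainingProgress(
                seed=cfg.seed,
                epochs_run=epochs_run,
                total_epochs=cfg.epochs,
                max_steps_per_epoch=cfg.max_steps_per_epoch,
            ),
            epoch=epochs_run,
        )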