diff --git a/recipes/configs/llama3_2/1B_full_single_device.yaml b/recipes/configs/llama3_2/1B_full_single_device.yaml
index ddeab81957..e24fc56219 100644
--- a/recipes/configs/llama3_2/1B_full_single_device.yaml
+++ b/recipes/configs/llama3_2/1B_full_single_device.yaml
@@ -19,9 +19,12 @@
 #
 # This config works only for training on single device.
 
-
 output_dir: /tmp/torchtune/llama3_2_1B/full_single_device # /tmp may be deleted by your system. Change it to your preference.
 
+# Model Arguments
+model:
+  _component_: torchtune.models.llama3_2.llama3_2_1b
+
 # Tokenizer
 tokenizer:
   _component_: torchtune.models.llama3.llama3_tokenizer
diff --git a/torchtune/training/checkpointing/_checkpointer.py b/torchtune/training/checkpointing/_checkpointer.py
index 298b0b113c..a863c905be 100644
--- a/torchtune/training/checkpointing/_checkpointer.py
+++ b/torchtune/training/checkpointing/_checkpointer.py
@@ -15,10 +15,7 @@
 
 from torchtune import training
 from torchtune.models import convert_weights
-from torchtune.models.clip._convert_weights import clip_text_hf_to_tune
-from torchtune.models.phi3._convert_weights import phi3_hf_to_tune, phi3_tune_to_hf
-from torchtune.models.qwen2._convert_weights import qwen2_hf_to_tune, qwen2_tune_to_hf
-from torchtune.rlhf.utils import reward_hf_to_tune, reward_tune_to_hf
+
 from torchtune.training.checkpointing._utils import (
     ADAPTER_CONFIG_FNAME,
     ADAPTER_MODEL_FNAME,
@@ -159,7 +156,7 @@ def __init__(
         self._resume_from_checkpoint = resume_from_checkpoint
         self._model_type = ModelType[model_type]
         self._output_dir = Path(output_dir)
-        self._output_dir.mkdir(exist_ok=True)
+        self._output_dir.mkdir(parents=True, exist_ok=True)
 
         # save all files in input_dir, except model weights and mapping, to output_dir
         # this is useful to preserve the tokenizer, configs, license, etc.
@@ -394,7 +391,7 @@ def __init__(
         self._checkpoint_dir = Path(checkpoint_dir)
         self._model_type = ModelType[model_type]
         self._output_dir = Path(output_dir)
-        self._output_dir.mkdir(exist_ok=True)
+        self._output_dir.mkdir(parents=True, exist_ok=True)
 
         # weight_map contains the state_dict key -> checkpoint file mapping so we can correctly
         # parition the state dict into output checkpoint files. This is updated during checkpoint
@@ -509,10 +506,14 @@ def load_checkpoint(self) -> Dict[str, Any]:
                 msg="Converting Phi-3 Mini weights from HF format."
                 "Note that conversion of adapter weights into PEFT format is not supported.",
             )
+            from torchtune.models.phi3._convert_weights import phi3_hf_to_tune
+
             converted_state_dict[training.MODEL_KEY] = phi3_hf_to_tune(
                 merged_state_dict
             )
         elif self._model_type == ModelType.REWARD:
+            from torchtune.rlhf.utils import reward_hf_to_tune
+
             converted_state_dict[training.MODEL_KEY] = reward_hf_to_tune(
                 merged_state_dict,
                 num_heads=self._config["num_attention_heads"],
@@ -520,6 +521,8 @@ def load_checkpoint(self) -> Dict[str, Any]:
                 dim=self._config["hidden_size"],
             )
         elif self._model_type == ModelType.QWEN2:
+            from torchtune.models.qwen2._convert_weights import qwen2_hf_to_tune
+
             converted_state_dict[training.MODEL_KEY] = qwen2_hf_to_tune(
                 merged_state_dict,
                 num_heads=self._config["num_attention_heads"],
@@ -550,6 +553,8 @@ def load_checkpoint(self) -> Dict[str, Any]:
                 ),
             )
         elif self._model_type == ModelType.CLIP_TEXT:
+            from torchtune.models.clip._convert_weights import clip_text_hf_to_tune
+
             converted_state_dict[training.MODEL_KEY] = clip_text_hf_to_tune(
                 merged_state_dict,
             )
@@ -610,10 +615,14 @@ def save_checkpoint(
         # convert the state_dict back to hf format; do this inplace
         if not adapter_only:
             if self._model_type == ModelType.PHI3_MINI:
+                from torchtune.models.phi3._convert_weights import phi3_tune_to_hf
+
                 state_dict[training.MODEL_KEY] = phi3_tune_to_hf(
                     state_dict[training.MODEL_KEY]
                 )
             elif self._model_type == ModelType.REWARD:
+                from torchtune.rlhf.utils import reward_tune_to_hf
+
                 state_dict[training.MODEL_KEY] = reward_tune_to_hf(
                     state_dict[training.MODEL_KEY],
                     num_heads=self._config["num_attention_heads"],
@@ -621,6 +630,8 @@ def save_checkpoint(
                     dim=self._config["hidden_size"],
                 )
             elif self._model_type == ModelType.QWEN2:
+                from torchtune.models.qwen2._convert_weights import qwen2_tune_to_hf
+
                 state_dict[training.MODEL_KEY] = qwen2_tune_to_hf(
                     state_dict[training.MODEL_KEY],
                     num_heads=self._config["num_attention_heads"],
@@ -913,7 +924,7 @@ def __init__(
         self._resume_from_checkpoint = resume_from_checkpoint
         self._model_type = ModelType[model_type]
         self._output_dir = Path(output_dir)
-        self._output_dir.mkdir(exist_ok=True)
+        self._output_dir.mkdir(parents=True, exist_ok=True)
 
         # save all files in input_dir, except model weights and mapping, to output_dir
         # this is useful to preserve the tokenizer, configs, license, etc.