diff --git a/recipes/configs/llama3_2/1B_full_single_device.yaml b/recipes/configs/llama3_2/1B_full_single_device.yaml
index ddeab81957..e24fc56219 100644
--- a/recipes/configs/llama3_2/1B_full_single_device.yaml
+++ b/recipes/configs/llama3_2/1B_full_single_device.yaml
@@ -19,9 +19,12 @@
 #
 # This config works only for training on single device.
 
-
 output_dir: /tmp/torchtune/llama3_2_1B/full_single_device # /tmp may be deleted by your system. Change it to your preference.
 
+# Model Arguments
+model:
+  _component_: torchtune.models.llama3_2.llama3_2_1b
+
 # Tokenizer
 tokenizer:
   _component_: torchtune.models.llama3.llama3_tokenizer
diff --git a/torchtune/training/checkpointing/_checkpointer.py b/torchtune/training/checkpointing/_checkpointer.py
index 298b0b113c..a863c905be 100644
--- a/torchtune/training/checkpointing/_checkpointer.py
+++ b/torchtune/training/checkpointing/_checkpointer.py
@@ -15,10 +15,7 @@
 
 from torchtune import training
 from torchtune.models import convert_weights
-from torchtune.models.clip._convert_weights import clip_text_hf_to_tune
-from torchtune.models.phi3._convert_weights import phi3_hf_to_tune, phi3_tune_to_hf
-from torchtune.models.qwen2._convert_weights import qwen2_hf_to_tune, qwen2_tune_to_hf
-from torchtune.rlhf.utils import reward_hf_to_tune, reward_tune_to_hf
+
 from torchtune.training.checkpointing._utils import (
     ADAPTER_CONFIG_FNAME,
     ADAPTER_MODEL_FNAME,
@@ -159,7 +156,7 @@ def __init__(
         self._resume_from_checkpoint = resume_from_checkpoint
         self._model_type = ModelType[model_type]
         self._output_dir = Path(output_dir)
-        self._output_dir.mkdir(exist_ok=True)
+        self._output_dir.mkdir(parents=True, exist_ok=True)
 
         # save all files in input_dir, except model weights and mapping, to output_dir
         # this is useful to preserve the tokenizer, configs, license, etc.
@@ -394,7 +391,7 @@ def __init__(
         self._checkpoint_dir = Path(checkpoint_dir)
         self._model_type = ModelType[model_type]
         self._output_dir = Path(output_dir)
-        self._output_dir.mkdir(exist_ok=True)
+        self._output_dir.mkdir(parents=True, exist_ok=True)
 
         # weight_map contains the state_dict key -> checkpoint file mapping so we can correctly
         # parition the state dict into output checkpoint files. This is updated during checkpoint
@@ -509,10 +506,14 @@ def load_checkpoint(self) -> Dict[str, Any]:
                 msg="Converting Phi-3 Mini weights from HF format."
                 "Note that conversion of adapter weights into PEFT format is not supported.",
             )
+            from torchtune.models.phi3._convert_weights import phi3_hf_to_tune
+
             converted_state_dict[training.MODEL_KEY] = phi3_hf_to_tune(
                 merged_state_dict
             )
         elif self._model_type == ModelType.REWARD:
+            from torchtune.rlhf.utils import reward_hf_to_tune
+
             converted_state_dict[training.MODEL_KEY] = reward_hf_to_tune(
                 merged_state_dict,
                 num_heads=self._config["num_attention_heads"],
@@ -520,6 +521,8 @@ def load_checkpoint(self) -> Dict[str, Any]:
                 dim=self._config["hidden_size"],
             )
         elif self._model_type == ModelType.QWEN2:
+            from torchtune.models.qwen2._convert_weights import qwen2_hf_to_tune
+
             converted_state_dict[training.MODEL_KEY] = qwen2_hf_to_tune(
                 merged_state_dict,
                 num_heads=self._config["num_attention_heads"],
@@ -550,6 +553,8 @@ def load_checkpoint(self) -> Dict[str, Any]:
                 ),
             )
         elif self._model_type == ModelType.CLIP_TEXT:
+            from torchtune.models.clip._convert_weights import clip_text_hf_to_tune
+
             converted_state_dict[training.MODEL_KEY] = clip_text_hf_to_tune(
                 merged_state_dict,
             )
@@ -610,10 +615,14 @@ def save_checkpoint(
         # convert the state_dict back to hf format; do this inplace
         if not adapter_only:
             if self._model_type == ModelType.PHI3_MINI:
+                from torchtune.models.phi3._convert_weights import phi3_tune_to_hf
+
                 state_dict[training.MODEL_KEY] = phi3_tune_to_hf(
                     state_dict[training.MODEL_KEY]
                 )
             elif self._model_type == ModelType.REWARD:
+                from torchtune.rlhf.utils import reward_tune_to_hf
+
                 state_dict[training.MODEL_KEY] = reward_tune_to_hf(
                     state_dict[training.MODEL_KEY],
                     num_heads=self._config["num_attention_heads"],
@@ -621,6 +630,8 @@ def save_checkpoint(
                     dim=self._config["hidden_size"],
                 )
             elif self._model_type == ModelType.QWEN2:
+                from torchtune.models.qwen2._convert_weights import qwen2_tune_to_hf
+
                 state_dict[training.MODEL_KEY] = qwen2_tune_to_hf(
                     state_dict[training.MODEL_KEY],
                     num_heads=self._config["num_attention_heads"],
@@ -913,7 +924,7 @@ def __init__(
         self._resume_from_checkpoint = resume_from_checkpoint
         self._model_type = ModelType[model_type]
         self._output_dir = Path(output_dir)
-        self._output_dir.mkdir(exist_ok=True)
+        self._output_dir.mkdir(parents=True, exist_ok=True)
 
         # save all files in input_dir, except model weights and mapping, to output_dir
         # this is useful to preserve the tokenizer, configs, license, etc.