Using FormattedCheckpointFiles in configs (pytorch#2147)

sarvamai · Dec 23, 2024 · a6f1cd2 · a6f1cd2
1 parent 7589cb3
commit a6f1cd2
Show file tree

Hide file tree

Showing 12 changed files with 16 additions and 74 deletions.
diff --git a/recipes/configs/llama2/70B_lora.yaml b/recipes/configs/llama2/70B_lora.yaml
@@ -31,7 +31,7 @@ checkpointer:
   checkpoint_dir:  /tmp/Llama-2-70b-hf
   checkpoint_files:
     filename_format: pytorch_model-{}-of-{}.bin
-    max_filename: "00015"
+    max_filename: "00015"  # quoted so the zero-padded value stays a string
   recipe_checkpoint: null
   output_dir: ${output_dir}
   model_type: LLAMA2

diff --git a/recipes/configs/llama2/70B_qlora.yaml b/recipes/configs/llama2/70B_qlora.yaml
@@ -36,7 +36,7 @@ checkpointer:
   checkpoint_dir:  /tmp/Llama-2-70b-hf
   checkpoint_files:
     filename_format: pytorch_model-{}-of-{}.bin
-    max_filename: "00015"
+    max_filename: "00015"  # quoted so the zero-padded value stays a string
   recipe_checkpoint: null
   output_dir: ${output_dir}
   model_type: LLAMA2

diff --git a/recipes/configs/llama3/70B_full.yaml b/recipes/configs/llama3/70B_full.yaml
@@ -41,7 +41,7 @@ checkpointer:
   checkpoint_dir: /tmp/Meta-Llama-3-70B-Instruct
   checkpoint_files:
     filename_format: model-{}-of-{}.safetensors
-    max_filename: "00030"
+    max_filename: "00030"  # quoted so the zero-padded value stays a string
   recipe_checkpoint: null
   output_dir: ${output_dir}
   model_type: LLAMA3

diff --git a/recipes/configs/llama3/70B_lora.yaml b/recipes/configs/llama3/70B_lora.yaml
@@ -31,7 +31,7 @@ checkpointer:
   checkpoint_dir:  /tmp/Meta-Llama-3-70B-Instruct
   checkpoint_files:
     filename_format: model-{}-of-{}.safetensors
-    max_filename: "00030"
+    max_filename: "00030"  # quoted so the zero-padded value stays a string
   recipe_checkpoint: null
   output_dir: ${output_dir}
   model_type: LLAMA3

diff --git a/recipes/configs/llama3_1/70B_full.yaml b/recipes/configs/llama3_1/70B_full.yaml
@@ -40,7 +40,7 @@ checkpointer:
   checkpoint_dir: /tmp/Meta-Llama-3.1-70B-Instruct/
   checkpoint_files:
     filename_format: model-{}-of-{}.safetensors
-    max_filename: "00030"
+    max_filename: "00030"  # quoted so the zero-padded value stays a string
   recipe_checkpoint: null
   output_dir: ${output_dir}
   model_type: LLAMA3

diff --git a/recipes/configs/llama3_1/70B_lora.yaml b/recipes/configs/llama3_1/70B_lora.yaml
@@ -30,7 +30,7 @@ checkpointer:
   checkpoint_dir: /tmp/Meta-Llama-3.1-70B-Instruct/
   checkpoint_files:
     filename_format: model-{}-of-{}.safetensors
-    max_filename: "00030"
+    max_filename: "00030"  # quoted so the zero-padded value stays a string
   recipe_checkpoint: null
   output_dir: ${output_dir}
   model_type: LLAMA3

diff --git a/recipes/configs/llama3_3/70B_full.yaml b/recipes/configs/llama3_3/70B_full.yaml
@@ -40,7 +40,7 @@ checkpointer:
   checkpoint_dir: /tmp/Llama-3.3-70B-Instruct/
   checkpoint_files:
     filename_format: model-{}-of-{}.safetensors
-    max_filename: "00030"
+    max_filename: "00030"  # quoted so the zero-padded value stays a string
   recipe_checkpoint: null
   output_dir: ${output_dir}
   model_type: LLAMA3

diff --git a/recipes/configs/llama3_3/70B_lora.yaml b/recipes/configs/llama3_3/70B_lora.yaml
@@ -28,38 +28,9 @@ tokenizer:
 checkpointer:
   _component_: torchtune.training.FullModelHFCheckpointer
   checkpoint_dir: /tmp/Llama-3.3-70B-Instruct/
-  checkpoint_files: [
-    model-00001-of-00030.safetensors,
-    model-00002-of-00030.safetensors,
-    model-00003-of-00030.safetensors,
-    model-00004-of-00030.safetensors,
-    model-00005-of-00030.safetensors,
-    model-00006-of-00030.safetensors,
-    model-00007-of-00030.safetensors,
-    model-00008-of-00030.safetensors,
-    model-00009-of-00030.safetensors,
-    model-00010-of-00030.safetensors,
-    model-00011-of-00030.safetensors,
-    model-00012-of-00030.safetensors,
-    model-00013-of-00030.safetensors,
-    model-00014-of-00030.safetensors,
-    model-00015-of-00030.safetensors,
-    model-00016-of-00030.safetensors,
-    model-00017-of-00030.safetensors,
-    model-00018-of-00030.safetensors,
-    model-00019-of-00030.safetensors,
-    model-00020-of-00030.safetensors,
-    model-00021-of-00030.safetensors,
-    model-00022-of-00030.safetensors,
-    model-00023-of-00030.safetensors,
-    model-00024-of-00030.safetensors,
-    model-00025-of-00030.safetensors,
-    model-00026-of-00030.safetensors,
-    model-00027-of-00030.safetensors,
-    model-00028-of-00030.safetensors,
-    model-00029-of-00030.safetensors,
-    model-00030-of-00030.safetensors,
-  ]
+  checkpoint_files:
+    filename_format: model-{}-of-{}.safetensors
+    max_filename: "00030"  # quoted so the zero-padded value stays a string
   recipe_checkpoint: null
   output_dir: ${output_dir}
   model_type: LLAMA3

diff --git a/recipes/configs/llama3_3/70B_qlora.yaml b/recipes/configs/llama3_3/70B_qlora.yaml
@@ -28,38 +28,9 @@ tokenizer:
 checkpointer:
   _component_: torchtune.training.FullModelHFCheckpointer
   checkpoint_dir: /tmp/Llama-3.3-70B-Instruct/
-  checkpoint_files: [
-    model-00001-of-00030.safetensors,
-    model-00002-of-00030.safetensors,
-    model-00003-of-00030.safetensors,
-    model-00004-of-00030.safetensors,
-    model-00005-of-00030.safetensors,
-    model-00006-of-00030.safetensors,
-    model-00007-of-00030.safetensors,
-    model-00008-of-00030.safetensors,
-    model-00009-of-00030.safetensors,
-    model-00010-of-00030.safetensors,
-    model-00011-of-00030.safetensors,
-    model-00012-of-00030.safetensors,
-    model-00013-of-00030.safetensors,
-    model-00014-of-00030.safetensors,
-    model-00015-of-00030.safetensors,
-    model-00016-of-00030.safetensors,
-    model-00017-of-00030.safetensors,
-    model-00018-of-00030.safetensors,
-    model-00019-of-00030.safetensors,
-    model-00020-of-00030.safetensors,
-    model-00021-of-00030.safetensors,
-    model-00022-of-00030.safetensors,
-    model-00023-of-00030.safetensors,
-    model-00024-of-00030.safetensors,
-    model-00025-of-00030.safetensors,
-    model-00026-of-00030.safetensors,
-    model-00027-of-00030.safetensors,
-    model-00028-of-00030.safetensors,
-    model-00029-of-00030.safetensors,
-    model-00030-of-00030.safetensors,
-  ]
+  checkpoint_files:
+    filename_format: model-{}-of-{}.safetensors
+    max_filename: "00030"  # quoted so the zero-padded value stays a string
   recipe_checkpoint: null
   output_dir: ${output_dir}
   model_type: LLAMA3

diff --git a/recipes/configs/qwen2_5/14B_lora_single_device.yaml b/recipes/configs/qwen2_5/14B_lora_single_device.yaml
@@ -39,7 +39,7 @@ checkpointer:
   checkpoint_dir: /tmp/Qwen2_5-14B-Instruct
   checkpoint_files:
     filename_format: model-{}-of-{}.safetensors
-    max_filename: "00008"
+    max_filename: "00008"  # quoted so the zero-padded value stays a string
   recipe_checkpoint: null
   output_dir: ${output_dir}
   model_type: QWEN2

diff --git a/recipes/configs/qwen2_5/32B_lora.yaml b/recipes/configs/qwen2_5/32B_lora.yaml
@@ -37,7 +37,7 @@ checkpointer:
   checkpoint_dir: /tmp/Qwen2_5-32B-Instruct
   checkpoint_files:
     filename_format: model-{}-of-{}.safetensors
-    max_filename: "00017"
+    max_filename: "00017"  # quoted so the zero-padded value stays a string
   recipe_checkpoint: null
   output_dir: ${output_dir}
   model_type: QWEN2

diff --git a/recipes/configs/qwen2_5/72B_lora.yaml b/recipes/configs/qwen2_5/72B_lora.yaml
@@ -37,7 +37,7 @@ checkpointer:
   checkpoint_dir: /tmp/Qwen2_5-72B-Instruct
   checkpoint_files:
     filename_format: model-{}-of-{}.safetensors
-    max_filename: "00037"
+    max_filename: "00037"  # quoted so the zero-padded value stays a string
   recipe_checkpoint: null
   output_dir: ${output_dir}
   model_type: QWEN2