Migrate to epochs: 1 in all configs #1981

Merged (1 commit) on Nov 11, 2024
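For reference, a minimal sketch (assuming the layout common to these torchtune recipe configs) of how the fine-tuning block reads after this change; exact batch size and optimizer settings vary per recipe:

# Fine-tuning arguments
batch_size: 2
epochs: 1

optimizer:
  _component_: torch.optim.AdamW

Anyone who still wants a longer run can override the value at launch instead of editing the config, for example by appending epochs=3 as a key=value override to the usual tune run command (recipe and config names as you normally pass them).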
recipes/configs/dev/8B_full_experimental.yaml (1 addition, 1 deletion)
@@ -48,7 +48,7 @@ resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
-epochs: 3
+epochs: 1

optimizer:
  _component_: torch.optim.AdamW
recipes/configs/gemma/2B_full.yaml (1 addition, 1 deletion)
@@ -46,7 +46,7 @@ resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
-epochs: 3
+epochs: 1
optimizer:
  _component_: torch.optim.AdamW
  fused: True
recipes/configs/gemma/2B_lora.yaml (1 addition, 1 deletion)
@@ -64,7 +64,7 @@ loss:

# Fine-tuning arguments
batch_size: 4
-epochs: 3
+epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 1 # Use to increase virtual batch size
compile: False # pytorch compile, set to true for better perf/memory
recipes/configs/gemma/2B_lora_single_device.yaml (1 addition, 1 deletion)
@@ -63,7 +63,7 @@ loss:

# Fine-tuning arguments
batch_size: 4
-epochs: 3
+epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 8 # Use to increase virtual batch size
compile: False # pytorch compile, set to true for better perf/memory
recipes/configs/gemma/2B_qlora_single_device.yaml (1 addition, 1 deletion)
@@ -63,7 +63,7 @@ loss:

# Fine-tuning arguments
batch_size: 4
-epochs: 3
+epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 8 # Use to increase virtual batch size
compile: False # pytorch compile, set to true for better perf/memory
recipes/configs/gemma/7B_lora.yaml (1 addition, 1 deletion)
@@ -66,7 +66,7 @@ loss:

# Fine-tuning arguments
batch_size: 4
-epochs: 3
+epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 1 # Use to increase virtual batch size
compile: False # pytorch compile, set to true for better perf/memory
recipes/configs/gemma/7B_qlora_single_device.yaml (1 addition, 1 deletion)
@@ -65,7 +65,7 @@ loss:

# Fine-tuning arguments
batch_size: 4
-epochs: 3
+epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 8 # Use to increase virtual batch size
compile: False # pytorch compile, set to true for better perf/memory
recipes/configs/gemma2/27B_lora.yaml (1 addition, 1 deletion)
@@ -63,7 +63,7 @@ loss:

# Fine-tuning arguments
batch_size: 4
-epochs: 3
+epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 1
compile: False # pytorch compile, set to true for perf/memory improvement
recipes/configs/gemma2/27B_qlora_single_device.yaml (1 addition, 1 deletion)
@@ -62,7 +62,7 @@ loss:

# Fine-tuning arguments
batch_size: 4
-epochs: 3
+epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 4
compile: False # pytorch compile, set to true for perf/memory improvement
recipes/configs/gemma2/2B_full.yaml (1 addition, 1 deletion)
@@ -47,7 +47,7 @@ resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
-epochs: 3
+epochs: 1
optimizer:
  _component_: torch.optim.AdamW
  fused: True
recipes/configs/gemma2/2B_lora.yaml (1 addition, 1 deletion)
@@ -65,7 +65,7 @@ loss:

# Fine-tuning arguments
batch_size: 4
-epochs: 3
+epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 1
compile: False # pytorch compile, set to true for perf/memory improvement
recipes/configs/gemma2/2B_lora_single_device.yaml (1 addition, 1 deletion)
@@ -64,7 +64,7 @@ loss:

# Fine-tuning arguments
batch_size: 8
-epochs: 3
+epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 2
compile: False # pytorch compile, set to true for perf/memory improvement
recipes/configs/gemma2/2B_qlora_single_device.yaml (1 addition, 1 deletion)
@@ -64,7 +64,7 @@ loss:

# Fine-tuning arguments
batch_size: 4
-epochs: 3
+epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 4
compile: False # pytorch compile, set to true for perf/memory improvement
recipes/configs/gemma2/9B_lora.yaml (1 addition, 1 deletion)
@@ -63,7 +63,7 @@ loss:

# Fine-tuning arguments
batch_size: 4
-epochs: 3
+epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 1
compile: False # pytorch compile, set to true for perf/memory improvement
recipes/configs/gemma2/9B_qlora_single_device.yaml (1 addition, 1 deletion)
@@ -62,7 +62,7 @@ loss:

# Fine-tuning arguments
batch_size: 4
-epochs: 3
+epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 4
compile: False # pytorch compile, set to true for perf/memory improvement
recipes/configs/llama2/13B_full.yaml (1 addition, 1 deletion)
@@ -50,7 +50,7 @@ shuffle: True

# Fine-tuning arguments
batch_size: 2
-epochs: 3
+epochs: 1
optimizer:
  _component_: torch.optim.AdamW
  fused: True
recipes/configs/llama2/7B_full.yaml (1 addition, 1 deletion)
@@ -49,7 +49,7 @@ resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
-epochs: 3
+epochs: 1
optimizer:
  _component_: torch.optim.AdamW
  fused: True
recipes/configs/llama2/7B_full_low_memory.yaml (1 addition, 1 deletion)
@@ -51,7 +51,7 @@ resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
-epochs: 3
+epochs: 1
optimizer:
  _component_: bitsandbytes.optim.PagedAdamW
  lr: 1e-5
recipes/configs/llama2/7B_qat_full.yaml (1 addition, 1 deletion)
@@ -45,7 +45,7 @@ resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
-epochs: 3
+epochs: 1
optimizer:
  _component_: torch.optim.AdamW
  fused: True
recipes/configs/llama3/70B_full.yaml (1 addition, 1 deletion)
@@ -76,7 +76,7 @@ resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
-epochs: 3
+epochs: 1

optimizer:
  _component_: torch.optim.AdamW
recipes/configs/llama3/8B_full.yaml (1 addition, 1 deletion)
@@ -48,7 +48,7 @@ resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
-epochs: 3
+epochs: 1

optimizer:
  _component_: torch.optim.AdamW
recipes/configs/llama3/8B_full_single_device.yaml (1 addition, 1 deletion)
@@ -50,7 +50,7 @@ resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
-epochs: 3
+epochs: 1
optimizer:
  _component_: bitsandbytes.optim.PagedAdamW8bit
  lr: 1e-5
recipes/configs/llama3/8B_qat_full.yaml (1 addition, 1 deletion)
@@ -43,7 +43,7 @@ resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
-epochs: 3
+epochs: 1

# QAT arguments
quantizer:
recipes/configs/llama3_1/70B_full.yaml (1 addition, 1 deletion)
@@ -75,7 +75,7 @@ resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
-epochs: 3
+epochs: 1

optimizer:
  _component_: torch.optim.AdamW
recipes/configs/llama3_1/8B_full.yaml (1 addition, 1 deletion)
@@ -51,7 +51,7 @@ resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
-epochs: 3
+epochs: 1

optimizer:
  _component_: torch.optim.AdamW
recipes/configs/llama3_1/8B_full_single_device.yaml (1 addition, 1 deletion)
@@ -53,7 +53,7 @@ resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
-epochs: 3
+epochs: 1
optimizer:
  _component_: bitsandbytes.optim.PagedAdamW8bit
  lr: 2e-5
recipes/configs/llama3_2/1B_full.yaml (1 addition, 1 deletion)
@@ -48,7 +48,7 @@ resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 4
-epochs: 3
+epochs: 1

optimizer:
  _component_: torch.optim.AdamW
recipes/configs/llama3_2/1B_full_single_device.yaml (1 addition, 1 deletion)
@@ -50,7 +50,7 @@ resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 4
-epochs: 3
+epochs: 1
optimizer:
  _component_: bitsandbytes.optim.PagedAdamW8bit
  lr: 2e-5
recipes/configs/llama3_2/3B_full.yaml (1 addition, 1 deletion)
@@ -49,7 +49,7 @@ resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 4
-epochs: 3
+epochs: 1

optimizer:
  _component_: torch.optim.AdamW
recipes/configs/llama3_2/3B_full_single_device.yaml (1 addition, 1 deletion)
@@ -51,7 +51,7 @@ resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 4
-epochs: 3
+epochs: 1
optimizer:
  _component_: bitsandbytes.optim.PagedAdamW8bit
  lr: 2e-5
recipes/configs/mistral/7B_full.yaml (1 addition, 1 deletion)
@@ -52,7 +52,7 @@ resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
-epochs: 3
+epochs: 1
optimizer:
  _component_: torch.optim.AdamW
  fused: True
recipes/configs/mistral/7B_full_low_memory.yaml (1 addition, 1 deletion)
@@ -54,7 +54,7 @@ resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
-epochs: 3
+epochs: 1
optimizer:
  _component_: bitsandbytes.optim.PagedAdamW
  lr: 5e-6
recipes/configs/mistral/7B_lora.yaml (1 addition, 1 deletion)
@@ -72,7 +72,7 @@ loss:

# Fine-tuning arguments
batch_size: 4
-epochs: 3
+epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 1 # Use to increase virtual batch size
compile: False # pytorch compile, set to true for better perf/memory
recipes/configs/mistral/7B_lora_single_device.yaml (1 addition, 1 deletion)
@@ -69,7 +69,7 @@ loss:

# Fine-tuning arguments
batch_size: 4
-epochs: 3
+epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 8 # Use to increase virtual batch size
compile: False # pytorch compile, set to true for better perf/memory
recipes/configs/mistral/7B_qlora_single_device.yaml (1 addition, 1 deletion)
@@ -70,7 +70,7 @@ loss:

# Fine-tuning arguments
batch_size: 4
-epochs: 3
+epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 8 # Use to increase virtual batch size
compile: False # pytorch compile, set to true for better perf/memory
recipes/configs/qwen2/1.5B_full.yaml (1 addition, 1 deletion)
@@ -48,7 +48,7 @@ resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
-epochs: 3
+epochs: 1
optimizer:
  _component_: torch.optim.AdamW
  fused: True
recipes/configs/qwen2_5/1_5B_full.yaml (1 addition, 1 deletion)
@@ -48,7 +48,7 @@ resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
-epochs: 3
+epochs: 1
optimizer:
  _component_: torch.optim.AdamW
  fused: True