diff --git a/06_gpu_and_ml/flan_t5/flan_t5_finetune.py b/06_gpu_and_ml/flan_t5/flan_t5_finetune.py index 956805fa8..b0b110258 100644 --- a/06_gpu_and_ml/flan_t5/flan_t5_finetune.py +++ b/06_gpu_and_ml/flan_t5/flan_t5_finetune.py @@ -131,8 +131,9 @@ def on_save(self, args, state, control, **kwargs): """ Event called after a checkpoint save. """ - print("running commit on modal.Volume after model checkpoint") - self.volume.commit() + if state.is_world_process_zero: + print("running commit on modal.Volume after model checkpoint") + self.volume.commit() training_args = Seq2SeqTrainingArguments( # Save checkpoints to the mounted volume @@ -142,8 +143,6 @@ def on_save(self, args, state, control, **kwargs): predict_with_generate=True, learning_rate=3e-5, num_train_epochs=num_train_epochs, - # Save logs to the mounted volume - logging_dir=str(VOL_MOUNT_PATH / "logs"), logging_strategy="steps", logging_steps=100, evaluation_strategy="epoch",