diff --git a/06_gpu_and_ml/dreambooth/dreambooth_app.py b/06_gpu_and_ml/dreambooth/dreambooth_app.py
index 21661ddad..9f8cec5af 100644
--- a/06_gpu_and_ml/dreambooth/dreambooth_app.py
+++ b/06_gpu_and_ml/dreambooth/dreambooth_app.py
@@ -282,6 +282,7 @@ class Model:
     def __enter__(self):
        import torch
        from diffusers import DDIMScheduler, StableDiffusionPipeline
 
+        # Reload the modal.Volume to ensure the latest state is accessible.
        stub.app.volume.reload()
 
diff --git a/06_gpu_and_ml/flan_t5/flan_t5_finetune.py b/06_gpu_and_ml/flan_t5/flan_t5_finetune.py
index f3f434dbb..956805fa8 100644
--- a/06_gpu_and_ml/flan_t5/flan_t5_finetune.py
+++ b/06_gpu_and_ml/flan_t5/flan_t5_finetune.py
@@ -21,7 +21,7 @@
 
 from pathlib import Path
 
-from modal import Image, NetworkFileSystem, Stub, method, wsgi_app
+from modal import Image, Volume, Stub, method, wsgi_app
 
 VOL_MOUNT_PATH = Path("/vol")
 
@@ -37,7 +37,8 @@
 )
 
 stub = Stub(name="example-news-summarizer", image=image)
-output_vol = NetworkFileSystem.persisted("finetune-vol")
+output_vol = Volume.persisted("finetune-volume")
+stub.volume = output_vol
 
 # ## Finetuning Flan-T5 on XSum dataset
 #
@@ -47,7 +48,7 @@
 @stub.function(
     gpu="A10g",
     timeout=7200,
-    network_file_systems={VOL_MOUNT_PATH: output_vol},
+    volumes={VOL_MOUNT_PATH: output_vol},
 )
 def finetune(num_train_epochs: int = 1, size_percentage: int = 10):
     from datasets import load_dataset
@@ -57,6 +58,7 @@ def finetune(num_train_epochs: int = 1, size_percentage: int = 10):
         DataCollatorForSeq2Seq,
         Seq2SeqTrainer,
         Seq2SeqTrainingArguments,
+        TrainerCallback,
     )
 
     # Use size percentage to retrieve subset of the dataset to iterate faster
@@ -121,6 +123,17 @@ def preprocess(batch):
         pad_to_multiple_of=batch_size,
     )
 
+    class CheckpointCallback(TrainerCallback):
+        def __init__(self, volume):
+            self.volume = volume
+
+        def on_save(self, args, state, control, **kwargs):
+            """
+            Event called after a checkpoint save.
+            """
+            print("running commit on modal.Volume after model checkpoint")
+            self.volume.commit()
+
     training_args = Seq2SeqTrainingArguments(
         # Save checkpoints to the mounted volume
         output_dir=str(VOL_MOUNT_PATH / "model"),
@@ -142,6 +155,7 @@ def preprocess(batch):
     trainer = Seq2SeqTrainer(
         model=model,
         args=training_args,
+        callbacks=[CheckpointCallback(stub.app.volume)],
         data_collator=data_collator,
         train_dataset=tokenized_xsum_train,
         eval_dataset=tokenized_xsum_test,
@@ -152,6 +166,7 @@ def preprocess(batch):
     # Save the trained model and tokenizer to the mounted volume
     model.save_pretrained(str(VOL_MOUNT_PATH / "model"))
     tokenizer.save_pretrained(str(VOL_MOUNT_PATH / "tokenizer"))
+    stub.app.volume.commit()
 
 
 # ## Monitoring Finetuning with Tensorboard
@@ -159,7 +174,7 @@ def preprocess(batch):
 # Tensorboard is an application for visualizing training loss. In this example we
 # serve it as a Modal WSGI app.
 #
-@stub.function(network_file_systems={VOL_MOUNT_PATH: output_vol})
+@stub.function(volumes={VOL_MOUNT_PATH: output_vol})
 @wsgi_app()
 def monitor():
     import tensorboard
@@ -181,7 +196,7 @@ def monitor():
 #
 
 
-@stub.cls(network_file_systems={VOL_MOUNT_PATH: output_vol})
+@stub.cls(volumes={VOL_MOUNT_PATH: output_vol})
 class Summarizer:
     def __enter__(self):
         from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
@@ -228,14 +243,14 @@ def main():
 # Invoke model finetuning use the provided command below
 #
 # ```bash
-# modal run --detach finetune.py::finetune --num-train-epochs=1 --size-percentage=10
+# modal run --detach flan_t5_finetune.py::finetune --num-train-epochs=1 --size-percentage=10
 # View the tensorboard logs at https://<username>--example-news-summarizer-monitor-dev.modal.run
 # ```
 #
 # Invoke finetuned model inference via local entrypoint
 #
 # ```bash
-# modal run finetune.py
+# modal run flan_t5_finetune.py
 # World number one Tiger Woods missed the cut at the US Open as he failed to qualify for the final round of the event in Los Angeles.
 # ```
 #
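Taken together, the diff swaps `NetworkFileSystem` for `modal.Volume`, whose state must be synchronized explicitly: writers call `commit()` to persist changes, and readers call `reload()` to pick up the latest committed state. Below is a minimal standalone sketch of that pattern, using only the API the diff itself relies on (`Stub`, `Volume.persisted`, `volumes=`, `stub.app.volume`); the app name, file paths, and payload are illustrative assumptions, not part of the PR.

```python
from pathlib import Path

from modal import Stub, Volume

VOL_MOUNT_PATH = Path("/vol")

stub = Stub("volume-pattern-sketch")  # hypothetical app name
output_vol = Volume.persisted("finetune-volume")
stub.volume = output_vol  # expose the volume on the stub, as the PR does


@stub.function(volumes={VOL_MOUNT_PATH: output_vol})
def save_checkpoint():
    # Writes land in this container's view of the volume; commit() persists
    # them so other containers can see them (this mirrors the PR's
    # CheckpointCallback.on_save and the post-training commit).
    out = VOL_MOUNT_PATH / "model"
    out.mkdir(parents=True, exist_ok=True)
    (out / "weights.bin").write_bytes(b"checkpoint bytes")  # dummy payload
    stub.app.volume.commit()


@stub.function(volumes={VOL_MOUNT_PATH: output_vol})
def load_checkpoint():
    # reload() refreshes this container's view to the latest committed
    # state, mirroring Model.__enter__ in dreambooth_app.py.
    stub.app.volume.reload()
    size = (VOL_MOUNT_PATH / "model" / "weights.bin").stat().st_size
    print(f"loaded checkpoint of {size} bytes")
```

Committing inside the `TrainerCallback.on_save` hook, as the PR does, makes each intermediate checkpoint durable as soon as it is written, so a preempted or crashed training container can resume from the volume instead of starting over.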