
Commit

some cleanup
jarlsondre committed Dec 17, 2024
1 parent 9f7b1ab commit 4301251
Showing 9 changed files with 61 additions and 24 deletions.
3 changes: 2 additions & 1 deletion src/itwinai/torch/trainer.py
@@ -422,7 +422,8 @@ def set_epoch(self, epoch: int) -> None:
         Args:
             epoch (int): epoch number, from 0 to ``epochs-1``.
         """
-        if self.profiler is not None:
+        if self.profiler is not None and epoch > 0:
+            # We don't want to start stepping until after the first epoch
             self.profiler.step()
         self._set_epoch_dataloaders(epoch)

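For context on the epoch > 0 guard above: calling profiler.step() only from the second epoch onward can also be expressed with a torch.profiler warmup schedule. The sketch below is illustrative only and is not the itwinai trainer code; train_one_epoch is a hypothetical stand-in for the real training loop.

import torch
from torch.profiler import ProfilerActivity, profile, schedule


def train_one_epoch() -> None:
    # Placeholder workload standing in for a real training loop.
    x = torch.randn(256, 256)
    (x @ x).sum().item()


prof = profile(
    activities=[ProfilerActivity.CPU],
    # Skip epoch 0 entirely, warm up on epoch 1, record the next three epochs.
    schedule=schedule(wait=1, warmup=1, active=3),
)

with prof:
    for epoch in range(6):
        train_one_epoch()
        prof.step()  # stepped every epoch; the schedule decides what is recorded
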
@@ -10,7 +10,7 @@
 # Data and logging
 data_dir: /p/scratch/intertwin/datasets/imagenet/ILSVRC2012/train/ # tmp_data/
 log_int: 10
-verbose: True
+# verbose: True
 nworker: 4 # num workers dataloader
 prefetch: 2

2 changes: 1 addition & 1 deletion tutorials/distributed-ml/torch-scaling-test/ddp_trainer.py
@@ -20,7 +20,7 @@
 import torchvision
 from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
-from utils import imagenet_dataset, get_parser, train_epoch
+from utils import get_parser, imagenet_dataset, train_epoch
 
 from itwinai.loggers import EpochTimeTracker
 from itwinai.torch.reproducibility import seed_worker, set_seed
28 changes: 15 additions & 13 deletions tutorials/distributed-ml/torch-scaling-test/deepspeed_trainer.py
@@ -12,17 +12,18 @@
 import os
 from timeit import default_timer as timer
 
-
 import deepspeed
 import torch
 import torch.distributed as dist
 import torchvision
-from torch.utils.data import DataLoader
+
+# from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
-from utils import imagenet_dataset, get_parser, train_epoch
+from utils import get_parser, imagenet_dataset, train_epoch
 
 from itwinai.loggers import EpochTimeTracker
-from itwinai.torch.reproducibility import set_seed
+
+# from itwinai.torch.reproducibility import set_seed
 
 
 def main():
@@ -32,20 +33,19 @@ def main():
     args = parser.parse_args()
 
     # Check resources availability
-    subset_size = 5000 # limit number of examples from imagenet
+    subset_size = 5000  # limit number of examples from imagenet
     use_cuda = not args.no_cuda and torch.cuda.is_available()
     is_distributed = use_cuda and torch.cuda.device_count() > 0
-    torch_prng = set_seed(args.rnd_seed, deterministic_cudnn=False)
+    # torch_prng = set_seed(args.rnd_seed, deterministic_cudnn=False)
 
     st = timer()
 
     train_dataset = imagenet_dataset(args.data_dir, subset_size=subset_size)
     if is_distributed:
         deepspeed.init_distributed(dist_backend=args.backend)
-
-        local_world_size = torch.cuda.device_count()
+        local_world_size = torch.cuda.device_count()
         global_rank = dist.get_rank()
-        local_rank = dist.get_rank() % local_world_size
+        local_rank = dist.get_rank() % local_world_size
 
     shuffle = args.shuff and args.rnd_seed is None
     # pin_memory=True
@@ -114,16 +114,19 @@ def main():
     )
 
     start_epoch = 1
-    for epoch in range(start_epoch, args.epochs + 1):
+    for epoch_idx in range(start_epoch, args.epochs + 1):
         epoch_start_time = timer()
         if is_distributed:
             # Inform the sampler that a new epoch started: shuffle
             # may be needed
-            train_sampler.set_epoch(epoch)
+            train_sampler.set_epoch(epoch_idx)
 
         # Training
         train_epoch(
-            model=distrib_model, device=device, train_loader=deepspeed_train_loader, optimizer=optimizer
+            model=distrib_model,
+            device=device,
+            train_loader=deepspeed_train_loader,
+            optimizer=optimizer,
         )
 
         if global_rank == 0:
@@ -134,7 +137,6 @@ def main():
         if is_distributed:
             dist.barrier()
 
-
     # Clean-up
     if is_distributed:
         deepspeed.sys.exit()
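The train_sampler.set_epoch(epoch_idx) call above is what makes DistributedSampler reshuffle the data differently each epoch. A minimal, self-contained sketch of that behaviour (illustrative only; num_replicas and rank are hard-coded here, whereas the tutorial obtains them from the DeepSpeed-initialised process group):

import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

dataset = TensorDataset(torch.arange(16))
# Hard-coded world size and rank purely for illustration.
sampler = DistributedSampler(dataset, num_replicas=2, rank=0, shuffle=True)
loader = DataLoader(dataset, sampler=sampler, batch_size=4)

for epoch in range(2):
    sampler.set_epoch(epoch)  # changes the shuffling seed for this epoch
    order = [int(x) for batch in loader for x in batch[0]]
    print(f"epoch {epoch}: {order}")
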
@@ -19,7 +19,7 @@
 import torchvision
 from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
-from utils import imagenet_dataset, get_parser, train_epoch
+from utils import get_parser, imagenet_dataset, train_epoch
 
 from itwinai.loggers import EpochTimeTracker
 from itwinai.torch.reproducibility import seed_worker, set_seed
35 changes: 35 additions & 0 deletions tutorials/distributed-ml/torch-scaling-test/slurm_config.yaml
@@ -0,0 +1,35 @@
# If you use this config in the runall or scaling-test modes, keep in mind that the
# strategies will change, as well as the number of nodes.
#
# Default arguments can be seen in src/itwinai/slurm/utils.py
#
num_nodes: 1
num_tasks_per_node: 1
gpus_per_node: 4
cpus_per_gpu: 4

mode: single # "single", "runall" or "scaling-test" - defaults to "single"
dist_strat: ddp # "ddp", "deepspeed" or "horovod"
itwinai_trainer: false

account: intertwin
time: 00:10:00
partition: develbooster

# Keep in mind that these will be overwritten if "mode" is not "single", and that
# if you override dist_strat on the CLI, these values will already have been
# resolved and thus might not match. We therefore suggest changing dist_strat only
# in the config and avoiding CLI overrides.
std_out: slurm_jobs/${dist_strat}-${num_nodes}x${gpus_per_node}.out
err_out: slurm_jobs/${dist_strat}-${num_nodes}x${gpus_per_node}.err
job_name: tutorial-${dist_strat}-job


python_venv: .venv

# If you want to manually override the training command, uncomment the following:
# training_cmd: |
# $(which itwinai) exec-pipeline \
# --config ${config_file} \
# --pipe-key ${pipe_key} \
# -o strategy=${dist_strat} \
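
The ${...} references in std_out, err_out and job_name are resolved against the other keys of this file. A minimal sketch of that kind of interpolation, under the assumption that an OmegaConf-style parser is used (the actual resolution happens in the itwinai SLURM utilities and may differ):

from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "num_nodes": 1,
        "gpus_per_node": 4,
        "dist_strat": "ddp",
        "std_out": "slurm_jobs/${dist_strat}-${num_nodes}x${gpus_per_node}.out",
    }
)
# Resolving the interpolation yields "slurm_jobs/ddp-1x4.out".
print(OmegaConf.to_container(cfg, resolve=True)["std_out"])
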
6 changes: 2 additions & 4 deletions tutorials/distributed-ml/torch-scaling-test/utils.py
@@ -7,8 +7,6 @@
 # - Matteo Bunino <[email protected]> - CERN
 # - Jarl Sondre Sæther <[email protected]> - CERN
 # --------------------------------------------------------------------------------------
-import argparse
-
 import torch.nn as nn
 import torch.nn.functional as F
 from torch import device
@@ -35,11 +33,11 @@ def imagenet_dataset(data_root: str, subset_size: int | None = None):
     )
     imagenet = datasets.ImageFolder(root=data_root, transform=transform)
 
-    if subset_size is None:
+    if subset_size is None:
         # We do this because we always want to return an instance of a subset, to make
         # everything as consistent as possible
         subset_size = len(imagenet)
-    if subset_size > len(imagenet):
+    if subset_size > len(imagenet):
         raise ValueError("Limit higher than the total length of the dataset")
 
     return Subset(imagenet, range(subset_size))
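For reference, a minimal usage sketch of the imagenet_dataset helper changed above; the data path is taken from the tutorial config earlier in this commit, and the DataLoader settings are illustrative:

from torch.utils.data import DataLoader
from utils import imagenet_dataset

train_set = imagenet_dataset(
    "/p/scratch/intertwin/datasets/imagenet/ILSVRC2012/train/", subset_size=5000
)
train_loader = DataLoader(train_set, batch_size=64, shuffle=True, num_workers=4)
print(f"Training on {len(train_set)} images")
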
4 changes: 2 additions & 2 deletions use-cases/eurac/slurm_config.yaml
@@ -4,10 +4,10 @@
 # Default arguments can be seen in src/itwinai/slurm/utils.py
 
 mode: single # "single", "runall" or "scaling-test" - defaults to "single"
-dist_strat: horovod # "ddp", "deepspeed" or "horovod"
+dist_strat: ddp # "ddp", "deepspeed" or "horovod"
 
 account: intertwin
-time: 00:11:11
+time: 00:05:00
 partition: develbooster
 
 # Keep in mind that these will be overwritten if "mode" is not "single", and that
3 changes: 2 additions & 1 deletion use-cases/eurac/trainer.py
@@ -147,7 +147,8 @@ def create_model_loss_optimizer(self) -> None:
         )
 
     def set_epoch(self, epoch: int):
-        if self.profiler is not None:
+        if self.profiler is not None and epoch > 0:
+            # We don't want to start stepping until after the first epoch
             self.profiler.step()
 
         if self.strategy.is_distributed:
