Commit 72a0f37: small cleanup

jarlsondre committed Dec 18, 2024
1 parent 4301251

Showing 7 changed files with 54 additions and 24 deletions.
@@ -16,7 +16,7 @@ prefetch: 2
 
 # Model
 batch_size: 64 # micro batch size
-epochs: 3
+epochs: 10
 lr: 0.001
 momentum: 0.5
 shuff: False
4 changes: 2 additions & 2 deletions tutorials/distributed-ml/torch-scaling-test/ddp_trainer.py

@@ -30,7 +30,7 @@ def main():
     parser = get_parser()
     args = parser.parse_args()
 
-    subset_size = 5000 # limit number of examples from imagenet
+    subset_size = 5000  # limit number of examples from imagenet
     use_cuda = not args.no_cuda and torch.cuda.is_available()
     is_distributed = use_cuda and torch.cuda.device_count() > 0
     torch_seed = set_seed(args.rnd_seed, deterministic_cudnn=False)
@@ -72,7 +72,7 @@ def main():
         worker_init_fn=seed_worker,
     )
 
-    device = torch.device(f"cuda:{local_rank}" if use_cuda else "cpu", local_rank)
+    device = torch.device(f"cuda:{local_rank}" if use_cuda else "cpu")
     model = torchvision.models.resnet152().to(device)
 
     # Distributing the model to the workers
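The `torch.device` change here (repeated in the two trainers below) is a bug fix rather than style: when the device string already carries an index, PyTorch rejects an additional explicit index argument. A minimal sketch of the difference, assuming current PyTorch semantics; `local_rank = 0` is a hypothetical value for illustration:

```python
import torch

local_rank = 0  # hypothetical rank, for illustration only

# Old form: on a CUDA machine, "cuda:0" already contains an index, so
# also passing local_rank as the explicit index argument raises
# RuntimeError ("type (string) must not include an index ...").
# torch.device(f"cuda:{local_rank}", local_rank)  # would raise

# New form: encode the index in the string only.
use_cuda = torch.cuda.is_available()
device = torch.device(f"cuda:{local_rank}" if use_cuda else "cpu")
print(device)  # cuda:0 on a GPU node, cpu otherwise
```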
@@ -38,7 +38,6 @@ def main():
     is_distributed = use_cuda and torch.cuda.device_count() > 0
     # torch_prng = set_seed(args.rnd_seed, deterministic_cudnn=False)
 
-
     train_dataset = imagenet_dataset(args.data_dir, subset_size=subset_size)
     if is_distributed:
         deepspeed.init_distributed(dist_backend=args.backend)
@@ -78,10 +77,10 @@ def main():
     # worker_init_fn=seed_worker,
     # )
 
-    device = torch.device(f"cuda:{local_rank}" if use_cuda else "cpu", local_rank)
+    device = torch.device(f"cuda:{local_rank}" if use_cuda else "cpu")
 
     # Create CNN model
-    model = torchvision.models.resnet152().to(device)
+    model = torchvision.models.resnet152()
 
     # Initialize DeepSpeed and get:
     # 1) Distributed model
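Dropping `.to(device)` in the DeepSpeed trainer works because `deepspeed.initialize` moves the model to the local GPU it derives from the launcher's `LOCAL_RANK`. A sketch of the pattern, with an illustrative config dict standing in for the script's real arguments:

```python
import deepspeed
import torchvision

# The model can stay on CPU here; deepspeed.initialize places it on the
# local GPU itself. The config values below are assumptions chosen to
# mirror the tutorial's defaults, not the script's actual config.
model = torchvision.models.resnet152()
model_engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config={
        "train_micro_batch_size_per_gpu": 64,
        "optimizer": {"type": "SGD", "params": {"lr": 0.001, "momentum": 0.5}},
    },
)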
@@ -89,7 +89,7 @@ def main():
         worker_init_fn=seed_worker,
     )
 
-    device = torch.device(f"cuda:{local_rank}" if use_cuda else "cpu", local_rank)
+    device = torch.device(f"cuda:{local_rank}" if use_cuda else "cpu")
 
     # Create CNN model
     model = torchvision.models.resnet152()
59 changes: 46 additions & 13 deletions tutorials/distributed-ml/torch-scaling-test/itwinai_trainer.py

@@ -46,11 +46,15 @@ def parse_params() -> argparse.Namespace:
     >>> train.py --strategy ddp --config base-config.yaml --config foo.yaml
     """
-    parser = ItwinaiArgParser(description='PyTorch Imagenet Example')
+    parser = ItwinaiArgParser(description="PyTorch Imagenet Example")
 
     # Distributed ML strategy
     parser.add_argument(
-        "--strategy", "-s", type=str, choices=["ddp", "horovod", "deepspeed"], default="ddp"
+        "--strategy",
+        "-s",
+        type=str,
+        choices=["ddp", "horovod", "deepspeed"],
+        default="ddp",
     )
 
     # Data and logging
@@ -59,9 +63,13 @@
         default="./",
         help=("location of the training dataset in the local " "filesystem"),
     )
-    parser.add_argument("--log-int", type=int, default=10, help="log interval per training")
     parser.add_argument(
-        "--verbose", action=argparse.BooleanOptionalAction, help="Print parsed arguments"
+        "--log-int", type=int, default=10, help="log interval per training"
+    )
+    parser.add_argument(
+        "--verbose",
+        action=argparse.BooleanOptionalAction,
+        help="Print parsed arguments",
     )
     parser.add_argument(
         "--nworker",
@@ -70,7 +78,10 @@
         help=("number of workers in DataLoader (default: 0 -" " only main)"),
     )
     parser.add_argument(
-        "--prefetch", type=int, default=2, help="prefetch data in DataLoader (default: 2)"
+        "--prefetch",
+        type=int,
+        default=2,
+        help="prefetch data in DataLoader (default: 2)",
    )
 
     # Model
@@ -83,12 +94,20 @@ parser.add_argument(
     parser.add_argument(
         "--epochs", type=int, default=10, help="number of epochs to train (default: 10)"
     )
-    parser.add_argument("--lr", type=float, default=0.01, help="learning rate (default: 0.01)")
     parser.add_argument(
-        "--momentum", type=float, default=0.5, help="momentum in SGD optimizer (default: 0.5)"
+        "--lr", type=float, default=0.01, help="learning rate (default: 0.01)"
+    )
+    parser.add_argument(
+        "--momentum",
+        type=float,
+        default=0.5,
+        help="momentum in SGD optimizer (default: 0.5)",
     )
     parser.add_argument(
-        "--shuff", action="store_true", default=False, help="shuffle dataset (default: False)"
+        "--shuff",
+        action="store_true",
+        default=False,
+        help="shuffle dataset (default: False)",
     )
 
     # Reproducibility
@@ -148,7 +167,13 @@ def parse_params() -> argparse.Namespace:
 
 
 def train(
-    model, device, train_loader, optimizer, epoch, strategy: TorchDistributedStrategy, args
+    model,
+    device,
+    train_loader,
+    optimizer,
+    epoch,
+    strategy: TorchDistributedStrategy,
+    args,
 ):
     """
     Training function, representing an epoch.
@@ -167,7 +192,11 @@ def train(
             loss = F.nll_loss(output, target)
             loss.backward()
             optimizer.step()
-            if strategy.is_main_worker and args.log_int > 0 and batch_idx % args.log_int == 0:
+            if (
+                strategy.is_main_worker
+                and args.log_int > 0
+                and batch_idx % args.log_int == 0
+            ):
                 print(
                     f"Train epoch: {epoch} "
                     f"[{batch_idx * len(data)}/{len(train_loader.dataset)/gwsize} "
@@ -207,7 +236,9 @@ def main():
             config_params=dict(train_micro_batch_size_per_gpu=args.batch_size)
         )
     else:
-        raise NotImplementedError(f"Strategy {args.strategy} is not recognized/implemented.")
+        raise NotImplementedError(
+            f"Strategy {args.strategy} is not recognized/implemented."
+        )
     strategy.init()
 
     # Check resources availability
@@ -309,10 +340,12 @@ def main():
         print("TIMER: broadcast:", timer() - st, "s")
     print("\nDEBUG: start training")
     print("--------------------------------------------------------")
-    nnod = os.environ.get("SLURM_NNODES", "unk")
+    nnod = os.environ.get("SLURM_NNODES", "1")
     s_name = f"{args.strategy}-it"
     epoch_time_tracker = EpochTimeTracker(
-        strategy_name=s_name, save_path=f"epochtime_{s_name}_{nnod}N.csv"
+        strategy_name=s_name,
+        save_path=f"epochtime_{s_name}_{nnod}N.csv",
+        num_nodes=int(nnod),
    )
 
     et = timer()
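The last hunk above is behavioral, not just formatting: `EpochTimeTracker` now receives the node count explicitly, and the `SLURM_NNODES` fallback changes from "unk" to "1", which matters because the value is now parsed as an integer. A sketch of that effect, assuming a run outside any SLURM allocation:

```python
import os

# Outside SLURM, SLURM_NNODES is unset and the fallback is used.
nnod = os.environ.get("SLURM_NNODES", "1")

# With the old fallback "unk", int(nnod) would raise ValueError here;
# "1" keeps the new num_nodes=int(nnod) argument valid on a single node.
num_nodes = int(nnod)
print(num_nodes)  # 1
```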
1 change: 0 additions & 1 deletion tutorials/distributed-ml/torch-scaling-test/runall.sh

@@ -22,7 +22,6 @@ PYTHON_VENV="../../../.venv"
 LOG_DIR="logs_slurm"
 
 # Common options
 CMD="--nodes=$N --time=$T --account=intertwin --partition=batch slurm.sh"
-PYTHON_VENV="../../../envAI_hdfml"
 
 echo "Distributing training over $N nodes. Timeout set to: $T"
5 changes: 2 additions & 3 deletions tutorials/distributed-ml/torch-scaling-test/slurm_config.yaml

@@ -9,11 +9,11 @@ gpus_per_node: 4
 cpus_per_gpu: 4
 
 mode: single # "single", "runall" or "scaling-test" - defaults to "single"
-dist_strat: ddp # "ddp", "deepspeed" or "horovod"
+dist_strat: deepspeed # "ddp", "deepspeed" or "horovod"
 itwinai_trainer: false
 
 account: intertwin
-time: 00:10:00
+time: 00:20:00
 partition: develbooster
 
 # Keep in mind that these will be overwritten if "mode" is not "single", and that
@@ -24,7 +24,6 @@ std_out: slurm_jobs/${dist_strat}-${num_nodes}x${gpus_per_node}.out
 err_out: slurm_jobs/${dist_strat}-${num_nodes}x${gpus_per_node}.err
 job_name: tutorial-${dist_strat}-job
 
-
 python_venv: .venv
 
 # If you want to manually override the training command, comment in the following:
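The `${...}` references in `std_out`, `err_out`, and `job_name` look like OmegaConf-style interpolation against other keys in the same file, so the new `dist_strat: deepspeed` value also changes the generated log paths. A sketch of how they would resolve under that assumption; `num_nodes: 2` is hypothetical, since that key sits outside the shown hunk:

```python
from omegaconf import OmegaConf

# Hypothetical reconstruction: only dist_strat, gpus_per_node, and the
# std_out template appear in the diff itself.
cfg = OmegaConf.create(
    {
        "num_nodes": 2,
        "gpus_per_node": 4,
        "dist_strat": "deepspeed",
        "std_out": "slurm_jobs/${dist_strat}-${num_nodes}x${gpus_per_node}.out",
    }
)
print(cfg.std_out)  # slurm_jobs/deepspeed-2x4.out
```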
