
Commit

enable epoch time logging for tutorial
jarlsondre committed Jan 7, 2025
1 parent 003a683 commit e50bd19
Showing 9 changed files with 48 additions and 28 deletions.
2 changes: 1 addition & 1 deletion src/itwinai/loggers.py
@@ -1177,7 +1177,7 @@ class EpochTimeTracker:
"""Tracker for epoch execution time during training."""

def __init__(
-self, strategy_name: str, save_path: Union[Path, str], num_nodes: int
+self, strategy_name: str, save_path: Path | str, num_nodes: int
) -> None:
if isinstance(save_path, str):
save_path = Path(save_path)
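
For context, here is a minimal usage sketch of `EpochTimeTracker` as it is wired up by the trainer scripts changed below. The constructor arguments and the `add_epoch_time`/`save` calls mirror the diffs in this commit; the file name, node count, and loop are illustrative values only.

```python
from pathlib import Path
from timeit import default_timer as timer

from itwinai.loggers import EpochTimeTracker

# Illustrative values: the trainers below derive these from SLURM_NNODES
# and the chosen strategy name.
save_path = Path("scalability-metrics") / "epochtime_ddp-bl_2N.csv"
tracker = EpochTimeTracker(strategy_name="ddp-bl", save_path=save_path, num_nodes=2)

for epoch_idx in range(1, 4):
    epoch_start = timer()
    # ... one training epoch runs here ...
    tracker.add_epoch_time(epoch_idx, timer() - epoch_start)

tracker.save()  # writes the collected epoch times to the CSV at save_path
```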
31 changes: 14 additions & 17 deletions tutorials/distributed-ml/torch-scaling-test/README.md
@@ -103,32 +103,29 @@ python slurm.py -c slurm_config.yaml --no-retain-file --no-submit-job --mode run
## Run scaling test

If you wish to run a scaling test, you can set the `mode` to `scaling-test`. This will
-launch all the distributed strategies with different number of nodes.
+launch all the distributed strategies with different number of nodes. An example of
+this is

```bash
python slurm.py -c slurm_config.yaml --no-retain-file --no-submit-job --mode scaling-test
```

## Analyze results

-Once all jobs have completed, you can automatically generate scalability report
-using itwinai's CLI:
+Once all jobs have completed, you can generate a scalability report using the following
+command:

```bash
# First, activate you Python virtual environment
+itwinai generate-scalability-plot
+```

-# For more info run
-itwinai scalability-report --help
+To see the full list of possible arguments, type:

-# Generate a scalability report
-itwinai scalability-report --pattern="^epoch.+\.csv$" \
-    --plot-title "ResNet152 on Imagenet" --archive imagenet_results
+```bash
+itwinai generate-scalability-plot --help
```

-The last command prints to terminal the average epoch time per training
-configuration and per number of nodes, and it generated scaling test
-analysis plot, which is saved as `.png` file. This command will also
-create a `.tar.gz` archive of all the analyzed `.csv` files and
-the generated plots, allowing you to easily organize different experiments
-and reducing the risk of overwriting the logs generated during the scaling
-test.

Example of scalability plot generated by `itwinai scalability-report`:

![report](img/report.png)
@@ -12,7 +12,7 @@ data_dir: /p/scratch/intertwin/datasets/imagenet/ILSVRC2012/train/ # tmp_data/

# Subset size can be an int or None. Cannot be larger than the length of the dataset.
# If you wish to set it to "None", you must use "null" as that is what yaml expects
-subset_size: null
+subset_size: 5000
log_int: 10

# verbose: True
6 changes: 5 additions & 1 deletion tutorials/distributed-ml/torch-scaling-test/ddp_trainer.py
@@ -13,6 +13,7 @@

import os
from timeit import default_timer as timer
+from pathlib import Path

import torch
import torch.distributed as dist
@@ -82,9 +83,11 @@ def main():

if global_rank == 0:
num_nodes = os.environ.get("SLURM_NNODES", 1)
+save_dir = Path("scalability-metrics")
+save_path = save_dir / f"epochtime_ddp-bl_{num_nodes}N.csv"
epoch_time_tracker = EpochTimeTracker(
strategy_name="ddp-bl",
save_path=f"epochtime_ddp-bl_{num_nodes}N.csv",
save_path=save_path,
num_nodes=int(num_nodes),
)

@@ -107,6 +110,7 @@
if global_rank == 0:
total_time = timer() - start_time
print(f"Training finished - took {total_time:.2f}s")
+epoch_time_tracker.save()

# Clean-up
if is_distributed:
14 changes: 11 additions & 3 deletions tutorials/distributed-ml/torch-scaling-test/deepspeed_trainer.py
@@ -5,12 +5,14 @@
#
# Credit:
# - Matteo Bunino <[email protected]> - CERN
+# - Jarl Sondre Sæther <[email protected]> - CERN
# --------------------------------------------------------------------------------------

"""Scaling test of Microsoft Deepspeed on Imagenet using Resnet."""

import os
from timeit import default_timer as timer
+from pathlib import Path

import deepspeed
import torch
@@ -98,12 +100,15 @@ def main():
# Start training loop
if global_rank == 0:
num_nodes = os.environ.get("SLURM_NNODES", "1")
+save_dir = Path("scalability-metrics")
+save_path = save_dir / f"epochtime_deepspeed-bl_{num_nodes}N.csv"
epoch_time_tracker = EpochTimeTracker(
strategy_name="deepspeed-bl",
save_path=f"epochtime_deepspeed-bl_{num_nodes}N.csv",
save_path=save_path,
num_nodes=int(num_nodes),
)

start_time = timer()
start_epoch = 1
for epoch_idx in range(start_epoch, args.epochs + 1):
epoch_start_time = timer()
@@ -123,11 +128,14 @@
epoch_time_tracker.add_epoch_time(epoch_idx, epoch_elapsed_time)
print(f"[{epoch_idx}/{args.epochs}] - time: {epoch_elapsed_time:.2f}s")

if is_distributed:
dist.barrier()
if global_rank == 0:
total_time = timer() - start_time
print(f"Training finished - took {total_time:.2f}s")
+epoch_time_tracker.save()

# Clean-up
if is_distributed:
dist.barrier()
deepspeed.sys.exit()


@@ -12,6 +12,7 @@

import os
from timeit import default_timer as timer
+from pathlib import Path

import horovod.torch as hvd
import torch
@@ -114,9 +115,11 @@ def main():

if global_rank == 0:
num_nodes = os.environ.get("SLURM_NNODES", 1)
+save_dir = Path("scalability-metrics")
+save_path = save_dir / f"epochtime_horovod-bl_{num_nodes}N.csv"
epoch_time_tracker = EpochTimeTracker(
strategy_name="horovod-bl",
save_path=f"epochtime_horovod-bl_{num_nodes}N.csv",
save_path=save_path,
num_nodes=int(num_nodes),
)

@@ -143,6 +146,7 @@
if global_rank == 0:
total_time = timer() - start_time
print(f"Training finished - took {total_time:.2f}s")
+epoch_time_tracker.save()


if __name__ == "__main__":
@@ -5,6 +5,7 @@
#
# Credit:
# - Matteo Bunino <[email protected]> - CERN
+# - Jarl Sondre Sæther <[email protected]> - CERN
# --------------------------------------------------------------------------------------

"""Show how to use DDP, Horovod and DeepSpeed strategies interchangeably
@@ -16,7 +17,7 @@
import os
import sys
from timeit import default_timer as timer
-# from typing import Optional
+from pathlib import Path

# import deepspeed
import horovod.torch as hvd
@@ -137,9 +138,11 @@ def main():
if strategy.is_main_worker:
num_nodes = os.environ.get("SLURM_NNODES", 1)
strategy_name = f"{args.strategy}-it"
+save_dir = Path("scalability-metrics")
+save_path = save_dir / f"epochtime_{strategy_name}_{num_nodes}N.csv"
epoch_time_tracker = EpochTimeTracker(
strategy_name=strategy_name,
save_path=f"epochtime_{strategy_name}_{num_nodes}N.csv",
save_path=save_path,
num_nodes=int(num_nodes),
)

@@ -165,6 +168,7 @@
if global_rank == 0:
total_time = timer() - start_time
print(f"Training finished - took {total_time:.2f}s")
+epoch_time_tracker.save()

# Clean-up
if is_distributed:
4 changes: 2 additions & 2 deletions tutorials/distributed-ml/torch-scaling-test/slurm_config.yaml
@@ -9,11 +9,11 @@ gpus_per_node: 4
cpus_per_gpu: 4

mode: single # "single", "runall" or "scaling-test" - defaults to "single"
-dist_strat: horovod # "ddp", "deepspeed" or "horovod"
+dist_strat: ddp # "ddp", "deepspeed" or "horovod"
itwinai_trainer: false

account: intertwin
-time: 00:10:00
+time: 00:15:00
partition: develbooster

# Keep in mind that these will be overwritten if "mode" is not "single", and that
3 changes: 3 additions & 0 deletions tutorials/distributed-ml/torch-scaling-test/utils.py
@@ -157,4 +157,7 @@ def get_parser() -> ItwinaiArgParser:
default=1.0,
help=("apply gradient pre-divide factor in optimizer " "(default: 1.0)"),
)
+parser.add_argument(
+    "--strategy", "-s", type=str, choices=["ddp", "horovod", "deepspeed"], default="ddp"
+)
return parser
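
The new `--strategy` flag is what the combined itwinai trainer uses to label its metrics (see `strategy_name = f"{args.strategy}-it"` in the diff above). Below is a hypothetical sketch of consuming it, assuming `ItwinaiArgParser` exposes the usual argparse `parse_args` interface and that the script sits next to this tutorial's `utils.py`:

```python
from utils import get_parser  # the tutorial's parser shown above (assumed importable)

args = get_parser().parse_args()

# Label metrics with the chosen strategy, e.g. "ddp-it", "horovod-it" or
# "deepspeed-it", matching the naming used in the itwinai trainer diff above.
strategy_name = f"{args.strategy}-it"
print(f"Selected distributed strategy: {strategy_name}")
```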
