diff --git a/src/itwinai/loggers.py b/src/itwinai/loggers.py
index 6e89fbcb..6cfb9f9d 100644
--- a/src/itwinai/loggers.py
+++ b/src/itwinai/loggers.py
@@ -1177,7 +1177,7 @@ class EpochTimeTracker:
     """Tracker for epoch execution time during training."""
 
     def __init__(
-        self, strategy_name: str, save_path: Union[Path, str], num_nodes: int
+        self, strategy_name: str, save_path: Path | str, num_nodes: int
     ) -> None:
         if isinstance(save_path, str):
            save_path = Path(save_path)
diff --git a/tutorials/distributed-ml/torch-scaling-test/README.md b/tutorials/distributed-ml/torch-scaling-test/README.md
index 550e28e2..3e23e20a 100644
--- a/tutorials/distributed-ml/torch-scaling-test/README.md
+++ b/tutorials/distributed-ml/torch-scaling-test/README.md
@@ -103,32 +103,29 @@ python slurm.py -c slurm_config.yaml --no-retain-file --no-submit-job --mode run
 ## Run scaling test
 
 If you wish to run a scaling test, you can set the `mode` to `scaling-test`. This will
-launch all the distributed strategies with different number of nodes.
+launch all the distributed strategies with different numbers of nodes. An example of
+this is:
+
+```bash
+python slurm.py -c slurm_config.yaml --no-retain-file --no-submit-job --mode scaling-test
+```
 
 ## Analyze results
 
-Once all jobs have completed, you can automatically generate scalability report
-using itwinai's CLI:
+Once all jobs have completed, you can generate a scalability report using the following
+command:
 
 ```bash
-# First, activate you Python virtual environment
+itwinai generate-scalability-plot
+
+```
 
-# For more info run
-itwinai scalability-report --help
+To see the full list of possible arguments, type:
 
-# Generate a scalability report
-itwinai scalability-report --pattern="^epoch.+\.csv$" \
-    --plot-title "ResNet152 on Imagenet" --archive imagenet_results
+```bash
+itwinai generate-scalability-plot --help
 ```
 
-The last command prints to terminal the average epoch time per training
-configuration and per number of nodes, and it generated scaling test
-analysis plot, which is saved as `.png` file. This command will also
-create a `.tar.gz` archive of all the analyzed `.csv` files and
-the generated plots, allowing you to easily organize different experiments
-and reducing the risk of overwriting the logs generated during the scaling
-test.
-
 Example of scalability plot generated by `itwinai scalability-report`:
 
 ![report](img/report.png)
diff --git a/tutorials/distributed-ml/torch-scaling-test/config/base.yaml b/tutorials/distributed-ml/torch-scaling-test/config/base.yaml
index 5de3ebd7..3ffd1760 100644
--- a/tutorials/distributed-ml/torch-scaling-test/config/base.yaml
+++ b/tutorials/distributed-ml/torch-scaling-test/config/base.yaml
@@ -12,7 +12,7 @@ data_dir: /p/scratch/intertwin/datasets/imagenet/ILSVRC2012/train/ # tmp_data/
 
 # Subset size can be an int or None. Cannot be larger than the length of the dataset.
 # If you wish to set it to "None", you must use "null" as that is what yaml expects
-subset_size: null
+subset_size: 5000
 
 log_int: 10
 # verbose: True
diff --git a/tutorials/distributed-ml/torch-scaling-test/ddp_trainer.py b/tutorials/distributed-ml/torch-scaling-test/ddp_trainer.py
index 36a55132..23962075 100755
--- a/tutorials/distributed-ml/torch-scaling-test/ddp_trainer.py
+++ b/tutorials/distributed-ml/torch-scaling-test/ddp_trainer.py
@@ -13,6 +13,7 @@
 
 import os
 from timeit import default_timer as timer
+from pathlib import Path
 
 import torch
 import torch.distributed as dist
@@ -82,9 +83,11 @@
 
     if global_rank == 0:
         num_nodes = os.environ.get("SLURM_NNODES", 1)
+        save_dir = Path("scalability-metrics")
+        save_path = save_dir / f"epochtime_ddp-bl_{num_nodes}N.csv"
         epoch_time_tracker = EpochTimeTracker(
             strategy_name="ddp-bl",
-            save_path=f"epochtime_ddp-bl_{num_nodes}N.csv",
+            save_path=save_path,
             num_nodes=int(num_nodes),
         )
@@ -107,6 +110,7 @@
     if global_rank == 0:
         total_time = timer() - start_time
         print(f"Training finished - took {total_time:.2f}s")
+        epoch_time_tracker.save()
 
     # Clean-up
     if is_distributed:
diff --git a/tutorials/distributed-ml/torch-scaling-test/deepspeed_trainer.py b/tutorials/distributed-ml/torch-scaling-test/deepspeed_trainer.py
index 58b9ee98..b74dd434 100644
--- a/tutorials/distributed-ml/torch-scaling-test/deepspeed_trainer.py
+++ b/tutorials/distributed-ml/torch-scaling-test/deepspeed_trainer.py
@@ -5,12 +5,14 @@
 #
 # Credit:
 # - Matteo Bunino - CERN
+# - Jarl Sondre Sæther - CERN
 # --------------------------------------------------------------------------------------
 
 """Scaling test of Microsoft Deepspeed on Imagenet using Resnet."""
 
 import os
 from timeit import default_timer as timer
+from pathlib import Path
 
 import deepspeed
 import torch
@@ -98,12 +100,15 @@
     # Start training loop
     if global_rank == 0:
         num_nodes = os.environ.get("SLURM_NNODES", "1")
+        save_dir = Path("scalability-metrics")
+        save_path = save_dir / f"epochtime_deepspeed-bl_{num_nodes}N.csv"
         epoch_time_tracker = EpochTimeTracker(
             strategy_name="deepspeed-bl",
-            save_path=f"epochtime_deepspeed-bl_{num_nodes}N.csv",
+            save_path=save_path,
             num_nodes=int(num_nodes),
         )
 
+    start_time = timer()
     start_epoch = 1
     for epoch_idx in range(start_epoch, args.epochs + 1):
         epoch_start_time = timer()
@@ -123,11 +128,14 @@
             epoch_time_tracker.add_epoch_time(epoch_idx, epoch_elapsed_time)
             print(f"[{epoch_idx}/{args.epochs}] - time: {epoch_elapsed_time:.2f}s")
 
-    if is_distributed:
-        dist.barrier()
+    if global_rank == 0:
+        total_time = timer() - start_time
+        print(f"Training finished - took {total_time:.2f}s")
+        epoch_time_tracker.save()
 
     # Clean-up
     if is_distributed:
+        dist.barrier()
         deepspeed.sys.exit()
diff --git a/tutorials/distributed-ml/torch-scaling-test/horovod_trainer.py b/tutorials/distributed-ml/torch-scaling-test/horovod_trainer.py
index 6ec1c44f..d51ce46c 100755
--- a/tutorials/distributed-ml/torch-scaling-test/horovod_trainer.py
+++ b/tutorials/distributed-ml/torch-scaling-test/horovod_trainer.py
@@ -12,6 +12,7 @@
 
 import os
 from timeit import default_timer as timer
+from pathlib import Path
 
 import horovod.torch as hvd
 import torch
@@ -114,9 +115,11 @@
 
     if global_rank == 0:
         num_nodes = os.environ.get("SLURM_NNODES", 1)
+        save_dir = Path("scalability-metrics")
+        save_path = save_dir / f"epochtime_horovod-bl_{num_nodes}N.csv"
         epoch_time_tracker = EpochTimeTracker(
             strategy_name="horovod-bl",
-            save_path=f"epochtime_horovod-bl_{num_nodes}N.csv",
+            save_path=save_path,
             num_nodes=int(num_nodes),
         )
@@ -143,6 +146,7 @@
     if global_rank == 0:
         total_time = timer() - start_time
         print(f"Training finished - took {total_time:.2f}s")
+        epoch_time_tracker.save()
 
 
 if __name__ == "__main__":
diff --git a/tutorials/distributed-ml/torch-scaling-test/itwinai_trainer.py b/tutorials/distributed-ml/torch-scaling-test/itwinai_trainer.py
index bc2dc8e9..a02a2fb6 100644
--- a/tutorials/distributed-ml/torch-scaling-test/itwinai_trainer.py
+++ b/tutorials/distributed-ml/torch-scaling-test/itwinai_trainer.py
@@ -5,6 +5,7 @@
 #
 # Credit:
 # - Matteo Bunino - CERN
+# - Jarl Sondre Sæther - CERN
 # --------------------------------------------------------------------------------------
 
 """Show how to use DDP, Horovod and DeepSpeed strategies interchangeably
@@ -16,7 +17,7 @@ import os
 import sys
 from timeit import default_timer as timer
 
-# from typing import Optional
+from pathlib import Path
 
 # import deepspeed
 import horovod.torch as hvd
@@ -137,9 +138,11 @@
     if strategy.is_main_worker:
         num_nodes = os.environ.get("SLURM_NNODES", 1)
         strategy_name = f"{args.strategy}-it"
+        save_dir = Path("scalability-metrics")
+        save_path = save_dir / f"epochtime_{strategy_name}_{num_nodes}N.csv"
         epoch_time_tracker = EpochTimeTracker(
             strategy_name=strategy_name,
-            save_path=f"epochtime_{strategy_name}_{num_nodes}N.csv",
+            save_path=save_path,
             num_nodes=int(num_nodes),
         )
@@ -165,6 +168,7 @@
     if global_rank == 0:
         total_time = timer() - start_time
         print(f"Training finished - took {total_time:.2f}s")
+        epoch_time_tracker.save()
 
     # Clean-up
     if is_distributed:
diff --git a/tutorials/distributed-ml/torch-scaling-test/slurm_config.yaml b/tutorials/distributed-ml/torch-scaling-test/slurm_config.yaml
index aabaa7dc..3ca17247 100644
--- a/tutorials/distributed-ml/torch-scaling-test/slurm_config.yaml
+++ b/tutorials/distributed-ml/torch-scaling-test/slurm_config.yaml
@@ -9,11 +9,11 @@ gpus_per_node: 4
 cpus_per_gpu: 4
 
 mode: single # "single", "runall" or "scaling-test" - defaults to "single"
-dist_strat: horovod # "ddp", "deepspeed" or "horovod"
+dist_strat: ddp # "ddp", "deepspeed" or "horovod"
 itwinai_trainer: false
 
 account: intertwin
-time: 00:10:00
+time: 00:15:00
 partition: develbooster
 
 # Keep in mind that these will be overwritten if "mode" is not "single", and that
diff --git a/tutorials/distributed-ml/torch-scaling-test/utils.py b/tutorials/distributed-ml/torch-scaling-test/utils.py
index a1b5a6f8..e29beb55 100644
--- a/tutorials/distributed-ml/torch-scaling-test/utils.py
+++ b/tutorials/distributed-ml/torch-scaling-test/utils.py
@@ -157,4 +157,7 @@
         default=1.0,
         help=("apply gradient pre-divide factor in optimizer " "(default: 1.0)"),
     )
+    parser.add_argument(
+        "--strategy", "-s", type=str, choices=["ddp", "horovod", "deepspeed"], default="ddp"
+    )
     return parser