
Commit

enable epoch time logging for tutorial
jarlsondre committed Jan 7, 2025
1 parent 003a683 commit e50bd19
Showing 9 changed files with 48 additions and 28 deletions.
2 changes: 1 addition & 1 deletion src/itwinai/loggers.py
@@ -1177,7 +1177,7 @@ class EpochTimeTracker:
"""Tracker for epoch execution time during training."""

def __init__(
-self, strategy_name: str, save_path: Union[Path, str], num_nodes: int
+self, strategy_name: str, save_path: Path | str, num_nodes: int
) -> None:
if isinstance(save_path, str):
save_path = Path(save_path)
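
For context, here is a minimal usage sketch of `EpochTimeTracker` as it is wired up by the trainer scripts changed below. The constructor arguments and the `add_epoch_time`/`save` calls mirror the diffs in this commit; the file name, node count, and loop are illustrative values only.

```python
from pathlib import Path
from timeit import default_timer as timer

from itwinai.loggers import EpochTimeTracker

# Illustrative values: the trainers below derive these from SLURM_NNODES
# and the chosen strategy name.
save_path = Path("scalability-metrics") / "epochtime_ddp-bl_2N.csv"
tracker = EpochTimeTracker(strategy_name="ddp-bl", save_path=save_path, num_nodes=2)

for epoch_idx in range(1, 4):
    epoch_start = timer()
    # ... one training epoch runs here ...
    tracker.add_epoch_time(epoch_idx, timer() - epoch_start)

tracker.save()  # writes the collected epoch times to the CSV at save_path
```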
31 changes: 14 additions & 17 deletions tutorials/distributed-ml/torch-scaling-test/README.md
@@ -103,32 +103,29 @@ python slurm.py -c slurm_config.yaml --no-retain-file --no-submit-job --mode run
## Run scaling test

If you wish to run a scaling test, you can set the `mode` to `scaling-test`. This will
-launch all the distributed strategies with different number of nodes.
+launch all the distributed strategies with different number of nodes. An example of
+this is

```bash
python slurm.py -c slurm_config.yaml --no-retain-file --no-submit-job --mode scaling-test
```

## Analyze results

-Once all jobs have completed, you can automatically generate scalability report
-using itwinai's CLI:
+Once all jobs have completed, you can generate a scalability report using the following
+command:

```bash
# First, activate you Python virtual environment
+itwinai generate-scalability-plot
+```

-# For more info run
-itwinai scalability-report --help
+To see the full list of possible arguments, type:

-# Generate a scalability report
-itwinai scalability-report --pattern="^epoch.+\.csv$" \
-    --plot-title "ResNet152 on Imagenet" --archive imagenet_results
+```bash
+itwinai generate-scalability-plot --help
```

-The last command prints to terminal the average epoch time per training
-configuration and per number of nodes, and it generated scaling test
-analysis plot, which is saved as `.png` file. This command will also
-create a `.tar.gz` archive of all the analyzed `.csv` files and
-the generated plots, allowing you to easily organize different experiments
-and reducing the risk of overwriting the logs generated during the scaling
-test.

Example of scalability plot generated by `itwinai scalability-report`:

![report](img/report.png)
@@ -12,7 +12,7 @@ data_dir: /p/scratch/intertwin/datasets/imagenet/ILSVRC2012/train/ # tmp_data/

# Subset size can be an int or None. Cannot be larger than the length of the dataset.
# If you wish to set it to "None", you must use "null" as that is what yaml expects
-subset_size: null
+subset_size: 5000
log_int: 10

# verbose: True
6 changes: 5 additions & 1 deletion tutorials/distributed-ml/torch-scaling-test/ddp_trainer.py
@@ -13,6 +13,7 @@

import os
from timeit import default_timer as timer
+from pathlib import Path

import torch
import torch.distributed as dist
@@ -82,9 +83,11 @@ def main():

if global_rank == 0:
num_nodes = os.environ.get("SLURM_NNODES", 1)
+save_dir = Path("scalability-metrics")
+save_path = save_dir / f"epochtime_ddp-bl_{num_nodes}N.csv"
epoch_time_tracker = EpochTimeTracker(
strategy_name="ddp-bl",
save_path=f"epochtime_ddp-bl_{num_nodes}N.csv",
save_path=save_path,
num_nodes=int(num_nodes),
)

@@ -107,6 +110,7 @@
if global_rank == 0:
total_time = timer() - start_time
print(f"Training finished - took {total_time:.2f}s")
+epoch_time_tracker.save()

# Clean-up
if is_distributed:
14 changes: 11 additions & 3 deletions tutorials/distributed-ml/torch-scaling-test/deepspeed_trainer.py
@@ -5,12 +5,14 @@
#
# Credit:
# - Matteo Bunino <[email protected]> - CERN
+# - Jarl Sondre Sæther <[email protected]> - CERN
# --------------------------------------------------------------------------------------

"""Scaling test of Microsoft Deepspeed on Imagenet using Resnet."""

import os
from timeit import default_timer as timer
+from pathlib import Path

import deepspeed
import torch
@@ -98,12 +100,15 @@ def main():
# Start training loop
if global_rank == 0:
num_nodes = os.environ.get("SLURM_NNODES", "1")
+save_dir = Path("scalability-metrics")
+save_path = save_dir / f"epochtime_deepspeed-bl_{num_nodes}N.csv"
epoch_time_tracker = EpochTimeTracker(
strategy_name="deepspeed-bl",
save_path=f"epochtime_deepspeed-bl_{num_nodes}N.csv",
save_path=save_path,
num_nodes=int(num_nodes),
)

start_time = timer()
start_epoch = 1
for epoch_idx in range(start_epoch, args.epochs + 1):
epoch_start_time = timer()
@@ -123,11 +128,14 @@
epoch_time_tracker.add_epoch_time(epoch_idx, epoch_elapsed_time)
print(f"[{epoch_idx}/{args.epochs}] - time: {epoch_elapsed_time:.2f}s")

if is_distributed:
dist.barrier()
if global_rank == 0:
total_time = timer() - start_time
print(f"Training finished - took {total_time:.2f}s")
+epoch_time_tracker.save()

# Clean-up
if is_distributed:
dist.barrier()
deepspeed.sys.exit()


@@ -12,6 +12,7 @@

import os
from timeit import default_timer as timer
+from pathlib import Path

import horovod.torch as hvd
import torch
@@ -114,9 +115,11 @@ def main():

if global_rank == 0:
num_nodes = os.environ.get("SLURM_NNODES", 1)
+save_dir = Path("scalability-metrics")
+save_path = save_dir / f"epochtime_horovod-bl_{num_nodes}N.csv"
epoch_time_tracker = EpochTimeTracker(
strategy_name="horovod-bl",
save_path=f"epochtime_horovod-bl_{num_nodes}N.csv",
save_path=save_path,
num_nodes=int(num_nodes),
)

@@ -143,6 +146,7 @@
if global_rank == 0:
total_time = timer() - start_time
print(f"Training finished - took {total_time:.2f}s")
+epoch_time_tracker.save()


if __name__ == "__main__":
@@ -5,6 +5,7 @@
#
# Credit:
# - Matteo Bunino <[email protected]> - CERN
+# - Jarl Sondre Sæther <[email protected]> - CERN
# --------------------------------------------------------------------------------------

"""Show how to use DDP, Horovod and DeepSpeed strategies interchangeably
@@ -16,7 +17,7 @@
import os
import sys
from timeit import default_timer as timer
-# from typing import Optional
+from pathlib import Path

# import deepspeed
import horovod.torch as hvd
@@ -137,9 +138,11 @@ def main():
if strategy.is_main_worker:
num_nodes = os.environ.get("SLURM_NNODES", 1)
strategy_name = f"{args.strategy}-it"
+save_dir = Path("scalability-metrics")
+save_path = save_dir / f"epochtime_{strategy_name}_{num_nodes}N.csv"
epoch_time_tracker = EpochTimeTracker(
strategy_name=strategy_name,
save_path=f"epochtime_{strategy_name}_{num_nodes}N.csv",
save_path=save_path,
num_nodes=int(num_nodes),
)

@@ -165,6 +168,7 @@
if global_rank == 0:
total_time = timer() - start_time
print(f"Training finished - took {total_time:.2f}s")
+epoch_time_tracker.save()

# Clean-up
if is_distributed:
4 changes: 2 additions & 2 deletions tutorials/distributed-ml/torch-scaling-test/slurm_config.yaml
@@ -9,11 +9,11 @@ gpus_per_node: 4
cpus_per_gpu: 4

mode: single # "single", "runall" or "scaling-test" - defaults to "single"
-dist_strat: horovod # "ddp", "deepspeed" or "horovod"
+dist_strat: ddp # "ddp", "deepspeed" or "horovod"
itwinai_trainer: false

account: intertwin
-time: 00:10:00
+time: 00:15:00
partition: develbooster

# Keep in mind that these will be overwritten if "mode" is not "single", and that
3 changes: 3 additions & 0 deletions tutorials/distributed-ml/torch-scaling-test/utils.py
@@ -157,4 +157,7 @@ def get_parser() -> ItwinaiArgParser:
default=1.0,
help=("apply gradient pre-divide factor in optimizer " "(default: 1.0)"),
)
+parser.add_argument(
+    "--strategy", "-s", type=str, choices=["ddp", "horovod", "deepspeed"], default="ddp"
+)
return parser
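
The new `--strategy` flag is what the combined itwinai trainer uses to label its metrics (see `strategy_name = f"{args.strategy}-it"` in the diff above). Below is a hypothetical sketch of consuming it, assuming `ItwinaiArgParser` exposes the usual argparse `parse_args` interface and that the script sits next to this tutorial's `utils.py`:

```python
from utils import get_parser  # the tutorial's parser shown above (assumed importable)

args = get_parser().parse_args()

# Label metrics with the chosen strategy, e.g. "ddp-it", "horovod-it" or
# "deepspeed-it", matching the naming used in the itwinai trainer diff above.
strategy_name = f"{args.strategy}-it"
print(f"Selected distributed strategy: {strategy_name}")
```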
