
Commit

some cleanup
jarlsondre committed Dec 17, 2024
1 parent 9f7b1ab commit 4301251
Showing 9 changed files with 61 additions and 24 deletions.
3 changes: 2 additions & 1 deletion src/itwinai/torch/trainer.py
@@ -422,7 +422,8 @@ def set_epoch(self, epoch: int) -> None:
         Args:
             epoch (int): epoch number, from 0 to ``epochs-1``.
         """
-        if self.profiler is not None:
+        if self.profiler is not None and epoch > 0:
+            # We don't want to start stepping until after the first epoch
             self.profiler.step()
         self._set_epoch_dataloaders(epoch)

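For context on the epoch > 0 guard above: calling profiler.step() only from the second epoch onward can also be expressed with a torch.profiler warmup schedule. The sketch below is illustrative only and is not the itwinai trainer code; train_one_epoch is a hypothetical stand-in for the real training loop.

import torch
from torch.profiler import ProfilerActivity, profile, schedule


def train_one_epoch() -> None:
    # Placeholder workload standing in for a real training loop.
    x = torch.randn(256, 256)
    (x @ x).sum().item()


prof = profile(
    activities=[ProfilerActivity.CPU],
    # Skip epoch 0 entirely, warm up on epoch 1, record the next three epochs.
    schedule=schedule(wait=1, warmup=1, active=3),
)

with prof:
    for epoch in range(6):
        train_one_epoch()
        prof.step()  # stepped every epoch; the schedule decides what is recorded
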
@@ -10,7 +10,7 @@
 # Data and logging
 data_dir: /p/scratch/intertwin/datasets/imagenet/ILSVRC2012/train/ # tmp_data/
 log_int: 10
-verbose: True
+# verbose: True
 nworker: 4 # num workers dataloader
 prefetch: 2

2 changes: 1 addition & 1 deletion tutorials/distributed-ml/torch-scaling-test/ddp_trainer.py
@@ -20,7 +20,7 @@
 import torchvision
 from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
-from utils import imagenet_dataset, get_parser, train_epoch
+from utils import get_parser, imagenet_dataset, train_epoch
 
 from itwinai.loggers import EpochTimeTracker
 from itwinai.torch.reproducibility import seed_worker, set_seed
28 changes: 15 additions & 13 deletions tutorials/distributed-ml/torch-scaling-test/deepspeed_trainer.py
@@ -12,17 +12,18 @@
 import os
 from timeit import default_timer as timer
 
-
 import deepspeed
 import torch
 import torch.distributed as dist
 import torchvision
-from torch.utils.data import DataLoader
+
+# from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
-from utils import imagenet_dataset, get_parser, train_epoch
+from utils import get_parser, imagenet_dataset, train_epoch
 
 from itwinai.loggers import EpochTimeTracker
-from itwinai.torch.reproducibility import set_seed
+
+# from itwinai.torch.reproducibility import set_seed
 
 
 def main():
@@ -32,20 +33,19 @@ def main():
     args = parser.parse_args()
 
     # Check resources availability
-    subset_size = 5000 # limit number of examples from imagenet
+    subset_size = 5000  # limit number of examples from imagenet
     use_cuda = not args.no_cuda and torch.cuda.is_available()
     is_distributed = use_cuda and torch.cuda.device_count() > 0
-    torch_prng = set_seed(args.rnd_seed, deterministic_cudnn=False)
+    # torch_prng = set_seed(args.rnd_seed, deterministic_cudnn=False)
 
     st = timer()
 
     train_dataset = imagenet_dataset(args.data_dir, subset_size=subset_size)
     if is_distributed:
         deepspeed.init_distributed(dist_backend=args.backend)
-
-        local_world_size = torch.cuda.device_count()
+        local_world_size = torch.cuda.device_count()
         global_rank = dist.get_rank()
-        local_rank = dist.get_rank() % local_world_size
+        local_rank = dist.get_rank() % local_world_size
 
     shuffle = args.shuff and args.rnd_seed is None
     # pin_memory=True
@@ -114,16 +114,19 @@ def main():
     )
 
     start_epoch = 1
-    for epoch in range(start_epoch, args.epochs + 1):
+    for epoch_idx in range(start_epoch, args.epochs + 1):
         epoch_start_time = timer()
         if is_distributed:
             # Inform the sampler that a new epoch started: shuffle
             # may be needed
-            train_sampler.set_epoch(epoch)
+            train_sampler.set_epoch(epoch_idx)
 
         # Training
         train_epoch(
-            model=distrib_model, device=device, train_loader=deepspeed_train_loader, optimizer=optimizer
+            model=distrib_model,
+            device=device,
+            train_loader=deepspeed_train_loader,
+            optimizer=optimizer,
         )
 
         if global_rank == 0:
@@ -134,7 +137,6 @@ def main():
         if is_distributed:
             dist.barrier()
 
-
     # Clean-up
     if is_distributed:
         deepspeed.sys.exit()
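The train_sampler.set_epoch(epoch_idx) call above is what makes DistributedSampler reshuffle the data differently each epoch. A minimal, self-contained sketch of that behaviour (illustrative only; num_replicas and rank are hard-coded here, whereas the tutorial obtains them from the DeepSpeed-initialised process group):

import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

dataset = TensorDataset(torch.arange(16))
# Hard-coded world size and rank purely for illustration.
sampler = DistributedSampler(dataset, num_replicas=2, rank=0, shuffle=True)
loader = DataLoader(dataset, sampler=sampler, batch_size=4)

for epoch in range(2):
    sampler.set_epoch(epoch)  # changes the shuffling seed for this epoch
    order = [int(x) for batch in loader for x in batch[0]]
    print(f"epoch {epoch}: {order}")
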
@@ -19,7 +19,7 @@
 import torchvision
 from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
-from utils import imagenet_dataset, get_parser, train_epoch
+from utils import get_parser, imagenet_dataset, train_epoch
 
 from itwinai.loggers import EpochTimeTracker
 from itwinai.torch.reproducibility import seed_worker, set_seed
35 changes: 35 additions & 0 deletions tutorials/distributed-ml/torch-scaling-test/slurm_config.yaml
@@ -0,0 +1,35 @@
# If you use this config in the runall or scaling-test modes, keep in mind that the
# strategies will change, as well as the number of nodes.
#
# Default arguments can be seen in src/itwinai/slurm/utils.py
#
num_nodes: 1
num_tasks_per_node: 1
gpus_per_node: 4
cpus_per_gpu: 4

mode: single # "single", "runall" or "scaling-test" - defaults to "single"
dist_strat: ddp # "ddp", "deepspeed" or "horovod"
itwinai_trainer: false

account: intertwin
time: 00:10:00
partition: develbooster

# Keep in mind that these will be overwritten if "mode" is not "single", and that
# if you override dist_strat on the CLI, these values will already have been
# resolved and thus might not match. We therefore suggest changing dist_strat only
# in the config and avoiding CLI overrides.
std_out: slurm_jobs/${dist_strat}-${num_nodes}x${gpus_per_node}.out
err_out: slurm_jobs/${dist_strat}-${num_nodes}x${gpus_per_node}.err
job_name: tutorial-${dist_strat}-job


python_venv: .venv

# If you want to manually override the training command, uncomment the following:
# training_cmd: |
# $(which itwinai) exec-pipeline \
# --config ${config_file} \
# --pipe-key ${pipe_key} \
# -o strategy=${dist_strat} \
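
The ${...} references in std_out, err_out and job_name are resolved against the other keys of this file. A minimal sketch of that kind of interpolation, under the assumption that an OmegaConf-style parser is used (the actual resolution happens in the itwinai SLURM utilities and may differ):

from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "num_nodes": 1,
        "gpus_per_node": 4,
        "dist_strat": "ddp",
        "std_out": "slurm_jobs/${dist_strat}-${num_nodes}x${gpus_per_node}.out",
    }
)
# Resolving the interpolation yields "slurm_jobs/ddp-1x4.out".
print(OmegaConf.to_container(cfg, resolve=True)["std_out"])
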
6 changes: 2 additions & 4 deletions tutorials/distributed-ml/torch-scaling-test/utils.py
@@ -7,8 +7,6 @@
 # - Matteo Bunino <[email protected]> - CERN
 # - Jarl Sondre Sæther <[email protected]> - CERN
 # --------------------------------------------------------------------------------------
-import argparse
-
 import torch.nn as nn
 import torch.nn.functional as F
 from torch import device
@@ -35,11 +33,11 @@ def imagenet_dataset(data_root: str, subset_size: int | None = None):
     )
     imagenet = datasets.ImageFolder(root=data_root, transform=transform)
 
-    if subset_size is None:
+    if subset_size is None:
         # We do this because we always want to return an instance of a subset, to make
         # everything as consistent as possible
         subset_size = len(imagenet)
-    if subset_size > len(imagenet):
+    if subset_size > len(imagenet):
         raise ValueError("Limit higher than the total length of the dataset")
 
     return Subset(imagenet, range(subset_size))
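For reference, a minimal usage sketch of the imagenet_dataset helper changed above; the data path is taken from the tutorial config earlier in this commit, and the DataLoader settings are illustrative:

from torch.utils.data import DataLoader
from utils import imagenet_dataset

train_set = imagenet_dataset(
    "/p/scratch/intertwin/datasets/imagenet/ILSVRC2012/train/", subset_size=5000
)
train_loader = DataLoader(train_set, batch_size=64, shuffle=True, num_workers=4)
print(f"Training on {len(train_set)} images")
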
4 changes: 2 additions & 2 deletions use-cases/eurac/slurm_config.yaml
@@ -4,10 +4,10 @@
 # Default arguments can be seen in src/itwinai/slurm/utils.py
 
 mode: single # "single", "runall" or "scaling-test" - defaults to "single"
-dist_strat: horovod # "ddp", "deepspeed" or "horovod"
+dist_strat: ddp # "ddp", "deepspeed" or "horovod"
 
 account: intertwin
-time: 00:11:11
+time: 00:05:00
 partition: develbooster
 
 # Keep in mind that these will be overwritten if "mode" is not "single", and that
3 changes: 2 additions & 1 deletion use-cases/eurac/trainer.py
@@ -147,7 +147,8 @@ def create_model_loss_optimizer(self) -> None:
         )
 
     def set_epoch(self, epoch: int):
-        if self.profiler is not None:
+        if self.profiler is not None and epoch > 0:
+            # We don't want to start stepping until after the first epoch
             self.profiler.step()
 
         if self.strategy.is_distributed:
