Commit 72a0f37: small cleanup

jarlsondre committed Dec 18, 2024
1 parent 4301251

Showing 7 changed files with 54 additions and 24 deletions.
@@ -16,7 +16,7 @@ prefetch: 2
 
 # Model
 batch_size: 64 # micro batch size
-epochs: 3
+epochs: 10
 lr: 0.001
 momentum: 0.5
 shuff: False
4 changes: 2 additions & 2 deletions tutorials/distributed-ml/torch-scaling-test/ddp_trainer.py

@@ -30,7 +30,7 @@ def main():
     parser = get_parser()
     args = parser.parse_args()
 
-    subset_size = 5000 # limit number of examples from imagenet
+    subset_size = 5000  # limit number of examples from imagenet
     use_cuda = not args.no_cuda and torch.cuda.is_available()
     is_distributed = use_cuda and torch.cuda.device_count() > 0
     torch_seed = set_seed(args.rnd_seed, deterministic_cudnn=False)
@@ -72,7 +72,7 @@ def main():
         worker_init_fn=seed_worker,
     )
 
-    device = torch.device(f"cuda:{local_rank}" if use_cuda else "cpu", local_rank)
+    device = torch.device(f"cuda:{local_rank}" if use_cuda else "cpu")
     model = torchvision.models.resnet152().to(device)
 
     # Distributing the model to the workers
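The `torch.device` change here (repeated in the two trainers below) is a bug fix rather than style: when the device string already carries an index, PyTorch rejects an additional explicit index argument. A minimal sketch of the difference, assuming current PyTorch semantics; `local_rank = 0` is a hypothetical value for illustration:

```python
import torch

local_rank = 0  # hypothetical rank, for illustration only

# Old form: on a CUDA machine, "cuda:0" already contains an index, so
# also passing local_rank as the explicit index argument raises
# RuntimeError ("type (string) must not include an index ...").
# torch.device(f"cuda:{local_rank}", local_rank)  # would raise

# New form: encode the index in the string only.
use_cuda = torch.cuda.is_available()
device = torch.device(f"cuda:{local_rank}" if use_cuda else "cpu")
print(device)  # cuda:0 on a GPU node, cpu otherwise
```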
@@ -38,7 +38,6 @@ def main():
     is_distributed = use_cuda and torch.cuda.device_count() > 0
     # torch_prng = set_seed(args.rnd_seed, deterministic_cudnn=False)
 
-
     train_dataset = imagenet_dataset(args.data_dir, subset_size=subset_size)
     if is_distributed:
         deepspeed.init_distributed(dist_backend=args.backend)
@@ -78,10 +77,10 @@ def main():
     # worker_init_fn=seed_worker,
     # )
 
-    device = torch.device(f"cuda:{local_rank}" if use_cuda else "cpu", local_rank)
+    device = torch.device(f"cuda:{local_rank}" if use_cuda else "cpu")
 
     # Create CNN model
-    model = torchvision.models.resnet152().to(device)
+    model = torchvision.models.resnet152()
 
     # Initialize DeepSpeed and get:
     # 1) Distributed model
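Dropping `.to(device)` in the DeepSpeed trainer works because `deepspeed.initialize` moves the model to the local GPU it derives from the launcher's `LOCAL_RANK`. A sketch of the pattern, with an illustrative config dict standing in for the script's real arguments:

```python
import deepspeed
import torchvision

# The model can stay on CPU here; deepspeed.initialize places it on the
# local GPU itself. The config values below are assumptions chosen to
# mirror the tutorial's defaults, not the script's actual config.
model = torchvision.models.resnet152()
model_engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config={
        "train_micro_batch_size_per_gpu": 64,
        "optimizer": {"type": "SGD", "params": {"lr": 0.001, "momentum": 0.5}},
    },
)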
@@ -89,7 +89,7 @@ def main():
         worker_init_fn=seed_worker,
     )
 
-    device = torch.device(f"cuda:{local_rank}" if use_cuda else "cpu", local_rank)
+    device = torch.device(f"cuda:{local_rank}" if use_cuda else "cpu")
 
     # Create CNN model
     model = torchvision.models.resnet152()
59 changes: 46 additions & 13 deletions tutorials/distributed-ml/torch-scaling-test/itwinai_trainer.py

@@ -46,11 +46,15 @@ def parse_params() -> argparse.Namespace:
     >>> train.py --strategy ddp --config base-config.yaml --config foo.yaml
     """
-    parser = ItwinaiArgParser(description='PyTorch Imagenet Example')
+    parser = ItwinaiArgParser(description="PyTorch Imagenet Example")
 
     # Distributed ML strategy
     parser.add_argument(
-        "--strategy", "-s", type=str, choices=["ddp", "horovod", "deepspeed"], default="ddp"
+        "--strategy",
+        "-s",
+        type=str,
+        choices=["ddp", "horovod", "deepspeed"],
+        default="ddp",
     )
 
     # Data and logging
@@ -59,9 +63,13 @@
         default="./",
         help=("location of the training dataset in the local " "filesystem"),
     )
-    parser.add_argument("--log-int", type=int, default=10, help="log interval per training")
     parser.add_argument(
-        "--verbose", action=argparse.BooleanOptionalAction, help="Print parsed arguments"
+        "--log-int", type=int, default=10, help="log interval per training"
+    )
+    parser.add_argument(
+        "--verbose",
+        action=argparse.BooleanOptionalAction,
+        help="Print parsed arguments",
     )
     parser.add_argument(
         "--nworker",
@@ -70,7 +78,10 @@
         help=("number of workers in DataLoader (default: 0 -" " only main)"),
     )
     parser.add_argument(
-        "--prefetch", type=int, default=2, help="prefetch data in DataLoader (default: 2)"
+        "--prefetch",
+        type=int,
+        default=2,
+        help="prefetch data in DataLoader (default: 2)",
    )
 
     # Model
@@ -83,12 +94,20 @@ parser.add_argument(
     parser.add_argument(
         "--epochs", type=int, default=10, help="number of epochs to train (default: 10)"
     )
-    parser.add_argument("--lr", type=float, default=0.01, help="learning rate (default: 0.01)")
     parser.add_argument(
-        "--momentum", type=float, default=0.5, help="momentum in SGD optimizer (default: 0.5)"
+        "--lr", type=float, default=0.01, help="learning rate (default: 0.01)"
+    )
+    parser.add_argument(
+        "--momentum",
+        type=float,
+        default=0.5,
+        help="momentum in SGD optimizer (default: 0.5)",
     )
     parser.add_argument(
-        "--shuff", action="store_true", default=False, help="shuffle dataset (default: False)"
+        "--shuff",
+        action="store_true",
+        default=False,
+        help="shuffle dataset (default: False)",
     )
 
     # Reproducibility
@@ -148,7 +167,13 @@ def parse_params() -> argparse.Namespace:
 
 
 def train(
-    model, device, train_loader, optimizer, epoch, strategy: TorchDistributedStrategy, args
+    model,
+    device,
+    train_loader,
+    optimizer,
+    epoch,
+    strategy: TorchDistributedStrategy,
+    args,
 ):
     """
     Training function, representing an epoch.
@@ -167,7 +192,11 @@ def train(
             loss = F.nll_loss(output, target)
             loss.backward()
             optimizer.step()
-            if strategy.is_main_worker and args.log_int > 0 and batch_idx % args.log_int == 0:
+            if (
+                strategy.is_main_worker
+                and args.log_int > 0
+                and batch_idx % args.log_int == 0
+            ):
                 print(
                     f"Train epoch: {epoch} "
                     f"[{batch_idx * len(data)}/{len(train_loader.dataset)/gwsize} "
@@ -207,7 +236,9 @@ def main():
             config_params=dict(train_micro_batch_size_per_gpu=args.batch_size)
         )
     else:
-        raise NotImplementedError(f"Strategy {args.strategy} is not recognized/implemented.")
+        raise NotImplementedError(
+            f"Strategy {args.strategy} is not recognized/implemented."
+        )
     strategy.init()
 
     # Check resources availability
@@ -309,10 +340,12 @@ def main():
         print("TIMER: broadcast:", timer() - st, "s")
     print("\nDEBUG: start training")
     print("--------------------------------------------------------")
-    nnod = os.environ.get("SLURM_NNODES", "unk")
+    nnod = os.environ.get("SLURM_NNODES", "1")
     s_name = f"{args.strategy}-it"
     epoch_time_tracker = EpochTimeTracker(
-        strategy_name=s_name, save_path=f"epochtime_{s_name}_{nnod}N.csv"
+        strategy_name=s_name,
+        save_path=f"epochtime_{s_name}_{nnod}N.csv",
+        num_nodes=int(nnod),
    )
 
     et = timer()
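The last hunk above is behavioral, not just formatting: `EpochTimeTracker` now receives the node count explicitly, and the `SLURM_NNODES` fallback changes from "unk" to "1", which matters because the value is now parsed as an integer. A sketch of that effect, assuming a run outside any SLURM allocation:

```python
import os

# Outside SLURM, SLURM_NNODES is unset and the fallback is used.
nnod = os.environ.get("SLURM_NNODES", "1")

# With the old fallback "unk", int(nnod) would raise ValueError here;
# "1" keeps the new num_nodes=int(nnod) argument valid on a single node.
num_nodes = int(nnod)
print(num_nodes)  # 1
```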
1 change: 0 additions & 1 deletion tutorials/distributed-ml/torch-scaling-test/runall.sh

@@ -22,7 +22,6 @@ PYTHON_VENV="../../../.venv"
 LOG_DIR="logs_slurm"
 
 # Common options
 CMD="--nodes=$N --time=$T --account=intertwin --partition=batch slurm.sh"
-PYTHON_VENV="../../../envAI_hdfml"
 
 echo "Distributing training over $N nodes. Timeout set to: $T"
5 changes: 2 additions & 3 deletions tutorials/distributed-ml/torch-scaling-test/slurm_config.yaml

@@ -9,11 +9,11 @@ gpus_per_node: 4
 cpus_per_gpu: 4
 
 mode: single # "single", "runall" or "scaling-test" - defaults to "single"
-dist_strat: ddp # "ddp", "deepspeed" or "horovod"
+dist_strat: deepspeed # "ddp", "deepspeed" or "horovod"
 itwinai_trainer: false
 
 account: intertwin
-time: 00:10:00
+time: 00:20:00
 partition: develbooster
 
 # Keep in mind that these will be overwritten if "mode" is not "single", and that
@@ -24,7 +24,6 @@ std_out: slurm_jobs/${dist_strat}-${num_nodes}x${gpus_per_node}.out
 err_out: slurm_jobs/${dist_strat}-${num_nodes}x${gpus_per_node}.err
 job_name: tutorial-${dist_strat}-job
 
-
 python_venv: .venv
 
 # If you want to manually override the training command, comment in the following:
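The `${...}` references in `std_out`, `err_out`, and `job_name` look like OmegaConf-style interpolation against other keys in the same file, so the new `dist_strat: deepspeed` value also changes the generated log paths. A sketch of how they would resolve under that assumption; `num_nodes: 2` is hypothetical, since that key sits outside the shown hunk:

```python
from omegaconf import OmegaConf

# Hypothetical reconstruction: only dist_strat, gpus_per_node, and the
# std_out template appear in the diff itself.
cfg = OmegaConf.create(
    {
        "num_nodes": 2,
        "gpus_per_node": 4,
        "dist_strat": "deepspeed",
        "std_out": "slurm_jobs/${dist_strat}-${num_nodes}x${gpus_per_node}.out",
    }
)
print(cfg.std_out)  # slurm_jobs/deepspeed-2x4.out
```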
