ADD: distributed tooling and examples
Showing 10 changed files with 765 additions and 54 deletions.
@@ -0,0 +1,68 @@
from typing import Tuple
import abc

from launcher import Launcher
from strategy import Strategy, DDPStrategy
from launcher_factory import TorchElasticLauncherFactory


class Assembler(abc.ABC):
    """Abstract Assembler class."""


class DistributedTooling(Assembler):
    """
    Assembles a set of objects used to enable distributed ML.
    Suggests working presets of Launcher and Strategy, providing
    an easy entry point for the end user.
    """

    def __init__(self, n_workers_per_node: int = 1) -> None:
        super().__init__()
        self.n_workers_per_node = n_workers_per_node

    def getTools(self, strategy: str) -> Tuple[Launcher, Strategy]:
        if strategy == 'ddp':
            return self.getTorchDDPTools()
        if strategy == 'deepspeed':
            return self.getDeepSpeedTools()
        if strategy == 'horovod':
            return self.getHorovodTools()
        raise ValueError(f"Unrecognized strategy={strategy}")

    def getTorchDDPTools(self) -> Tuple[Launcher, Strategy]:
        """
        Returns a suggested preset of Launcher + Strategy
        for torch distributed data parallel.
        """
        import torch
        if not torch.cuda.is_available():
            raise RuntimeError(
                "Torch DDP cannot be used. GPUs not available."
            )
        if not torch.cuda.device_count() > 1:
            raise RuntimeError(
                "Torch DDP cannot be used. Only one GPU is available."
            )
        launcher_builder = TorchElasticLauncherFactory()
        elastic_launcher = launcher_builder.createLauncher(
            n_workers_per_node=self.n_workers_per_node
        )
        strategy = DDPStrategy(backend='nccl')
        return elastic_launcher, strategy

    def getDeepSpeedTools(self) -> Tuple[Launcher, Strategy]:
        """
        Returns a suggested preset of Launcher + Strategy
        for DeepSpeed distributed ML.
        """
        # TODO: complete
        raise NotImplementedError

    def getHorovodTools(self) -> Tuple[Launcher, Strategy]:
        """
        Returns a suggested preset of Launcher + Strategy
        for Horovod distributed ML.
        """
        # TODO: complete
        raise NotImplementedError
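
For context, a minimal usage sketch of the assembler above (not part of the commit). Only DistributedTooling.getTools() comes from the file; the importing module name and the surrounding script are assumptions.

# Hypothetical usage sketch: request the suggested Torch DDP preset on a
# multi-GPU node. The module name 'assembler' is assumed, not from the diff.
from assembler import DistributedTooling

tooling = DistributedTooling(n_workers_per_node=4)
# Raises RuntimeError unless CUDA is available with more than one GPU.
launcher, strategy = tooling.getTools('ddp')
print(type(launcher).__name__, type(strategy).__name__)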
@@ -0,0 +1,85 @@
"""
Run this with torchrun
"""

import os

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset

from strategy import Strategy, DDPStrategy, HorovodStrategy


class UniformRndDataset(Dataset):
    """Dummy dataset of uniformly random tensors."""

    def __init__(self, x_size: int, y_size: int, len: int = 100):
        super().__init__()
        self.x_size = x_size
        self.y_size = y_size
        self.len = len

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        return torch.rand(self.x_size), torch.rand(self.y_size)


def trainer_entrypoint_fn(a, strategy: Strategy):
    """Dummy training function."""
    strategy.setup()
    print(f"{a}: {os.environ.get('RANK')} {os.environ.get('LOCAL_RANK')} "
          f"{os.environ.get('MASTER_ADDR')} {os.environ.get('MASTER_PORT')}")

    # Local model
    model = nn.Linear(3, 4)
    optim = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.MSELoss()
    # Distributed model
    model: nn.Module = strategy.distribute_model(model)
    optim: torch.optim.Optimizer = strategy.distribute_optimizer(optim)

    # Data
    train_set = UniformRndDataset(x_size=3, y_size=4)
    train_loader = DataLoader(train_set, batch_size=10, num_workers=1)
    # Distributed dataloader
    train_loader: DataLoader = strategy.distribute_dataloader(train_loader)

    for epoch in range(2):
        for (x, y) in train_loader:
            # print(f"tensor to cuda:{strategy.device}")
            x = x.to(strategy.device)
            y = y.to(strategy.device)

            optim.zero_grad()
            y_pred = model(x)
            loss = loss_fn(y_pred, y)
            loss.backward()
            optim.step()

        if strategy.is_main_worker():
            print(f"Loss [epoch={epoch}]: {loss.item()}")

    strategy.teardown()
    return 123


STRATEGY = 'ddp'


if __name__ == "__main__":

    # Instantiate Strategy
    if STRATEGY == 'ddp':
        if (not torch.cuda.is_available()
                or not torch.cuda.device_count() > 1):
            raise RuntimeError('Resources unavailable')

        strategy = DDPStrategy(cluster=None, backend='nccl')
    elif STRATEGY == 'horovod':
        strategy = HorovodStrategy()
    else:
        raise NotImplementedError

    # Launch distributed training
    trainer_entrypoint_fn("foobar", strategy)
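
As the module docstring notes, this example is meant to be launched with torchrun, which sets the RANK, LOCAL_RANK, MASTER_ADDR and MASTER_PORT environment variables read above. A possible invocation, assuming the script is saved as train_example.py (the real filename is not shown in this diff) and the node has 4 GPUs:

# Single-node launch with one worker process per GPU (filename assumed).
torchrun --standalone --nnodes=1 --nproc_per_node=4 train_example.py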