diff --git a/README.md b/README.md index a66de09a..55c8484a 100644 --- a/README.md +++ b/README.md @@ -96,11 +96,25 @@ adding the `dev` extra: pip install -e .[dev] ``` -To **run tests** on itwinai package: +#### Test with `pytest` + +To run tests on itwinai package: ```bash # Activate env micromamba activate ./.venv-pytorch # or ./.venv-tf -pytest -v tests/ +pytest -v -m "not slurm" tests/ +``` + +However, some tests are intended to be executed only on an HPC system, +where SLURM is available. They are marked with "slurm" tag. To run also +those tests, use the dedicated job script: + +```bash +sbatch tests/slurm_tests_startscript + +# Upon completion, check the output: +cat job.err +cat job.out ``` diff --git a/tests/slurm_tests_startscript b/tests/slurm_tests_startscript new file mode 100644 index 00000000..f7540fab --- /dev/null +++ b/tests/slurm_tests_startscript @@ -0,0 +1,32 @@ +#!/bin/bash + +# general configuration of the job +#SBATCH --job-name=PrototypeTest +#SBATCH --account=intertwin +#SBATCH --mail-user= +#SBATCH --mail-type=ALL +#SBATCH --output=job.out +#SBATCH --error=job.err +#SBATCH --time=00:30:00 + +# configure node and process count on the CM +#SBATCH --partition=batch +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=4 +#SBATCH --gpus-per-node=4 + +# SBATCH --exclusive + +# gres options have to be disabled for deepv +#SBATCH --gres=gpu:4 + +# load modules +ml --force purge +ml Stages/2023 StdEnv/2023 NVHPC/23.1 OpenMPI/4.1.4 cuDNN/8.6.0.163-CUDA-11.7 Python/3.10.4 HDF5 libaio/0.3.112 GCC/11.3.0 + +# shellcheck source=/dev/null +source ~/.bashrc + +# from repo's root dir +srun micromamba run -p ./.venv-pytorch pytest -v -m slurm tests/ \ No newline at end of file diff --git a/tests/torch/distribtued_decorator.py b/tests/torch/distribtued_decorator.py index 7af056c6..fd086154 100644 --- a/tests/torch/distribtued_decorator.py +++ b/tests/torch/distribtued_decorator.py @@ -15,7 +15,7 @@ import torch.optim as optim from torch.optim.lr_scheduler import StepLR -from itwinai.backend.torch.trainer import distributed +from itwinai.torch.trainer import distributed class Net(nn.Module): diff --git a/tests/torch/test_distribtued_training.py b/tests/torch/test_distribtued_training.py index 9eeaac2e..829fa9a2 100644 --- a/tests/torch/test_distribtued_training.py +++ b/tests/torch/test_distribtued_training.py @@ -7,18 +7,19 @@ @pytest.mark.slurm def test_distributed_decorator(): """Test function decorator. Needs torchrun cmd.""" - cmd = ("micromamba run -p ./ai/.venv-pytorch " + cmd = ("micromamba run -p ./.venv-pytorch " "torchrun --nnodes=1 --nproc_per_node=2 --rdzv_id=100 " "--rdzv_backend=c10d --rdzv_endpoint=localhost:29400 " - "tests/backend/torch/distribtued_decorator.py") + "tests/torch/distribtued_decorator.py") subprocess.run(cmd.split(), check=True) +@pytest.mark.skip(reason="TorchTrainer not implemented yet") @pytest.mark.slurm def test_distributed_trainer(): """Test vanilla torch distributed trainer. Needs torchrun cmd.""" - cmd = ("micromamba run -p ./ai/.venv-pytorch " + cmd = ("micromamba run -p ./.venv-pytorch " "torchrun --nnodes=1 --nproc_per_node=2 --rdzv_id=100 " "--rdzv_backend=c10d --rdzv_endpoint=localhost:29400 " - "tests/backend/torch/torch_dist_trainer.py") + "tests/torch/torch_dist_trainer.py") subprocess.run(cmd.split(), check=True) diff --git a/tests/torch/torch_dist_trainer.py b/tests/torch/torch_dist_trainer.py index e2ba90af..63871f4a 100644 --- a/tests/torch/torch_dist_trainer.py +++ b/tests/torch/torch_dist_trainer.py @@ -11,7 +11,7 @@ from torch.utils.data import DataLoader from torchvision import transforms, datasets -from itwinai.backend.torch.trainer import TorchTrainer +from itwinai.torch.trainer import TorchTrainer class Net(nn.Module): diff --git a/tests/torch/torch_dist_trainer2.py b/tests/torch/torch_dist_trainer2.py deleted file mode 100644 index bfd0d3a3..00000000 --- a/tests/torch/torch_dist_trainer2.py +++ /dev/null @@ -1,61 +0,0 @@ -""" -Test Trainer class. To run this script, use the following command: - ->>> torchrun --nnodes=1 --nproc_per_node=2 --rdzv_id=100 --rdzv_backend=c10d \ - --rdzv_endpoint=localhost:29400 test_trainer.py - -""" - -from torch import nn -import torch.nn.functional as F -from torch.utils.data import DataLoader -from torchvision import transforms, datasets - -from itwinai.backend.torch.trainer import TorchTrainer2 - - -class Net(nn.Module): - - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(1, 10, kernel_size=5) - self.conv2 = nn.Conv2d(10, 20, kernel_size=5) - self.conv2_drop = nn.Dropout2d() - self.fc1 = nn.Linear(320, 50) - self.fc2 = nn.Linear(50, 10) - - def forward(self, x): - x = F.relu(F.max_pool2d(self.conv1(x), 2)) - x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) - x = x.view(-1, 320) - x = F.relu(self.fc1(x)) - x = F.dropout(x, training=self.training) - x = self.fc2(x) - return F.log_softmax(x, dim=0) - - -if __name__ == '__main__': - train_set = datasets.MNIST( - '.tmp/', train=True, download=True, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - val_set = datasets.MNIST( - '.tmp/', train=False, download=True, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - trainer = TorchTrainer2( - model=Net(), - train_dataloader=DataLoader(train_set, batch_size=32, pin_memory=True), - validation_dataloader=DataLoader( - val_set, batch_size=32, pin_memory=True), - strategy='ddp', - backend='nccl', - loss='NLLLoss', - epochs=20, - checkpoint_every=1 - ) - trainer.execute()