diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index cfd09b5f..3fa4cea4 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -47,6 +47,9 @@ jobs: VALIDATE_CHECKOV: false # activate to lint k8s pods VALIDATE_SHELL_SHFMT: false VALIDATE_JSCPD: false + VALIDATE_MARKDOWN_PRETTIER: false + VALIDATE_YAML_PRETTIER: false + VALIDATE_PYTHON_PYINK: false # Only check new or edited files VALIDATE_ALL_CODEBASE: false diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 77183ef7..c054c65b 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -35,4 +35,5 @@ sphinx: python: install: # - wheel + - requirements: docs/pre-requirements.txt - requirements: docs/requirements.txt diff --git a/README.md b/README.md index 889de97d..16937e70 100644 --- a/README.md +++ b/README.md @@ -149,6 +149,17 @@ Otherwise, if you are on an HPC system, please refer to [this section](#activate-itwinai-environment-on-hpc) explaining how to load the required environment modules before the python environment. +To build a Docker image for the pytorch version (need to adapt `TAG`): + +```bash +# Local +docker buildx build -t itwinai:TAG -f env-files/torch/Dockerfile . + +# Ghcr.io +docker buildx build -t ghcr.io/intertwin-eu/itwinai:TAG -f env-files/torch/Dockerfile . +docker push ghcr.io/intertwin-eu/itwinai:TAG +``` + #### TensorFlow virtual environment Makefile targets for environment installation: @@ -174,6 +185,17 @@ Otherwise, if you are on an HPC system, please refer to [this section](#activate-itwinai-environment-on-hpc) explaining how to load the required environment modules before the python environment. +To build a Docker image for the tensorflow version (need to adapt `TAG`): + +```bash +# Local +docker buildx build -t itwinai:TAG -f env-files/tensorflow/Dockerfile . + +# Ghcr.io +docker buildx build -t ghcr.io/intertwin-eu/itwinai:TAG -f env-files/tensorflow/Dockerfile . 
+docker push ghcr.io/intertwin-eu/itwinai:TAG +``` + ### Activate itwinai environment on HPC Usually, HPC systems organize their software in modules which need to be imported by the users diff --git a/docs/README.md b/docs/README.md index a0359602..78f7ef59 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,11 +1,52 @@ # Read The Docs documentation page +The python dependencies are organized in two requirements files, which +must be installed in the following order: + +1. `pre-requirements.txt` contains torch and tensorflow. +1. `requirements.txt` contains the packages which depend on torch and tensorflow, +which should be installed *after* torch and tensorflow. + ## Build docs locally -TODO: explain +To build the docs locally and visualize them in your browser, without relying on external +services (e.g., Read The Docs cloud), use the following commands + +```bash +# Clone the repo, if not done yet +git clone https://github.com/interTwin-eu/itwinai.git itwinai-docs +cd itwinai-docs + +# The first time, you may need to install some Linux packages (assuming Ubuntu system here) +sudo apt update && sudo apt install libmysqlclient-dev +sudo apt install python3-sphinx + +# Create a python virtual environment and install itwinai and its dependencies +python3 -m venv .venv-docs +source .venv-docs/bin/activate +pip install -r docs/pre-requirements.txt +pip install -r docs/requirements.txt +pip install sphinx-rtd-theme + +# Move to the docs folder and build them using Sphinx +cd docs +make clean +make html + +# Serve a local HTTP server to navigate the newly created docs pages. +# You can see the docs visiting http://localhost:8000 in your browser. +python -m http.server --directory _build/html/ +``` ### Build docs on JSC +On JSC systems, the way of building the docs locally is similar to the method +explained above. However, the environment setup must be slightly adapted to use +some modules provided on the HPC system. 
+ +To manage the docs, you can simply use the Makefile targets +below. + From the repository's root, create the docs virtual environment: ```bash @@ -19,7 +60,7 @@ and serve them on localhost: make docs-jsc ``` -## RTD management page +## Read The Docs management page -To manage the documentation page visit +To manage the documentation on Read The Docs (RTD) cloud, visit [https://readthedocs.org/projects/itwinai](https://readthedocs.org/projects/itwinai/). diff --git a/docs/explain_advanced_workflow.rst b/docs/explain_advanced_workflow.rst deleted file mode 100644 index 49418c64..00000000 --- a/docs/explain_advanced_workflow.rst +++ /dev/null @@ -1,12 +0,0 @@ -.. _advanced_workflows: - -DAG Workflows -=================== - -Section :ref:`Workflows` explains how the Pipeline wrapper can drastically simply the implementation of your sequential ML workflows. -However, for non-sequential Directed Acyclic Graph - -.. image:: figures/Advanced_workflow.png - :alt: Diagram of an advanced DAC workflow - :align: center - diff --git a/docs/pre-requirements.txt b/docs/pre-requirements.txt new file mode 100644 index 00000000..c7d4c261 --- /dev/null +++ b/docs/pre-requirements.txt @@ -0,0 +1,5 @@ +wheel +tensorflow==2.16.* +torch==2.1.* +torchvision +torchaudio \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt index c1222595..6c75e09d 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,12 +1,7 @@ -Sphinx==7.2.6 sphinx-rtd-theme==2.0.0 nbsphinx==0.9.4 myst-parser==2.0.0 -wheel -tensorflow==2.16.* -torch==2.1.* -torchvision -torchaudio + git+https://github.com/thomas-bouvier/horovod.git@compile-cpp17 deepspeed IPython diff --git a/docs/tutorials/distrib-ml/torch-tutorial-containers.rst b/docs/tutorials/distrib-ml/torch-tutorial-containers.rst new file mode 100644 index 00000000..8b310a1d --- /dev/null +++ b/docs/tutorials/distrib-ml/torch-tutorial-containers.rst @@ -0,0 +1,55 @@ +itwinai and containers (Docker and Singularity) 
+=============================================== + +In this tutorial you will learn how to use itwinai's container images to run your ML workflows +without having to set up the python environment by means of virtual environments. + +.. include:: ../../../tutorials/distributed-ml/torch-tutorial-containers/README.md + :parser: myst_parser.sphinx_ + + +Shell scripts +-------------- + +run_docker.sh +++++++++++++++++ +.. literalinclude:: ../../../tutorials/distributed-ml/torch-tutorial-containers/run_docker.sh + :language: bash + +slurm.sh +++++++++++++ +.. literalinclude:: ../../../tutorials/distributed-ml/torch-tutorial-containers/slurm.sh + :language: bash + + +runall.sh +++++++++++++++++ +.. literalinclude:: ../../../tutorials/distributed-ml/torch-tutorial-containers/runall.sh + :language: bash + + +Pipeline configuration +----------------------- + +config.yaml +++++++++++++ + +.. literalinclude:: ../../../tutorials/distributed-ml/torch-tutorial-containers/config.yaml + :language: yaml + + +Python files +------------------ + +model.py +++++++++++++ + +.. literalinclude:: ../../../tutorials/distributed-ml/torch-tutorial-containers/model.py + :language: python + +dataloader.py ++++++++++++++++ +.. 
literalinclude:: ../../../tutorials/distributed-ml/torch-tutorial-containers/dataloader.py + :language: python + + diff --git a/docs/tutorials/tutorials.rst b/docs/tutorials/tutorials.rst index 6f436f49..2061040e 100644 --- a/docs/tutorials/tutorials.rst +++ b/docs/tutorials/tutorials.rst @@ -20,6 +20,7 @@ Distributed ML with PyTorch distrib-ml/torch_tutorial_2_trainer_class distrib-ml/torch-tutorial-GAN distrib-ml/torch_scaling_test + distrib-ml/torch-tutorial-containers Distributed ML with TensorFlow diff --git a/env-files/docs/create-docs-env-jsc.sh b/env-files/docs/create-docs-env-jsc.sh index 63fcee5b..ec872f15 100644 --- a/env-files/docs/create-docs-env-jsc.sh +++ b/env-files/docs/create-docs-env-jsc.sh @@ -12,4 +12,5 @@ gcc --version rm -rf .venv-docs python -m venv .venv-docs source .venv-docs/bin/activate +pip install -r docs/pre-requirements.txt pip install -r docs/requirements.txt \ No newline at end of file diff --git a/env-files/tensorflow/Dockerfile b/env-files/tensorflow/Dockerfile new file mode 100644 index 00000000..f2dfd551 --- /dev/null +++ b/env-files/tensorflow/Dockerfile @@ -0,0 +1,29 @@ +ARG IMG_TAG=24.08-tf2-py3 + +# 23.09-tf2-py3: tensorflow==2.13.0 +# 24.04-tf2-py3: tensorflow==2.15.0 +# 24.08-tf2-py3: tensorflow==2.16.1 + +FROM nvcr.io/nvidia/tensorflow:${IMG_TAG} + +WORKDIR /usr/src/app + +# Install itwinai +COPY pyproject.toml ./ +COPY src ./ +COPY env-files/tensorflow/create_container_env.sh ./ +RUN bash create_container_env.sh + +# Create non-root user +RUN groupadd -g 10001 jovyan \ + && useradd -m -u 10000 -g jovyan jovyan \ + && chown -R jovyan:jovyan /usr/src/app +USER jovyan:jovyan + +# ENTRYPOINT [ "/bin/sh" ] +# CMD [ ] + +LABEL org.opencontainers.image.source=https://github.com/interTwin-eu/itwinai +LABEL org.opencontainers.image.description="Base itwinai image with tensorflow dependencies and CUDA drivers" +LABEL org.opencontainers.image.licenses=MIT +LABEL maintainer="Matteo Bunino - matteo.bunino@cern.ch" \ No newline at 
# Base itwinai image with torch dependencies, built on top of an NVIDIA NGC
# PyTorch image. IMG_TAG selects the NGC release and is also forwarded to
# create_container_env.sh as its first argument.
ARG IMG_TAG=23.09-py3

# Torch version shipped by each NGC tag:
# 23.09-py3: torch==2.1.0
# 24.04-py3: torch==2.3.0
# NOTE(review): the create_container_env.sh added alongside this file only
# handles 23.09-py3 and exits with an error for any other tag.

FROM nvcr.io/nvidia/pytorch:${IMG_TAG}

# An ARG declared before FROM is not visible after it, so it is re-declared
# here to make IMG_TAG usable in the RUN instruction below.
# https://stackoverflow.com/a/56748289
ARG IMG_TAG

WORKDIR /usr/src/app

# Build mpi4py with the local distutils shim; see
# https://github.com/mpi4py/mpi4py/pull/431
RUN env SETUPTOOLS_USE_DISTUTILS=local python -m pip install --no-cache-dir mpi4py

# Install itwinai
# NOTE(review): when the source is a directory, COPY copies the directory's
# *contents*, so `COPY src ./` places what is inside src/ directly in the
# workdir instead of under ./src -- confirm this matches the package layout
# that pyproject.toml expects.
COPY pyproject.toml ./
COPY src ./
COPY env-files/torch/create_container_env.sh ./
RUN bash create_container_env.sh ${IMG_TAG}

# Create non-root user (uid 10000, gid 10001) owning the application folder,
# and make it the default user of the image.
RUN groupadd -g 10001 jovyan \
    && useradd -m -u 10000 -g jovyan jovyan \
    && chown -R jovyan:jovyan /usr/src/app
USER jovyan:jovyan

# OCI metadata labels
LABEL org.opencontainers.image.source=https://github.com/interTwin-eu/itwinai
LABEL org.opencontainers.image.description="Base itwinai image with torch dependencies and CUDA drivers"
LABEL org.opencontainers.image.licenses=MIT
LABEL maintainer="Matteo Bunino - matteo.bunino@cern.ch"
-ne 0 ]; then + echo "DeepSpeed installation FAILED" + exit 2 + fi + + # fix .triton/autotune/Fp16Matmul_2d_kernel.pickle bug + pver="$(python --version 2>&1 | awk '{print $2}' | cut -f1-2 -d.)" + line=$(cat -n /usr/lib/python${pver}/site-packages/deepspeed/ops/transformer/inference/triton/matmul_ext.py | grep os.rename | awk '{print $1}' | head -n 1) + sed -i "${line}s|^|#|" /usr/lib/python${pver}/site-packages/deepspeed/ops/transformer/inference/triton/matmul_ext.py + + # Horovod + # compiler vars + export LDSHARED="$CC -shared" && + export CMAKE_CXX_STANDARD=17 + + # CPU vars + export HOROVOD_MPI_THREADS_DISABLE=1 + export HOROVOD_CPU_OPERATIONS=MPI + + # GPU vars + export HOROVOD_GPU_ALLREDUCE=NCCL + export HOROVOD_NCCL_LINK=SHARED + export HOROVOD_NCCL_HOME=$EBROOTNCCL + + # Host language vars + export HOROVOD_WITH_PYTORCH=1 + export HOROVOD_WITHOUT_TENSORFLOW=1 + export HOROVOD_WITHOUT_MXNET=1 + + # Fix needed to compile horovod with torch >= 2.1 + # https://github.com/horovod/horovod/pull/3998 + # Assume that Horovod env vars are already in the current env! + pip3 install --no-cache-dir git+https://github.com/thomas-bouvier/horovod.git@compile-cpp17 + + if [ $? -ne 0 ]; then + echo "Horovod installation FAILED" + exit 2 + fi + + # Install itwinai + # $(python -c 'import torch;print(torch.__version__)') serves to enforce that the current version of + # torch in the container is preserved, otherwise, if updated, Horovod will complain. + pip install .[torch] torch==$(python -c 'import torch;print(torch.__version__)') --no-cache-dir + +else + echo "ERROR: unrecognized tag." 
class _EmptyLogger(Logger):
    """No-op stand-in for a real logger.

    It accepts the same constructor arguments as the base ``Logger`` and
    forwards them to it, but every logging hook returns immediately without
    doing anything.
    """

    def __init__(
        self,
        savedir: str = 'mllogs',
        log_freq: int | Literal['epoch'] | Literal['batch'] = 'epoch',
        log_on_workers: int | List[int] = 0
    ) -> None:
        # Arguments are only handed to the parent class; nothing is stored
        # or created here.
        super().__init__(savedir, log_freq, log_on_workers)

    def create_logger_context(self, rank: Optional[int] = None):
        """Do nothing: there is no logger context to set up."""
        return None

    def destroy_logger_context(self):
        """Do nothing: there is no logger context to tear down."""
        return None

    def save_hyperparameters(self, params: Dict[str, Any]) -> None:
        """Ignore the given hyper-parameters."""
        return None

    def log(
        self,
        item: Union[Any, List[Any]],
        identifier: Union[str, List[str]],
        kind: str = 'metric',
        step: Optional[int] = None,
        batch_idx: Optional[int] = None,
        **kwargs
    ) -> None:
        """Discard the item instead of logging it."""
        return None
prov_user_namespace="www.example.org", - experiment_name="experiment_name", - provenance_save_dir="prov", - save_after_n_logs=100, - collect_all_processes=False - ) - - -@patch('prov4ml.end_run') -def test_destroy_logger_context(mock_end_run, prov4ml_logger): - prov4ml_logger.destroy_logger_context() - mock_end_run.assert_called_once_with(create_graph=True, create_svg=True) - - -@patch('prov4ml.log_metric') -def test_log_metric(mock_log_metric, prov4ml_logger): - item = 1.0 - identifier = "test_metric" - context = MagicMock(spec=Context) - prov4ml_logger.log( - item, identifier, kind='metric', step=0, context=context) - mock_log_metric.assert_called_once_with(identifier, item, context, step=0) - - -@patch('prov4ml.log_flops_per_batch') -def test_log_flops_per_batch(mock_log_flops_per_batch, prov4ml_logger): - item = (MagicMock(), MagicMock()) - identifier = "test_flops" - context = MagicMock(spec=Context) - prov4ml_logger.log( - item, identifier, kind='flops_pb', step=0, - context=context) - mock_log_flops_per_batch.assert_called_once_with( - identifier, model=item[0], batch=item[1], context=context, step=0) +# @patch('prov4ml.start_run') +# def test_create_logger_context(mock_start_run, prov4ml_logger): +# prov4ml_logger.create_logger_context() +# mock_start_run.assert_called_once_with( +# prov_user_namespace="www.example.org", +# experiment_name="experiment_name", +# provenance_save_dir="prov", +# save_after_n_logs=100, +# collect_all_processes=False +# ) + + +# @patch('prov4ml.end_run') +# def test_destroy_logger_context(mock_end_run, prov4ml_logger): +# prov4ml_logger.destroy_logger_context() +# mock_end_run.assert_called_once_with(create_graph=True, create_svg=True) + + +# @patch('prov4ml.log_metric') +# def test_log_metric(mock_log_metric, prov4ml_logger): +# item = 1.0 +# identifier = "test_metric" +# context = MagicMock(spec=Context) +# prov4ml_logger.log( +# item, identifier, kind='metric', step=0, context=context) +# 
mock_log_metric.assert_called_once_with(identifier, item, context, step=0) + + +# @patch('prov4ml.log_flops_per_batch') +# def test_log_flops_per_batch(mock_log_flops_per_batch, prov4ml_logger): +# item = (MagicMock(), MagicMock()) +# identifier = "test_flops" +# context = MagicMock(spec=Context) +# prov4ml_logger.log( +# item, identifier, kind='flops_pb', step=0, +# context=context) +# mock_log_flops_per_batch.assert_called_once_with( +# identifier, model=item[0], batch=item[1], context=context, step=0) diff --git a/tutorials/distributed-ml/torch-tutorial-containers/README.md b/tutorials/distributed-ml/torch-tutorial-containers/README.md new file mode 100644 index 00000000..9400f607 --- /dev/null +++ b/tutorials/distributed-ml/torch-tutorial-containers/README.md @@ -0,0 +1,83 @@ +# Using itwinai container to run training on MNIST + +In this tutorial we show how to use containers to run machine learning workflows with itwinai. + +Container images are pulled from the GHCR associated with our GH repository and are Docker +images. Although Docker is generally not supported in HPC environments, Singularity supports +Docker images and is able to convert them to Singularity images (SIF files) upon pull. + +The examples will showcase training of a simple neural network defined in `model.py` on the +MNIST benchmark dataset. The ML workflow is defined using itwinai `Pipeline`, the training +algorithm is implemented by the itwinai `TorchTrainer`, and the training parameters are +defined in `config.yaml`. + +In this tutorial we are using general purpose itwinai container images to execute a use case code. +This is possible when the use case does not depend on additional packages not included in the container +image. If you want to add dependencies, you need to create a new container image using itwinai as +base image. 
A minimal example of a custom Dockerfile: + +```dockerfile +FROM ghcr.io/intertwin-eu/itwinai:0.2.2-torch-2.1 +RUN pip install --no-cache-dir PYTHON_PACKAGE +``` + +## Docker (non-HPC environments) + +When executing a Docker container, you need to explicitly mount the current working directory +in the container, making it possible for the script executed in the container to use existing +files and create new files in the current directory (or in another location). This can be achieved +by bind mounting the current working directory in some location in the container, and moving to +that location in the container before executing the desired command. + +```bash +bash run_docker.sh +``` + +The script above runs the following command in the itwinai torch container +in this folder: + +```bash +itwinai exec-pipeline --config config.yaml --pipe-key training_pipeline +``` + +## Singularity (HPC environments) + +With Singularity there is no need to explicitly bind mount the current working directory (CWD) in the container +as this is already done automatically by Singularity. Moreover, the CWD inside the container *coincides* +with the CWD outside the container, so there is no need to change directory before executing the command inside +the container. However, unlike Docker, Singularity does +not automatically allow writing to locations inside the container. It is therefore suggested to save +results in the CWD, or in other locations mounted in the container. 
+ +First of all, pull the Docker image and convert it to a Singularity image: + +```bash +# If needed, remove existing Singularity image before proceeding +rm -rf itwinai_torch.sif + +# Pull Docker image and convert it to Singularity on login node +singularity pull itwinai_torch.sif docker://ghcr.io/intertwin-eu/itwinai:0.2.2-torch-2.1 +``` + +Before running distributed ML on the compute nodes of an HPC cluster, make sure to download +the dataset, as there is usually no internet connection on compute nodes: + +```bash +# Run only the first step on the HPC login node, which downloads the datasets if not present +singularity run itwinai_torch.sif /bin/bash -c \ + "itwinai exec-pipeline --config config.yaml --pipe-key training_pipeline --steps dataloading_step" +``` + +Now run distributed ML on multiple compute nodes using both Torch DDP and Microsoft DeepSpeed: + +```bash +# Run one distributed ML job (torch DDP is the default one) +sbatch slurm.sh + +# Alternatively, run all distributed jobs +bash runall.sh +``` + +> [!NOTE] +> Please note that at the moment Horovod distributed training using containerized environments +> is not supported. 
diff --git a/tutorials/distributed-ml/torch-tutorial-containers/config.yaml b/tutorials/distributed-ml/torch-tutorial-containers/config.yaml new file mode 100644 index 00000000..c23c1c3a --- /dev/null +++ b/tutorials/distributed-ml/torch-tutorial-containers/config.yaml @@ -0,0 +1,71 @@ +# General config +dataset_root: .tmp/ +num_classes: 10 +batch_size: 64 +num_workers_dataloader: 4 +pin_memory: False +lr: 0.001 +momentum: 0.9 +fp16_allreduce: False +use_adasum: False +gradient_predivide_factor: 1.0 +epochs: 2 +strategy: ddp +test_data_path: mnist-sample-data +inference_model_mlflow_uri: mnist-pre-trained.pth +predictions_dir: mnist-predictions +predictions_file: predictions.csv +class_labels: null + +# Workflows configuration +training_pipeline: + class_path: itwinai.pipeline.Pipeline + init_args: + steps: + dataloading_step: + class_path: dataloader.MNISTDataModuleTorch + init_args: + save_path: ${dataset_root} + + training_step: + class_path: itwinai.torch.trainer.TorchTrainer + init_args: + config: + batch_size: ${batch_size} + num_workers: ${num_workers_dataloader} + pin_memory: ${pin_memory} + lr: ${lr} + momentum: ${momentum} + fp16_allreduce: ${fp16_allreduce} + use_adasum: ${use_adasum} + gradient_predivide_factor: ${gradient_predivide_factor} + + model: + class_path: model.Net + epochs: ${epochs} + metrics: + accuracy: + class_path: torchmetrics.classification.MulticlassAccuracy + init_args: + num_classes: ${num_classes} + precision: + class_path: torchmetrics.classification.MulticlassPrecision + init_args: + num_classes: ${num_classes} + recall: + class_path: torchmetrics.classification.MulticlassRecall + init_args: + num_classes: ${num_classes} + logger: + class_path: itwinai.loggers.LoggersCollection + init_args: + loggers: + - class_path: itwinai.loggers.ConsoleLogger + init_args: + log_freq: 10000 + - class_path: itwinai.loggers.MLFlowLogger + init_args: + experiment_name: MNIST classifier + log_freq: batch + strategy: ${strategy} + # 
class MNISTDataModuleTorch(DataGetter):
    """Download the MNIST dataset for torch.

    Args:
        save_path (str, optional): folder where torchvision stores (and
            looks for) the MNIST files. Defaults to '.tmp/'.
    """

    def __init__(self, save_path: str = '.tmp/',) -> None:
        super().__init__()
        self.save_parameters(**self.locals2params(locals()))
        self.save_path = save_path

    @monitor_exec
    def execute(self) -> Tuple[Dataset, Dataset, None]:
        """Load the MNIST train and validation splits, downloading them
        into ``save_path`` when they are not already there.

        Returns:
            Tuple[Dataset, Dataset, None]: train dataset, validation
            dataset, and ``None`` as third element (no third dataset is
            produced by this component).
        """
        # Same preprocessing for both splits: build the pipeline once
        # instead of duplicating it.
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ])
        train_dataset = datasets.MNIST(
            self.save_path, train=True, download=True, transform=transform)
        validation_dataset = datasets.MNIST(
            self.save_path, train=False, download=True, transform=transform)
        print("Train and validation datasets loaded.")
        return train_dataset, validation_dataset, None
class Net(nn.Module):
    """Small convolutional classifier producing 10 class scores.

    The flattening step assumes 320 features per sample, which is what two
    5x5 convolutions, each followed by 2x2 max-pooling, yield for 1x28x28
    inputs (20 channels * 4 * 4).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        """Compute per-class log-probabilities for a batch of images.

        Args:
            x: batch of single-channel images; after the convolutional
                stack each sample must flatten to 320 values.

        Returns:
            Tensor of shape (batch, 10) whose rows are log-probabilities
            over the 10 classes.
        """
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        # Functional dropout must be told explicitly whether we are training
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        # Normalise over the class axis (dim=1). Using dim=0 would normalise
        # each class column across the samples of the batch, making the
        # output of one sample depend on the others.
        return F.log_softmax(x, dim=1)
--export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh + +# DeepSpeed itwinai +DIST_MODE="deepspeed" +RUN_NAME="deepspeed-itwinai" +TRAINING_CMD='/usr/local/bin/itwinai exec-pipeline --config config.yaml --pipe-key training_pipeline -o strategy=deepspeed' +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh + +# # Horovod itwinai +# DIST_MODE="horovod" +# RUN_NAME="horovod-itwinai" +# TRAINING_CMD='/usr/local/bin/itwinai exec-pipeline --config config.yaml --pipe-key training_pipeline -o strategy=horovod' +# sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD" \ +# --job-name="$RUN_NAME-n$N" \ +# --output="logs_slurm/job-$RUN_NAME-n$N.out" \ +# --error="logs_slurm/job-$RUN_NAME-n$N.err" \ +# slurm.sh \ No newline at end of file diff --git a/tutorials/distributed-ml/torch-tutorial-containers/slurm.sh b/tutorials/distributed-ml/torch-tutorial-containers/slurm.sh new file mode 100644 index 00000000..2d094a7c --- /dev/null +++ b/tutorials/distributed-ml/torch-tutorial-containers/slurm.sh @@ -0,0 +1,122 @@ +#!/bin/bash + +# SLURM jobscript for JSC systems + +# Job configuration +#SBATCH --job-name=distributed_training +#SBATCH --account=intertwin +#SBATCH --mail-user= +#SBATCH --mail-type=ALL +#SBATCH --output=job.out +#SBATCH --error=job.err +#SBATCH --time=00:30:00 + +# Resources allocation +#SBATCH --partition=batch +#SBATCH --nodes=2 +#SBATCH --gpus-per-node=4 +#SBATCH --cpus-per-gpu=4 +#SBATCH --exclusive + +# gres options have to be disabled for deepv +#SBATCH --gres=gpu:4 + +# Load environment modules +# ml Stages/2024 GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF 
libaio mpi4py +ml --force purge + +# Job info +echo "DEBUG: TIME: $(date)" +sysN="$(uname -n | cut -f2- -d.)" +sysN="${sysN%%[0-9]*}" +echo "Running on system: $sysN" +echo "DEBUG: EXECUTE: $EXEC" +echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" +echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" +echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" +echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" +echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" +echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" +echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" +echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" +echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" +if [ "$DEBUG" = true ] ; then + echo "DEBUG: NCCL_DEBUG=INFO" + export NCCL_DEBUG=INFO +fi +echo + +# Setup env for distributed ML +export CUDA_VISIBLE_DEVICES="0,1,2,3" +export OMP_NUM_THREADS=1 +if [ "$SLURM_CPUS_PER_GPU" -gt 0 ] ; then + export OMP_NUM_THREADS=$SLURM_CPUS_PER_GPU +fi + +# Env variables check +if [ -z "$DIST_MODE" ]; then + >&2 echo "WARNING: env variable DIST_MODE is not set. Allowed values are 'horovod', 'ddp' or 'deepspeed'. Using 'ddp'." + DIST_MODE='ddp' +fi +if [ -z "$RUN_NAME" ]; then + >&2 echo "WARNING: env variable RUN_NAME is not set. It's a way to identify some specific run of an experiment." + RUN_NAME=$DIST_MODE +fi +if [ -z "$TRAINING_CMD" ]; then + >&2 echo "WARNING: env variable TRAINING_CMD is not set. It's the python command to execute."
+ TRAINING_CMD='$(/usr/bin/which itwinai) exec-pipeline --config config.yaml --pipe-key training_pipeline -o strategy=ddp' + >&2 echo "setting TRAINING_CMD=$TRAINING_CMD" +fi + +# Get GPUs info per node +# srun --cpu-bind=none --ntasks-per-node=1 bash -c 'echo -e "NODE hostname: $(hostname)\n$(nvidia-smi)\n\n"' + +# Launch training +if [ "$DIST_MODE" == "ddp" ] ; then + echo "DDP training: $TRAINING_CMD" + srun --cpu-bind=none --ntasks-per-node=1 \ + singularity run --nv itwinai_torch.sif /bin/bash -c \ + "torchrun \ + --log_dir='logs_torchrun' \ + --nnodes=$SLURM_NNODES \ + --nproc_per_node=$SLURM_GPUS_PER_NODE \ + --rdzv_id=$SLURM_JOB_ID \ + --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ + --rdzv_backend=c10d \ + --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ + $TRAINING_CMD" +elif [ "$DIST_MODE" == "deepspeed" ] ; then + echo "DEEPSPEED training: $TRAINING_CMD" + srun --cpu-bind=none --ntasks-per-node=1 \ + singularity run --nv itwinai_torch.sif /bin/bash -c \ + "torchrun \ + --log_dir='logs_torchrun' \ + --nnodes=$SLURM_NNODES \ + --nproc_per_node=$SLURM_GPUS_PER_NODE \ + --rdzv_id=$SLURM_JOB_ID \ + --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ + --rdzv_backend=c10d \ + --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ + $TRAINING_CMD" + + # MASTER_ADDR=$(scontrol show hostnames "\$SLURM_JOB_NODELIST" | head -n 1)i + # export MASTER_ADDR + # export MASTER_PORT=29500 + + # srun --cpu-bind=none --ntasks-per-node=$SLURM_GPUS_PER_NODE --cpus-per-task=$SLURM_CPUS_PER_GPU \ + # --mpi=pmi2 singularity run --nv \ + # --env MASTER_ADDR=$MASTER_ADDR,MASTER_PORT=$MASTER_PORT \ + # itwinai_torch.sif /bin/bash -c "$TRAINING_CMD" + +elif [ "$DIST_MODE" == "horovod" ] ; then + echo "Horovod is not currently supported in conjuction with containers" + exit 2 + + # echo "HOROVOD training: $TRAINING_CMD" + # srun --cpu-bind=none --ntasks-per-node=$SLURM_GPUS_PER_NODE 
--cpus-per-task=$SLURM_CPUS_PER_GPU \ + # --mpi=pmix singularity run --nv itwinai_torch.sif \ + # /bin/bash -c "$TRAINING_CMD" +else + >&2 echo "ERROR: unrecognized \$DIST_MODE env variable" + exit 1 +fi diff --git a/use-cases/3dgan/config.yaml b/use-cases/3dgan/config.yaml index 7d786e42..fab6312f 100644 --- a/use-cases/3dgan/config.yaml +++ b/use-cases/3dgan/config.yaml @@ -1,6 +1,6 @@ # Main configurations dataset_location: exp_data/ -dataset_url: https://drive.google.com/drive/folders/1uPpz0tquokepptIfJenTzGpiENfo2xRX +dataset_url: https://drive.google.com/drive/folders/1ooUIfkhpokvwh4-7qPxX084N7-LgqnIL hw_accelerators: auto distributed_strategy: ddp_find_unused_parameters_true #deepspeed auto horovod devices: auto #[0] @@ -11,7 +11,7 @@ batch_size: 4 train_dataset_proportion: 0.7 num_workers_dataloader: 0 max_epochs: 2 -max_dataset_size: 10000 +max_dataset_size: 48 random_seed: 4231162351 inference_results_location: 3dgan-generated-data/ inference_model_uri: 3dgan-inference.pth @@ -36,26 +36,28 @@ training_pipeline: training_step: class_path: trainer.Lightning3DGANTrainer init_args: - itwinai_logger: - class_path: itwinai.loggers.LoggersCollection - init_args: - loggers: - # - class_path: itwinai.loggers.ConsoleLogger - # init_args: - # log_freq: 100 - # - class_path: itwinai.loggers.MLFlowLogger - # init_args: - # experiment_name: 3DGAN - # log_freq: batch - - class_path: itwinai.loggers.Prov4MLLogger - init_args: - provenance_save_dir: mllogs/prov_logs - experiment_name: 3DGAN - log_freq: batch - log_on_workers: -1 - # - class_path: itwinai.loggers.WandBLogger - # init_args: - # log_freq: batch + # NOTE: before pushing to the repo, disable logging to prevent slowing down unit tests + # itwinai_logger: + # class_path: itwinai.loggers.LoggersCollection + # init_args: + # loggers: + # # - class_path: itwinai.loggers.ConsoleLogger + # # init_args: + # # log_freq: 100 + # # - class_path: itwinai.loggers.MLFlowLogger + # # init_args: + # # experiment_name: 3DGAN + 
# # log_freq: batch + # - class_path: itwinai.loggers.Prov4MLLogger + # init_args: + # provenance_save_dir: mllogs/prov_logs + # experiment_name: 3DGAN + # log_freq: batch + # log_on_workers: -1 + # # - class_path: itwinai.loggers.WandBLogger + # # init_args: + # # log_freq: batch + # Pytorch lightning config for training config: seed_everything: ${random_seed} diff --git a/use-cases/3dgan/downsample_h5py_file.py b/use-cases/3dgan/downsample_h5py_file.py new file mode 100644 index 00000000..7fa37289 --- /dev/null +++ b/use-cases/3dgan/downsample_h5py_file.py @@ -0,0 +1,27 @@ +""" +Downsample H5 files to a more manageable size. +""" + +import h5py + +IN_FILENAME = 'large_file.h5' +OUT_FILENAME = 'sample.h5' +MAXITEMS = 100 + +with h5py.File(IN_FILENAME, 'r') as input_file: + with h5py.File(OUT_FILENAME, 'w') as outfile: + for key in input_file.keys(): + print(input_file[key]) + shape = list(input_file[key].shape) + shape[0] = min(MAXITEMS, shape[0])  # source may hold fewer than MAXITEMS rows + outfile.create_dataset_like( + name=key, + other=input_file[key], + shape=tuple(shape) + ) + print(outfile[key]) + outfile[key][...] = input_file[key][:MAXITEMS] + + print("verify similarities") + print(input_file['energy'][:10]) + print(outfile['energy'][:10]) diff --git a/use-cases/3dgan/trainer.py b/use-cases/3dgan/trainer.py index a2f5fba1..43c25e55 100644 --- a/use-cases/3dgan/trainer.py +++ b/use-cases/3dgan/trainer.py @@ -19,7 +19,7 @@ # init_lightning_mlflow, # teardown_lightning_mlflow # ) -from itwinai.loggers import Logger +from itwinai.loggers import Logger, _EmptyLogger from model import ThreeDGAN @@ -38,7 +38,9 @@ def __init__( # Load from YAML config = load_yaml(config) self.conf = config - self.itwinai_logger = itwinai_logger + self.itwinai_logger = ( + itwinai_logger if itwinai_logger else _EmptyLogger() + ) @monitor_exec def execute(self) -> Any: