From 0cae49b6cf27e339dee57da7c00e902fe8ef911d Mon Sep 17 00:00:00 2001 From: Matteo Bunino Date: Thu, 21 Nov 2024 15:02:09 +0100 Subject: [PATCH] Update deps --- README.md | 8 ++++---- env-files/tensorflow/Dockerfile | 3 +-- env-files/torch/Dockerfile | 18 ++++++++--------- env-files/torch/createEnvVega.sh | 2 +- env-files/torch/jupyter/Dockerfile | 21 ++++++++++---------- env-files/torch/slim.Dockerfile | 20 +++++++++---------- pyproject.toml | 31 ++++++++++++++---------------- tests/torch/slurm.vega.sh | 2 +- 8 files changed, 50 insertions(+), 55 deletions(-) diff --git a/README.md b/README.md index 265d03c9..9eb0cb92 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ environment for PyTorch: ```bash ml --force purge - ml Python CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3 + ml Python/3.11.5-GCCcore-13.2.0 CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3 ml GCCcore/11.3.0 NCCL cuDNN/8.9.7.29-CUDA-12.3.0 UCX-CUDA/1.15.0-GCCcore-13.2.0-CUDA-12.3.0 ``` @@ -80,7 +80,7 @@ environment for TensorFlow: ```bash ml --force purge - ml Python CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3 + ml Python/3.11.5-GCCcore-13.2.0 CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3 ml GCCcore/11.3.0 NCCL cuDNN/8.9.7.29-CUDA-12.3.0 UCX-CUDA/1.15.0-GCCcore-13.2.0-CUDA-12.3.0 ``` @@ -227,7 +227,7 @@ Commands to be executed before activating the python virtual environment: ```bash ml --force purge - ml Python CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3 + ml Python/3.11.5-GCCcore-13.2.0 CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3 ml GCCcore/11.3.0 NCCL cuDNN/8.9.7.29-CUDA-12.3.0 UCX-CUDA/1.15.0-GCCcore-13.2.0-CUDA-12.3.0 ``` @@ -261,7 +261,7 @@ Commands to be executed before activating the python virtual environment: ```bash ml --force purge - ml Python CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3 + ml Python/3.11.5-GCCcore-13.2.0 CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3 ml GCCcore/11.3.0 NCCL cuDNN/8.9.7.29-CUDA-12.3.0 UCX-CUDA/1.15.0-GCCcore-13.2.0-CUDA-12.3.0 ``` diff --git a/env-files/tensorflow/Dockerfile b/env-files/tensorflow/Dockerfile index cc5f7598..1872a18b 100644 --- a/env-files/tensorflow/Dockerfile +++ b/env-files/tensorflow/Dockerfile @@ -33,7 +33,6 @@ RUN apt-get update && apt-get install -y \ RUN pip install --no-cache-dir --upgrade pip \ && pip install --no-cache-dir \ tf_keras==2.16.* \ - "prov4ml[nvidia]@git+https://github.com/matbun/ProvML@new-main" \ ray[tune] # Inprove robustness: avoid silent override by Singularity/Apptainer @@ -42,7 +41,7 @@ ENV PYTHONPATH="" # Install itwinai COPY pyproject.toml pyproject.toml COPY src src -RUN pip install --no-cache-dir . \ +RUN pip install --no-cache-dir .[nvidia] \ && itwinai sanity-check --tensorflow --optional-deps ray # Additional pip deps diff --git a/env-files/torch/Dockerfile b/env-files/torch/Dockerfile index a86233e9..d027a8f1 100644 --- a/env-files/torch/Dockerfile +++ b/env-files/torch/Dockerfile @@ -32,7 +32,12 @@ RUN apt-get update && apt-get install -y \ RUN pip install --no-cache-dir --upgrade pip \ && env SETUPTOOLS_USE_DISTUTILS=local python -m pip install --no-cache-dir mpi4py -# DeepSpeed, Horovod and other deps +# Install itwinai with torch +COPY pyproject.toml pyproject.toml +COPY src src +RUN pip install --no-cache-dir .[torch,nvidia,dev] --extra-index-url https://download.pytorch.org/whl/cu124 + +# Install DeepSpeed, Horovod and Ray ENV HOROVOD_WITH_PYTORCH=1 \ HOROVOD_WITHOUT_TENSORFLOW=1 \ HOROVOD_WITHOUT_MXNET=1 \ @@ -56,21 +61,14 @@ RUN CONTAINER_TORCH_VERSION="$(python -c 'import torch;print(torch.__version__)' && pip install --no-cache-dir torch=="$CONTAINER_TORCH_VERSION" \ deepspeed==0.15.* \ git+https://github.com/horovod/horovod.git@3a31d93 \ - "prov4ml[nvidia]@git+https://github.com/matbun/ProvML@new-main" \ ray[tune] \ # fix .triton/autotune/Fp16Matmul_2d_kernel.pickle bug && pver="$(python --version 2>&1 | awk '{print $2}' | cut -f1-2 -d.)" \ && line=$(cat -n "/usr/local/lib/python${pver}/dist-packages/deepspeed/ops/transformer/inference/triton/matmul_ext.py" | grep os.rename | awk '{print $1}' | head -n 1) \ && sed -i "${line}s|^|#|" "/usr/local/lib/python${pver}/dist-packages/deepspeed/ops/transformer/inference/triton/matmul_ext.py" -# Install itwinai -COPY pyproject.toml pyproject.toml -COPY src src -# Torch: reuse the global torch in the container -RUN CONTAINER_TORCH_VERSION="$(python -c 'import torch;print(torch.__version__)')" \ - && pip install --no-cache-dir torch=="$CONTAINER_TORCH_VERSION" \ - .[torch,dev] \ - && itwinai sanity-check --torch \ +# Installation sanity check +RUN itwinai sanity-check --torch \ --optional-deps deepspeed \ --optional-deps horovod \ --optional-deps ray diff --git a/env-files/torch/createEnvVega.sh b/env-files/torch/createEnvVega.sh index f4275fb3..7b859eea 100644 --- a/env-files/torch/createEnvVega.sh +++ b/env-files/torch/createEnvVega.sh @@ -9,7 +9,7 @@ fi # Load modules # NOTE: REFLECT THEM IN THE MAIN README! ml --force purge -ml Python +ml Python/3.11.5-GCCcore-13.2.0 ml CMake/3.24.3-GCCcore-11.3.0 ml mpi4py ml OpenMPI diff --git a/env-files/torch/jupyter/Dockerfile b/env-files/torch/jupyter/Dockerfile index fad0d41e..b296b9cc 100644 --- a/env-files/torch/jupyter/Dockerfile +++ b/env-files/torch/jupyter/Dockerfile @@ -210,9 +210,14 @@ RUN pip install --no-cache-dir --upgrade pip \ 'numpy<2' \ packaging \ py-cpuinfo \ - torch==2.4.* torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 \ wheel +# Install itwinai with torch +WORKDIR "$HOME/itwinai" +COPY --chown=${NB_UID} pyproject.toml pyproject.toml +COPY --chown=${NB_UID} src src +RUN pip install --no-cache-dir .[torch,nvidia,dev] --extra-index-url https://download.pytorch.org/whl/cu124 + # Apex: https://github.com/NVIDIA/apex # (needed for DeepSpeed *_FUSED optinal build options) # Note: it will take more than an hour to build @@ -225,7 +230,9 @@ RUN git clone https://github.com/NVIDIA/apex && cd apex \ # Note: it will take about half an hour to build RUN pip install --no-cache-dir transformer_engine[pytorch] -# DeepSpeed, Horovod and other deps +WORKDIR "$HOME/itwinai" + +# Install DeepSpeed, Horovod and Ray ENV HOROVOD_WITH_PYTORCH=1 \ HOROVOD_WITHOUT_TENSORFLOW=1 \ HOROVOD_WITHOUT_MXNET=1 \ @@ -247,16 +254,10 @@ ENV HOROVOD_WITH_PYTORCH=1 \ RUN pip install --no-cache-dir \ deepspeed==0.15.* \ git+https://github.com/horovod/horovod.git@3a31d93 \ - "prov4ml[nvidia]@git+https://github.com/matbun/ProvML@e99bddba047a1a04ca0387e87cad1ed96d5ab2c5" \ ray[tune] - -# Core itwinai lib -WORKDIR "$HOME/itwinai" -COPY --chown=${NB_UID} pyproject.toml pyproject.toml -COPY --chown=${NB_UID} src src -RUN pip install --no-cache-dir ".[torch,dev]" \ - && itwinai sanity-check --torch \ +# Installation sanity check +RUN itwinai sanity-check --torch \ --optional-deps deepspeed \ --optional-deps horovod \ --optional-deps ray diff --git a/env-files/torch/slim.Dockerfile b/env-files/torch/slim.Dockerfile index fd8e2d01..187046d5 100644 --- a/env-files/torch/slim.Dockerfile +++ b/env-files/torch/slim.Dockerfile @@ -48,26 +48,26 @@ RUN /usr/bin/python3.10 -m venv /opt/venv \ && pip install --no-cache-dir --upgrade pip \ # https://github.com/mpi4py/mpi4py/pull/431 && env SETUPTOOLS_USE_DISTUTILS=local python -m pip install --no-cache-dir mpi4py \ - && pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cu124 \ - "torch==2.4.*" \ - torchvision \ - torchaudio \ + && pip install --no-cache-dir \ # Needed to install horovod wheel +# Install itwinai with torch +WORKDIR /app +COPY pyproject.toml pyproject.toml +COPY src src +RUN pip install --no-cache-dir .[torch,nvidia,dev] --extra-index-url https://download.pytorch.org/whl/cu124 + +# Install DeepSpeed, Horovod and Ray RUN CONTAINER_TORCH_VERSION="$(python -c 'import torch;print(torch.__version__)')" \ && pip install --no-cache-dir torch=="$CONTAINER_TORCH_VERSION" \ deepspeed==0.15.* \ git+https://github.com/horovod/horovod.git@3a31d93 \ - "prov4ml[nvidia]@git+https://github.com/matbun/ProvML@new-main" \ ray[tune] -WORKDIR /app -COPY src src -COPY pyproject.toml pyproject.toml -RUN pip install --no-cache-dir .[torch,dev] \ - && itwinai sanity-check --torch \ +# Installation sanity check +RUN itwinai sanity-check --torch \ --optional-deps deepspeed \ --optional-deps horovod \ --optional-deps ray diff --git a/pyproject.toml b/pyproject.toml index e389b5d4..7ce59730 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,11 +44,11 @@ dependencies = [ [project.optional-dependencies] torch = [ - "torch==2.4.*", - "lightning==2.*", - "torchmetrics>=1.6.0", - "torchvision>=0.19.1", - "torchaudio>=2.4.1", + "torch==2.4.*", + "lightning==2.*", + "torchmetrics>=1.6.0", + "torchvision>=0.19.1", + "torchaudio>=2.4.1", ] dev = [ "pytest>=7.4.2", @@ -59,19 +59,16 @@ dev = [ "isort", "tensorflow==2.16.*", # needed by tests on tensorboard ] -docs = [ - "sphinx-rtd-theme==2.0.0", - "nbsphinx==0.9.4", - "myst-parser==2.0.0", - "IPython", - "tensorflow==2.16.*", -] -macos = [ - "prov4ml[apple]@git+https://github.com/matbun/ProvML" -] -linux = [ - "prov4ml[linux]@git+https://github.com/matbun/ProvML" +docs = [ + "sphinx-rtd-theme==2.0.0", + "nbsphinx==0.9.4", + "myst-parser==2.0.0", + "IPython", + "tensorflow==2.16.*", ] +macos = ["prov4ml[apple]@git+https://github.com/matbun/ProvML@new-main"] +nvidia = ["prov4ml[nvidia]@git+https://github.com/matbun/ProvML@new-main"] +amd = ["prov4ml[amd]@git+https://github.com/matbun/ProvML@new-main"] [project.urls] Homepage = "https://www.intertwin.eu/" diff --git a/tests/torch/slurm.vega.sh b/tests/torch/slurm.vega.sh index 527ebd1e..1b4df076 100644 --- a/tests/torch/slurm.vega.sh +++ b/tests/torch/slurm.vega.sh @@ -42,7 +42,7 @@ echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" ml --force purge # ml Python CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI/4.1.5-GCC-12.3.0 CUDA/12.3 -ml Python CMake/3.24.3-GCCcore-11.3.0 OpenMPI/4.1.5-GCC-12.3.0 CUDA/12.3 +ml Python/3.11.5-GCCcore-13.2.0 CMake/3.24.3-GCCcore-11.3.0 OpenMPI/4.1.5-GCC-12.3.0 CUDA/12.3 ml GCCcore/11.3.0 NCCL cuDNN/8.9.7.29-CUDA-12.3.0 ml UCX-CUDA/1.15.0-GCCcore-13.2.0-CUDA-12.3.0