From 0cae49b6cf27e339dee57da7c00e902fe8ef911d Mon Sep 17 00:00:00 2001
From: Matteo Bunino <matteo.bunino@gmail.com>
Date: Thu, 21 Nov 2024 15:02:09 +0100
Subject: [PATCH] Update deps: pin Vega Python module, install prov4ml via extras

---
 README.md                          |  8 ++++----
 env-files/tensorflow/Dockerfile    |  3 +--
 env-files/torch/Dockerfile         | 18 ++++++++---------
 env-files/torch/createEnvVega.sh   |  2 +-
 env-files/torch/jupyter/Dockerfile | 21 ++++++++++----------
 env-files/torch/slim.Dockerfile    | 20 +++++++++----------
 pyproject.toml                     | 31 ++++++++++++++----------------
 tests/torch/slurm.vega.sh          |  2 +-
 8 files changed, 50 insertions(+), 55 deletions(-)

diff --git a/README.md b/README.md
index 265d03c9..9eb0cb92 100644
--- a/README.md
+++ b/README.md
@@ -59,7 +59,7 @@ environment for PyTorch:
 
     ```bash
     ml --force purge
-    ml Python CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3
+    ml Python/3.11.5-GCCcore-13.2.0 CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3
     ml GCCcore/11.3.0 NCCL cuDNN/8.9.7.29-CUDA-12.3.0 UCX-CUDA/1.15.0-GCCcore-13.2.0-CUDA-12.3.0
     ```
 
@@ -80,7 +80,7 @@ environment for TensorFlow:
 
     ```bash
     ml --force purge
-    ml Python CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3
+    ml Python/3.11.5-GCCcore-13.2.0 CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3
     ml GCCcore/11.3.0 NCCL cuDNN/8.9.7.29-CUDA-12.3.0 UCX-CUDA/1.15.0-GCCcore-13.2.0-CUDA-12.3.0
     ```
 
@@ -227,7 +227,7 @@ Commands to be executed before activating the python virtual environment:
 
     ```bash
     ml --force purge
-    ml Python CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3
+    ml Python/3.11.5-GCCcore-13.2.0 CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3
     ml GCCcore/11.3.0 NCCL cuDNN/8.9.7.29-CUDA-12.3.0 UCX-CUDA/1.15.0-GCCcore-13.2.0-CUDA-12.3.0
     ```
 
@@ -261,7 +261,7 @@ Commands to be executed before activating the python virtual environment:
 
     ```bash
     ml --force purge
-    ml Python CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3
+    ml Python/3.11.5-GCCcore-13.2.0 CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3
     ml GCCcore/11.3.0 NCCL cuDNN/8.9.7.29-CUDA-12.3.0 UCX-CUDA/1.15.0-GCCcore-13.2.0-CUDA-12.3.0
     ```
 
diff --git a/env-files/tensorflow/Dockerfile b/env-files/tensorflow/Dockerfile
index cc5f7598..1872a18b 100644
--- a/env-files/tensorflow/Dockerfile
+++ b/env-files/tensorflow/Dockerfile
@@ -33,7 +33,6 @@ RUN apt-get update && apt-get install -y \
 RUN pip install --no-cache-dir --upgrade pip \
     && pip install --no-cache-dir \
     tf_keras==2.16.* \
-    "prov4ml[nvidia]@git+https://github.com/matbun/ProvML@new-main" \
     ray[tune]
 
 # Inprove robustness: avoid silent override by Singularity/Apptainer
@@ -42,7 +41,7 @@ ENV PYTHONPATH=""
 # Install itwinai
 COPY pyproject.toml pyproject.toml
 COPY src src
-RUN pip install --no-cache-dir . \
+RUN pip install --no-cache-dir ".[nvidia]" \
     && itwinai sanity-check --tensorflow --optional-deps ray
 
 # Additional pip deps
diff --git a/env-files/torch/Dockerfile b/env-files/torch/Dockerfile
index a86233e9..d027a8f1 100644
--- a/env-files/torch/Dockerfile
+++ b/env-files/torch/Dockerfile
@@ -32,7 +32,12 @@ RUN apt-get update && apt-get install -y \
 RUN pip install --no-cache-dir --upgrade pip \
     && env SETUPTOOLS_USE_DISTUTILS=local python -m pip install --no-cache-dir mpi4py
 
-# DeepSpeed, Horovod and other deps
+# Install itwinai with torch
+COPY pyproject.toml pyproject.toml
+COPY src src
+RUN pip install --no-cache-dir ".[torch,nvidia,dev]" --extra-index-url https://download.pytorch.org/whl/cu124
+
+# Install DeepSpeed, Horovod and Ray
 ENV HOROVOD_WITH_PYTORCH=1 \
     HOROVOD_WITHOUT_TENSORFLOW=1 \
     HOROVOD_WITHOUT_MXNET=1 \
@@ -56,21 +61,14 @@ RUN CONTAINER_TORCH_VERSION="$(python -c 'import torch;print(torch.__version__)'
     && pip install --no-cache-dir torch=="$CONTAINER_TORCH_VERSION" \
     deepspeed==0.15.* \
     git+https://github.com/horovod/horovod.git@3a31d93 \
-    "prov4ml[nvidia]@git+https://github.com/matbun/ProvML@new-main" \
     ray[tune] \
     # fix .triton/autotune/Fp16Matmul_2d_kernel.pickle bug
     && pver="$(python --version 2>&1 | awk '{print $2}' | cut -f1-2 -d.)" \
     && line=$(cat -n "/usr/local/lib/python${pver}/dist-packages/deepspeed/ops/transformer/inference/triton/matmul_ext.py" | grep os.rename | awk '{print $1}' | head -n 1) \
     && sed -i "${line}s|^|#|" "/usr/local/lib/python${pver}/dist-packages/deepspeed/ops/transformer/inference/triton/matmul_ext.py"
 
-# Install itwinai
-COPY pyproject.toml pyproject.toml
-COPY src src
-# Torch: reuse the global torch in the container
-RUN CONTAINER_TORCH_VERSION="$(python -c 'import torch;print(torch.__version__)')" \
-    && pip install --no-cache-dir torch=="$CONTAINER_TORCH_VERSION" \
-    .[torch,dev] \
-    && itwinai sanity-check --torch \
+# Installation sanity check
+RUN itwinai sanity-check --torch \
     --optional-deps deepspeed \
     --optional-deps horovod \
     --optional-deps ray
diff --git a/env-files/torch/createEnvVega.sh b/env-files/torch/createEnvVega.sh
index f4275fb3..7b859eea 100644
--- a/env-files/torch/createEnvVega.sh
+++ b/env-files/torch/createEnvVega.sh
@@ -9,7 +9,7 @@ fi
 # Load modules
 # NOTE: REFLECT THEM IN THE MAIN README! 
 ml --force purge
-ml Python 
+ml Python/3.11.5-GCCcore-13.2.0
 ml CMake/3.24.3-GCCcore-11.3.0
 ml mpi4py
 ml OpenMPI
diff --git a/env-files/torch/jupyter/Dockerfile b/env-files/torch/jupyter/Dockerfile
index fad0d41e..b296b9cc 100644
--- a/env-files/torch/jupyter/Dockerfile
+++ b/env-files/torch/jupyter/Dockerfile
@@ -210,9 +210,14 @@ RUN pip install --no-cache-dir --upgrade pip \
     'numpy<2' \
     packaging \
     py-cpuinfo \
-    torch==2.4.* torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 \
     wheel
 
+# Install itwinai with torch
+WORKDIR "$HOME/itwinai"
+COPY --chown=${NB_UID} pyproject.toml pyproject.toml
+COPY --chown=${NB_UID} src src
+RUN pip install --no-cache-dir ".[torch,nvidia,dev]" --extra-index-url https://download.pytorch.org/whl/cu124
+
 # Apex: https://github.com/NVIDIA/apex
 # (needed for DeepSpeed *_FUSED optinal build options)
 # Note: it will take more than an hour to build
@@ -225,7 +230,9 @@ RUN git clone https://github.com/NVIDIA/apex && cd apex \
 # Note: it will take about half an hour to build
 RUN pip install --no-cache-dir transformer_engine[pytorch]
 
-# DeepSpeed, Horovod and other deps
+WORKDIR "$HOME/itwinai"
+
+# Install DeepSpeed, Horovod and Ray
 ENV HOROVOD_WITH_PYTORCH=1 \
     HOROVOD_WITHOUT_TENSORFLOW=1 \
     HOROVOD_WITHOUT_MXNET=1 \
@@ -247,16 +254,10 @@ ENV HOROVOD_WITH_PYTORCH=1 \
 RUN pip install --no-cache-dir \
     deepspeed==0.15.* \
     git+https://github.com/horovod/horovod.git@3a31d93 \
-    "prov4ml[nvidia]@git+https://github.com/matbun/ProvML@e99bddba047a1a04ca0387e87cad1ed96d5ab2c5" \
     ray[tune]
 
-
-# Core itwinai lib
-WORKDIR "$HOME/itwinai"
-COPY --chown=${NB_UID} pyproject.toml pyproject.toml
-COPY --chown=${NB_UID} src src
-RUN pip install --no-cache-dir ".[torch,dev]" \
-    && itwinai sanity-check --torch \
+# Installation sanity check
+RUN itwinai sanity-check --torch \
     --optional-deps deepspeed \
     --optional-deps horovod \
     --optional-deps ray
diff --git a/env-files/torch/slim.Dockerfile b/env-files/torch/slim.Dockerfile
index fd8e2d01..187046d5 100644
--- a/env-files/torch/slim.Dockerfile
+++ b/env-files/torch/slim.Dockerfile
@@ -48,26 +48,26 @@ RUN /usr/bin/python3.10 -m venv /opt/venv \
     && pip install --no-cache-dir --upgrade pip \
     # https://github.com/mpi4py/mpi4py/pull/431
     && env SETUPTOOLS_USE_DISTUTILS=local python -m pip install --no-cache-dir mpi4py \
-    && pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cu124 \
-    "torch==2.4.*" \
-    torchvision \
-    torchaudio \
+    && pip install --no-cache-dir \
     # Needed to install horovod
     wheel
 
+# Install itwinai with torch
+WORKDIR /app
+COPY pyproject.toml pyproject.toml
+COPY src src
+RUN pip install --no-cache-dir ".[torch,nvidia,dev]" --extra-index-url https://download.pytorch.org/whl/cu124
+
+# Install DeepSpeed, Horovod and Ray
 RUN CONTAINER_TORCH_VERSION="$(python -c 'import torch;print(torch.__version__)')" \
     && pip install --no-cache-dir torch=="$CONTAINER_TORCH_VERSION" \
     deepspeed==0.15.* \
     git+https://github.com/horovod/horovod.git@3a31d93 \
-    "prov4ml[nvidia]@git+https://github.com/matbun/ProvML@new-main" \
     ray[tune] 
 
-WORKDIR /app
-COPY src src
-COPY pyproject.toml pyproject.toml
 
-RUN pip install --no-cache-dir .[torch,dev] \
-    && itwinai sanity-check --torch \
+# Installation sanity check
+RUN itwinai sanity-check --torch \
     --optional-deps deepspeed \
     --optional-deps horovod \
     --optional-deps ray
diff --git a/pyproject.toml b/pyproject.toml
index e389b5d4..7ce59730 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,11 +44,11 @@ dependencies = [
 
 [project.optional-dependencies]
 torch = [
-  "torch==2.4.*",
-  "lightning==2.*",
-  "torchmetrics>=1.6.0",
-  "torchvision>=0.19.1",
-  "torchaudio>=2.4.1",
+    "torch==2.4.*",
+    "lightning==2.*",
+    "torchmetrics>=1.6.0",
+    "torchvision>=0.19.1",
+    "torchaudio>=2.4.1",
 ]
 dev = [
     "pytest>=7.4.2",
@@ -59,19 +59,16 @@ dev = [
     "isort",
     "tensorflow==2.16.*",  # needed by tests on tensorboard
 ]
-docs = [ 
-  "sphinx-rtd-theme==2.0.0",
-  "nbsphinx==0.9.4", 
-  "myst-parser==2.0.0",
-  "IPython",
-  "tensorflow==2.16.*",
-]
-macos = [
-    "prov4ml[apple]@git+https://github.com/matbun/ProvML"
-]
-linux = [
-    "prov4ml[linux]@git+https://github.com/matbun/ProvML"
+docs = [
+    "sphinx-rtd-theme==2.0.0",
+    "nbsphinx==0.9.4",
+    "myst-parser==2.0.0",
+    "IPython",
+    "tensorflow==2.16.*",
 ]
+macos = ["prov4ml[apple]@git+https://github.com/matbun/ProvML@new-main"]
+nvidia = ["prov4ml[nvidia]@git+https://github.com/matbun/ProvML@new-main"]
+amd = ["prov4ml[amd]@git+https://github.com/matbun/ProvML@new-main"]
 
 [project.urls]
 Homepage = "https://www.intertwin.eu/"
diff --git a/tests/torch/slurm.vega.sh b/tests/torch/slurm.vega.sh
index 527ebd1e..1b4df076 100644
--- a/tests/torch/slurm.vega.sh
+++ b/tests/torch/slurm.vega.sh
@@ -42,7 +42,7 @@ echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
 
 ml --force purge
 # ml Python CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI/4.1.5-GCC-12.3.0 CUDA/12.3
-ml Python CMake/3.24.3-GCCcore-11.3.0 OpenMPI/4.1.5-GCC-12.3.0 CUDA/12.3
+ml Python/3.11.5-GCCcore-13.2.0 CMake/3.24.3-GCCcore-11.3.0 OpenMPI/4.1.5-GCC-12.3.0 CUDA/12.3
 ml GCCcore/11.3.0 NCCL cuDNN/8.9.7.29-CUDA-12.3.0
 ml UCX-CUDA/1.15.0-GCCcore-13.2.0-CUDA-12.3.0