Skip to content

Commit

Permalink
Update deps
Browse files Browse the repository at this point in the history
  • Loading branch information
matbun committed Nov 21, 2024
1 parent d41f6dd commit 0cae49b
Show file tree
Hide file tree
Showing 8 changed files with 50 additions and 55 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ environment for PyTorch:

```bash
ml --force purge
ml Python CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3
ml Python/3.11.5-GCCcore-13.2.0 CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3
ml GCCcore/11.3.0 NCCL cuDNN/8.9.7.29-CUDA-12.3.0 UCX-CUDA/1.15.0-GCCcore-13.2.0-CUDA-12.3.0
```

Expand All @@ -80,7 +80,7 @@ environment for TensorFlow:

```bash
ml --force purge
ml Python CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3
ml Python/3.11.5-GCCcore-13.2.0 CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3
ml GCCcore/11.3.0 NCCL cuDNN/8.9.7.29-CUDA-12.3.0 UCX-CUDA/1.15.0-GCCcore-13.2.0-CUDA-12.3.0
```

Expand Down Expand Up @@ -227,7 +227,7 @@ Commands to be executed before activating the python virtual environment:

```bash
ml --force purge
ml Python CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3
ml Python/3.11.5-GCCcore-13.2.0 CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3
ml GCCcore/11.3.0 NCCL cuDNN/8.9.7.29-CUDA-12.3.0 UCX-CUDA/1.15.0-GCCcore-13.2.0-CUDA-12.3.0
```

Expand Down Expand Up @@ -261,7 +261,7 @@ Commands to be executed before activating the python virtual environment:

```bash
ml --force purge
ml Python CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3
ml Python/3.11.5-GCCcore-13.2.0 CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3
ml GCCcore/11.3.0 NCCL cuDNN/8.9.7.29-CUDA-12.3.0 UCX-CUDA/1.15.0-GCCcore-13.2.0-CUDA-12.3.0
```

Expand Down
3 changes: 1 addition & 2 deletions env-files/tensorflow/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ RUN apt-get update && apt-get install -y \
RUN pip install --no-cache-dir --upgrade pip \
&& pip install --no-cache-dir \
tf_keras==2.16.* \
"prov4ml[nvidia]@git+https://github.com/matbun/ProvML@new-main" \
ray[tune]

# Improve robustness: avoid silent override by Singularity/Apptainer
Expand All @@ -42,7 +41,7 @@ ENV PYTHONPATH=""
# Install itwinai
COPY pyproject.toml pyproject.toml
COPY src src
RUN pip install --no-cache-dir . \
RUN pip install --no-cache-dir .[nvidia] \
&& itwinai sanity-check --tensorflow --optional-deps ray

# Additional pip deps
Expand Down
18 changes: 8 additions & 10 deletions env-files/torch/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,12 @@ RUN apt-get update && apt-get install -y \
RUN pip install --no-cache-dir --upgrade pip \
&& env SETUPTOOLS_USE_DISTUTILS=local python -m pip install --no-cache-dir mpi4py

# DeepSpeed, Horovod and other deps
# Install itwinai with torch
COPY pyproject.toml pyproject.toml
COPY src src
RUN pip install --no-cache-dir .[torch,nvidia,dev] --extra-index-url https://download.pytorch.org/whl/cu124

# Install DeepSpeed, Horovod and Ray
ENV HOROVOD_WITH_PYTORCH=1 \
HOROVOD_WITHOUT_TENSORFLOW=1 \
HOROVOD_WITHOUT_MXNET=1 \
Expand All @@ -56,21 +61,14 @@ RUN CONTAINER_TORCH_VERSION="$(python -c 'import torch;print(torch.__version__)'
&& pip install --no-cache-dir torch=="$CONTAINER_TORCH_VERSION" \
deepspeed==0.15.* \
git+https://github.com/horovod/horovod.git@3a31d93 \
"prov4ml[nvidia]@git+https://github.com/matbun/ProvML@new-main" \
ray[tune] \
# fix .triton/autotune/Fp16Matmul_2d_kernel.pickle bug
&& pver="$(python --version 2>&1 | awk '{print $2}' | cut -f1-2 -d.)" \
&& line=$(cat -n "/usr/local/lib/python${pver}/dist-packages/deepspeed/ops/transformer/inference/triton/matmul_ext.py" | grep os.rename | awk '{print $1}' | head -n 1) \
&& sed -i "${line}s|^|#|" "/usr/local/lib/python${pver}/dist-packages/deepspeed/ops/transformer/inference/triton/matmul_ext.py"

# Install itwinai
COPY pyproject.toml pyproject.toml
COPY src src
# Torch: reuse the global torch in the container
RUN CONTAINER_TORCH_VERSION="$(python -c 'import torch;print(torch.__version__)')" \
&& pip install --no-cache-dir torch=="$CONTAINER_TORCH_VERSION" \
.[torch,dev] \
&& itwinai sanity-check --torch \
# Installation sanity check
RUN itwinai sanity-check --torch \
--optional-deps deepspeed \
--optional-deps horovod \
--optional-deps ray
Expand Down
2 changes: 1 addition & 1 deletion env-files/torch/createEnvVega.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ fi
# Load modules
# NOTE: REFLECT THEM IN THE MAIN README!
ml --force purge
ml Python
ml Python/3.11.5-GCCcore-13.2.0
ml CMake/3.24.3-GCCcore-11.3.0
ml mpi4py
ml OpenMPI
Expand Down
21 changes: 11 additions & 10 deletions env-files/torch/jupyter/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -210,9 +210,14 @@ RUN pip install --no-cache-dir --upgrade pip \
'numpy<2' \
packaging \
py-cpuinfo \
torch==2.4.* torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 \
wheel

# Install itwinai with torch
WORKDIR "$HOME/itwinai"
COPY --chown=${NB_UID} pyproject.toml pyproject.toml
COPY --chown=${NB_UID} src src
RUN pip install --no-cache-dir .[torch,nvidia,dev] --extra-index-url https://download.pytorch.org/whl/cu124

# Apex: https://github.com/NVIDIA/apex
# (needed for DeepSpeed *_FUSED optional build options)
# Note: it will take more than an hour to build
Expand All @@ -225,7 +230,9 @@ RUN git clone https://github.com/NVIDIA/apex && cd apex \
# Note: it will take about half an hour to build
RUN pip install --no-cache-dir transformer_engine[pytorch]

# DeepSpeed, Horovod and other deps
WORKDIR "$HOME/itwinai"

# Install DeepSpeed, Horovod and Ray
ENV HOROVOD_WITH_PYTORCH=1 \
HOROVOD_WITHOUT_TENSORFLOW=1 \
HOROVOD_WITHOUT_MXNET=1 \
Expand All @@ -247,16 +254,10 @@ ENV HOROVOD_WITH_PYTORCH=1 \
RUN pip install --no-cache-dir \
deepspeed==0.15.* \
git+https://github.com/horovod/horovod.git@3a31d93 \
"prov4ml[nvidia]@git+https://github.com/matbun/ProvML@e99bddba047a1a04ca0387e87cad1ed96d5ab2c5" \
ray[tune]


# Core itwinai lib
WORKDIR "$HOME/itwinai"
COPY --chown=${NB_UID} pyproject.toml pyproject.toml
COPY --chown=${NB_UID} src src
RUN pip install --no-cache-dir ".[torch,dev]" \
&& itwinai sanity-check --torch \
# Installation sanity check
RUN itwinai sanity-check --torch \
--optional-deps deepspeed \
--optional-deps horovod \
--optional-deps ray
Expand Down
20 changes: 10 additions & 10 deletions env-files/torch/slim.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -48,26 +48,26 @@ RUN /usr/bin/python3.10 -m venv /opt/venv \
&& pip install --no-cache-dir --upgrade pip \
# https://github.com/mpi4py/mpi4py/pull/431
&& env SETUPTOOLS_USE_DISTUTILS=local python -m pip install --no-cache-dir mpi4py \
&& pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cu124 \
"torch==2.4.*" \
torchvision \
torchaudio \
&& pip install --no-cache-dir \
# Needed to install horovod
wheel

# Install itwinai with torch
WORKDIR /app
COPY pyproject.toml pyproject.toml
COPY src src
RUN pip install --no-cache-dir .[torch,nvidia,dev] --extra-index-url https://download.pytorch.org/whl/cu124

# Install DeepSpeed, Horovod and Ray
RUN CONTAINER_TORCH_VERSION="$(python -c 'import torch;print(torch.__version__)')" \
&& pip install --no-cache-dir torch=="$CONTAINER_TORCH_VERSION" \
deepspeed==0.15.* \
git+https://github.com/horovod/horovod.git@3a31d93 \
"prov4ml[nvidia]@git+https://github.com/matbun/ProvML@new-main" \
ray[tune]

WORKDIR /app
COPY src src
COPY pyproject.toml pyproject.toml

RUN pip install --no-cache-dir .[torch,dev] \
&& itwinai sanity-check --torch \
# Installation sanity check
RUN itwinai sanity-check --torch \
--optional-deps deepspeed \
--optional-deps horovod \
--optional-deps ray
Expand Down
31 changes: 14 additions & 17 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,11 @@ dependencies = [

[project.optional-dependencies]
torch = [
"torch==2.4.*",
"lightning==2.*",
"torchmetrics>=1.6.0",
"torchvision>=0.19.1",
"torchaudio>=2.4.1",
"torch==2.4.*",
"lightning==2.*",
"torchmetrics>=1.6.0",
"torchvision>=0.19.1",
"torchaudio>=2.4.1",
]
dev = [
"pytest>=7.4.2",
Expand All @@ -59,19 +59,16 @@ dev = [
"isort",
"tensorflow==2.16.*", # needed by tests on tensorboard
]
docs = [
"sphinx-rtd-theme==2.0.0",
"nbsphinx==0.9.4",
"myst-parser==2.0.0",
"IPython",
"tensorflow==2.16.*",
]
macos = [
"prov4ml[apple]@git+https://github.com/matbun/ProvML"
]
linux = [
"prov4ml[linux]@git+https://github.com/matbun/ProvML"
docs = [
"sphinx-rtd-theme==2.0.0",
"nbsphinx==0.9.4",
"myst-parser==2.0.0",
"IPython",
"tensorflow==2.16.*",
]
macos = ["prov4ml[apple]@git+https://github.com/matbun/ProvML@new-main"]
nvidia = ["prov4ml[nvidia]@git+https://github.com/matbun/ProvML@new-main"]
amd = ["prov4ml[amd]@git+https://github.com/matbun/ProvML@new-main"]

[project.urls]
Homepage = "https://www.intertwin.eu/"
Expand Down
2 changes: 1 addition & 1 deletion tests/torch/slurm.vega.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"

ml --force purge
# ml Python CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI/4.1.5-GCC-12.3.0 CUDA/12.3
ml Python CMake/3.24.3-GCCcore-11.3.0 OpenMPI/4.1.5-GCC-12.3.0 CUDA/12.3
ml Python/3.11.5-GCCcore-13.2.0 CMake/3.24.3-GCCcore-11.3.0 OpenMPI/4.1.5-GCC-12.3.0 CUDA/12.3
ml GCCcore/11.3.0 NCCL cuDNN/8.9.7.29-CUDA-12.3.0
ml UCX-CUDA/1.15.0-GCCcore-13.2.0-CUDA-12.3.0

Expand Down

0 comments on commit 0cae49b

Please sign in to comment.