Skip to content

Commit

Permalink
feat: Rocm dockerfile (#275)
Browse files Browse the repository at this point in the history
* feat: add infinityhub dockerfiles

* remove duplicate rocm

* move deepspeed rocm install to separate shell script

---------

Co-authored-by: root <[email protected]>
  • Loading branch information
MikhailKardash and will-HPE authored Aug 13, 2024
1 parent 71c41c7 commit 9179301
Show file tree
Hide file tree
Showing 7 changed files with 280 additions and 29 deletions.
4 changes: 4 additions & 0 deletions .github/CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,7 @@
# CI and Cloud-specific images are infra
/.circleci @determined-ai/infrastructure
/cloud @determined-ai/infrastructure

# Team-specific files
Dockerfile-infinityhub-pytorch @determined-ai/pad
Dockerfile-infinityhub-hpc @determined-ai/pad
80 changes: 80 additions & 0 deletions Dockerfile-infinityhub-hpc
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# HPC layer built on top of an existing image, supplied through BASE_IMAGE.
ARG BASE_IMAGE
FROM ${BASE_IMAGE}

# Install rocm-libs and drop the packaged openmpi/ucx.
# `-y` and DEBIAN_FRONTEND=noninteractive keep apt from stopping at a
# confirmation prompt, which cannot be answered during `docker build`.
# apt-get is used because `apt` does not promise a stable scripting interface.
# NOTE(review): no `apt-get update` is run here, so this relies on the package
# index already present in the base image — confirm that is intended.
RUN DEBIAN_FRONTEND=noninteractive apt-get install -y rocm-libs \
    && apt-get remove -y openmpi ucx

# Remove existing /opt/ompi and link to our version: /opt/ompi becomes a
# symlink to /container/ompi.
RUN rm -rf /opt/ompi \
    && ln -s /container/ompi /opt

# Helper scripts used by the build steps below; removed again at the end.
COPY dockerfile_scripts /tmp/det_dockerfile_scripts

# Feature toggles for the MPI stack. WITH_MPICH has no default, so it is empty
# unless passed with --build-arg.
ARG WITH_MPI=1
ARG WITH_OFI=1
ARG WITH_MPICH

# Install prefixes. Build args are also visible in the environment of RUN
# steps, so the scripts invoked below can read them.
ARG UCX_INSTALL_DIR=/container/ucx
ARG OMPI_INSTALL_DIR=/container/ompi
ARG MPICH_INSTALL_DIR=/container/mpich
ARG OFI_INSTALL_DIR=/container/ofi
ARG OMPI_WITH_ROCM=1

# Run the MPI build script only when WITH_MPI is exactly "1".
# NOTE(review): UBUNTU_VERSION is not declared in this Dockerfile; presumably
# it is inherited from the base image environment — confirm.
RUN if [ "$WITH_MPI" = "1" ]; then \
        /tmp/det_dockerfile_scripts/ompi_rocm.sh \
            "$UBUNTU_VERSION" \
            "$WITH_OFI" \
            "$OMPI_WITH_ROCM" \
            "$WITH_MPICH"; \
    fi

# Make sure OMPI/UCX show up in the right paths.
# Build-time helpers: library and binary directories derived from the install
# prefixes declared earlier in this file.
ARG VERBS_LIB_DIR=/usr/lib/libibverbs
ARG UCX_LIB_DIR=${UCX_INSTALL_DIR}/lib:${UCX_INSTALL_DIR}/lib64
ARG UCX_PATH_DIR=${UCX_INSTALL_DIR}/bin
ARG OFI_LIB_DIR=${OFI_INSTALL_DIR}/lib:${OFI_INSTALL_DIR}/lib64
ARG OFI_PATH_DIR=${OFI_INSTALL_DIR}/bin
ARG OMPI_LIB_DIR=${OMPI_INSTALL_DIR}/lib
ARG OMPI_PATH_DIR=${OMPI_INSTALL_DIR}/bin
ARG MPICH_LIB_DIR=${MPICH_INSTALL_DIR}/lib
ARG MPICH_PATH_DIR=${MPICH_INSTALL_DIR}/bin

# Set up UCX_LIBS and OFI_LIBS. Both end with ':' so they can be prepended
# directly to an existing search path.
ENV UCX_LIBS="${VERBS_LIB_DIR}:${UCX_LIB_DIR}:${OMPI_LIB_DIR}:"
ENV OFI_LIBS="${VERBS_LIB_DIR}:${OFI_LIB_DIR}:${MPICH_LIB_DIR}:"

# If WITH_OFI is true, then set EXTRA_LIBS to OFI libs, else set to empty string
ENV EXTRA_LIBS="${WITH_OFI:+${OFI_LIBS}}"

# If EXTRA_LIBS is empty, set to UCX libs, else leave as OFI libs
ENV EXTRA_LIBS="${EXTRA_LIBS:-${UCX_LIBS}}"

# But, only add them if WITH_MPI
ENV LD_LIBRARY_PATH=${WITH_MPI:+$EXTRA_LIBS}$LD_LIBRARY_PATH

# Using OFI: when WITH_OFI is set, append the OFI and MPICH bin directories
# (only if WITH_MPI is set) to PATH. When WITH_OFI is not set this expands to
# an empty PATH, which the next instruction then fills in.
ENV PATH=${WITH_OFI:+$PATH:${WITH_MPI:+$OFI_PATH_DIR:$MPICH_PATH_DIR}}

# Using UCX: only takes effect when PATH is empty after the previous step.
# NOTE(review): CONDA is not defined in this Dockerfile; presumably it comes
# from the base image — confirm, otherwise this fallback yields a PATH without
# the system directories.
ENV PATH=${PATH:-$CONDA:${WITH_MPI:+$UCX_PATH_DIR:$OMPI_PATH_DIR}}

# NOTE(review): this prepends OFI_INSTALL_DIR (the install prefix), not
# OFI_PATH_DIR (its bin directory) — confirm which one is intended.
ENV PATH=$OMPI_PATH_DIR:$OFI_INSTALL_DIR:$PATH

# Enable running OMPI as root: both variables are 1 when WITH_MPI is set.
# Written as key=value; the space-separated `ENV key value` form is deprecated.
ENV OMPI_ALLOW_RUN_AS_ROOT=${WITH_MPI:+1}
ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=${WITH_MPI:+1}


# Build arguments for build_aws_rocm.sh. Those without a default are empty
# unless supplied with --build-arg; all of them are visible in the RUN
# environment below.
ARG AWS_PLUGIN_INSTALL_DIR=/container/aws
ARG WITH_AWS_TRACE
ARG INTERNAL_AWS_DS
ARG INTERNAL_AWS_PATH
ARG ROCM_DIR=/opt/rocm
# Persist ROCM_DIR into the image environment. Written as key=value; the
# space-separated `ENV key value` form is deprecated.
ENV ROCM_DIR=$ROCM_DIR
# The script runs only when WITH_OFI is exactly "1".
RUN if [ "$WITH_OFI" = "1" ]; then /tmp/det_dockerfile_scripts/build_aws_rocm.sh "$WITH_OFI" "$WITH_AWS_TRACE" "$WITH_MPICH"; fi
# Put the plugin install directory first on the loader path when WITH_OFI is set.
ENV LD_LIBRARY_PATH=${WITH_OFI:+$AWS_PLUGIN_INSTALL_DIR:}$LD_LIBRARY_PATH

# Runtime toggles, overridable at build time and baked into the image
# environment.
# NOTE(review): presumably read by scrape_libs.sh when the container starts —
# confirm against the script.
ARG WITH_RCCL=1
ARG WITH_NFS_WORKAROUND=1
ENV WITH_RCCL=$WITH_RCCL \
    WITH_NFS_WORKAROUND=$WITH_NFS_WORKAROUND

# Set an entrypoint that can scrape up the host libfabric.so and then
# run the user command. This is intended to enable performant execution
# on non-IB systems that have a proprietary libfabric.
RUN mkdir -p /container/bin \
    && cp /tmp/det_dockerfile_scripts/scrape_libs.sh /container/bin
ENTRYPOINT ["/container/bin/scrape_libs.sh"]

# Drop the build helper scripts (and anything else non-hidden) from /tmp.
RUN rm -r /tmp/*
42 changes: 42 additions & 0 deletions Dockerfile-infinityhub-pytorch
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# PyTorch image built on top of the image supplied through BASE_IMAGE.
ARG BASE_IMAGE
FROM ${BASE_IMAGE}
ENV PYTHONUNBUFFERED=1 PYTHONFAULTHANDLER=1 PYTHONHASHSEED=0
ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

RUN mkdir -p /var/run/sshd
# Drop the ROCm apt source. `-f` keeps the build from failing when the chosen
# base image does not ship this file.
RUN rm -f /etc/apt/sources.list.d/rocm.list
# Use `python -m pip` like the later install steps so the same interpreter's
# pip is upgraded; --no-cache-dir keeps the download cache out of the layer.
RUN python -m pip install --no-cache-dir --upgrade pip

# Helper scripts and requirement files used by the steps below.
COPY dockerfile_scripts /tmp/det_dockerfile_scripts

RUN /tmp/det_dockerfile_scripts/install_deb_packages.sh


# Install fixed version of FFI package for Ubuntu 20.04.
# This is done after above stuff to make sure we get right version.
RUN /tmp/det_dockerfile_scripts/install_package_fixes.sh
RUN /tmp/det_dockerfile_scripts/add_det_nobody_user.sh
RUN /tmp/det_dockerfile_scripts/install_libnss_determined.sh


# Install determined and remove it again in the same layer. `pip uninstall`
# removes only the named package, so its dependencies stay installed.
# Both commands go through `python -m pip` so they act on the same interpreter.
RUN python -m pip install --no-cache-dir determined \
    && python -m pip uninstall -y determined

RUN python -m pip install --no-cache-dir -r /tmp/det_dockerfile_scripts/additional-requirements-rocm.txt

# Notebook stack; the JupyterLab announcements extension is disabled right
# after installation.
RUN python -m pip install --no-cache-dir -r /tmp/det_dockerfile_scripts/notebook-requirements.txt \
    && jupyter labextension disable "@jupyterlab/apputils-extension:announcements"

# Point Jupyter's config, data and runtime directories at /run/determined.
ENV JUPYTER_CONFIG_DIR=/run/determined/jupyter/config \
    JUPYTER_DATA_DIR=/run/determined/jupyter/data \
    JUPYTER_RUNTIME_DIR=/run/determined/jupyter/runtime

# DeepSpeed is optional: the install script runs only when DEEPSPEED_PIP is
# non-empty. As a build arg it is also visible to the script's environment.
ARG DEEPSPEED_PIP
RUN if [ -n "$DEEPSPEED_PIP" ]; then \
        /tmp/det_dockerfile_scripts/install_deepspeed_rocm.sh; \
    fi

# MIOPEN_DEBUG_SAVE_TEMP_DIR is required
ENV MIOPEN_DEBUG_SAVE_TEMP_DIR=1

USER root
CMD ["/bin/bash"]

# Drop the build helper scripts (and anything else non-hidden) from /tmp.
RUN rm -r /tmp/*
146 changes: 123 additions & 23 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ CPU_PREFIX_310 := $(REGISTRY_REPO):py-3.10-
# Image-name prefixes of the form <REGISTRY_REPO>:<toolkit>-<version>-; an
# environment-specific suffix is appended to build the full tag.
CUDA_113_PREFIX := $(REGISTRY_REPO):cuda-11.3-
CUDA_118_PREFIX := $(REGISTRY_REPO):cuda-11.8-
ROCM_56_PREFIX := $(REGISTRY_REPO):rocm-5.6-
ROCM_57_PREFIX := $(REGISTRY_REPO):rocm-5.7-
ROCM_60_PREFIX := $(REGISTRY_REPO):rocm-6.0-
ROCM_61_PREFIX := $(REGISTRY_REPO):rocm-6.1-
# Unlike the prefixes above, this one is a fixed repo:tag string that does not
# derive from REGISTRY_REPO and does not end in '-'.
ROCM_60_TF_PREFIX := tensorflow-infinity-hub:tensorflow-infinity-hub


# Suffixes appended to environment names.
CPU_SUFFIX := -cpu
CUDA_SUFFIX := -cuda
Expand Down Expand Up @@ -140,6 +145,9 @@ NGC_PYTORCH_HPC_REPO := pytorch-ngc-hpc-dev
# Repository names for the NGC TensorFlow images.
NGC_TF_REPO := tensorflow-ngc-dev
NGC_TF_HPC_REPO := tensorflow-ngc-hpc-dev

# Repository names used by build-pytorch-infinityhub: the plain PyTorch image
# and the HPC image built on top of it.
INFINITYHUB_PYTORCH_REPO := pytorch-infinityhub-dev
INFINITYHUB_PYTORCH_HPC_REPO := pytorch-infinityhub-hpc-dev

# build hpc together since hpc is dependent on the normal build
.PHONY: build-pytorch-ngc
build-pytorch-ngc:
Expand All @@ -163,39 +171,131 @@ build-tensorflow-ngc:
-t $(DOCKERHUB_REGISTRY)/$(NGC_TF_HPC_REPO):$(SHORT_GIT_HASH) \
.


# DeepSpeed release passed to the InfinityHub PyTorch image as DEEPSPEED_PIP.
ROCM_DEEPSPEED_VERSION := 0.14.4
# MPI flavour switch, forwarded to the Dockerfiles as the WITH_MPICH build arg.
WITH_MPICH=1
export ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED := $(ROCM_61_PREFIX)pytorch-2.0-tf-2.10-rocm-deepspeed

# Two-stage build: first the PyTorch image, then the HPC image, which takes
# the freshly built PyTorch image (same SHORT_GIT_HASH tag) as its BASE_IMAGE.
.PHONY: build-pytorch-infinityhub
build-pytorch-infinityhub:
	docker build --shm-size='1gb' -f Dockerfile-infinityhub-pytorch \
		--build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \
		--build-arg DEEPSPEED_PIP="deepspeed==$(ROCM_DEEPSPEED_VERSION)" \
		-t $(DOCKERHUB_REGISTRY)/$(INFINITYHUB_PYTORCH_REPO):$(SHORT_GIT_HASH) \
		.
	docker build --shm-size='1gb' -f Dockerfile-infinityhub-hpc \
		--build-arg BASE_IMAGE=$(DOCKERHUB_REGISTRY)/$(INFINITYHUB_PYTORCH_REPO):$(SHORT_GIT_HASH) \
		--build-arg WITH_MPICH=$(WITH_MPICH) \
		-t $(DOCKERHUB_REGISTRY)/$(INFINITYHUB_PYTORCH_HPC_REPO):$(SHORT_GIT_HASH) \
		.


# Pick the environment-name suffix from WITH_MPICH: "...-rocm-mpich" when it
# is 1, "...-rocm-ompi" otherwise.
# NOTE(review): ROCM61_TORCH13_TF_ENVIRONMENT_NAME is assembled from
# ROCM_60_PREFIX, and the matching base image is rocm6.0, despite the "61" in
# the variable and target names — confirm which ROCm release is intended.
ifeq ($(WITH_MPICH),1)
ROCM56_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-mpich
ROCM61_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-mpich
else
ROCM56_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-ompi
ROCM61_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-ompi
endif
export ROCM56_TORCH13_TF_ENVIRONMENT_NAME := $(ROCM_56_PREFIX)$(ROCM56_TORCH13_MPI)
.PHONY: build-pytorch13-tf210-rocm56
build-pytorch13-tf210-rocm56:
export ROCM61_TORCH13_TF_ENVIRONMENT_NAME := $(ROCM_60_PREFIX)$(ROCM61_TORCH13_MPI)
.PHONY: build-pytorch13-tf210-rocm61
build-pytorch13-tf210-rocm61:
docker build -f Dockerfile-default-rocm \
--build-arg BASE_IMAGE="rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_1.13.1"\
--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
--build-arg HOROVOD_PIP="horovod==0.28.1" \
--build-arg WITH_MPICH=$(WITH_MPICH) \
-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH13_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH13_TF_ENVIRONMENT_NAME)-$(VERSION) \
.
--build-arg BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_1.13.1" \
--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
--build-arg HOROVOD_PIP="horovod==0.28.1" \
--build-arg WITH_MPICH=$(WITH_MPICH) \
-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH13_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH13_TF_ENVIRONMENT_NAME)-$(VERSION) \
.



# Pick the environment-name suffix from WITH_MPICH: "...-rocm-mpich" when it
# is 1, "...-rocm-ompi" otherwise.
# The environment names below are assigned with `:=`, so they are expanded
# immediately and keep this value even though ROCM61_TORCH_MPI is reassigned
# further down the file.
ifeq ($(WITH_MPICH),1)
ROCM56_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-mpich
ROCM61_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-mpich
else
ROCM56_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-ompi
ROCM61_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-ompi
endif
export ROCM56_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_56_PREFIX)$(ROCM56_TORCH_MPI)
.PHONY: build-pytorch20-tf210-rocm56
build-pytorch20-tf210-rocm56:
export ROCM61_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_61_PREFIX)$(ROCM61_TORCH_MPI)
.PHONY: build-pytorch20-tf210-rocm61
build-pytorch20-tf210-rocm61:
docker build -f Dockerfile-default-rocm \
--build-arg BASE_IMAGE="rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_2.0.1" \
--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
--build-arg HOROVOD_PIP="horovod==0.28.1" \
--build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \
--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
--build-arg HOROVOD_PIP="0" \
--build-arg WITH_MPICH=$(WITH_MPICH) \
-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \
.
-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \
.

# PyTorch-only ROCm 6.1 image (TensorFlow and Horovod are passed as "0").
# The name suffix follows WITH_MPICH: mpich when it is 1, ompi otherwise.
# This reassigns ROCM61_TORCH_MPI; ROCM61_TORCH_ENVIRONMENT_NAME captures the
# new value immediately because it is assigned with `:=`.
ifeq ($(WITH_MPICH),1)
  ROCM61_TORCH_MPI := pytorch-3.10-rocm-mpich
else
  ROCM61_TORCH_MPI := pytorch-3.10-rocm-ompi
endif
export ROCM61_TORCH_ENVIRONMENT_NAME := $(ROCM_61_PREFIX)$(ROCM61_TORCH_MPI)

.PHONY: build-pytorch20-rocm61
build-pytorch20-rocm61:
	docker build -f Dockerfile-default-rocm \
		--build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \
		--build-arg TENSORFLOW_PIP="0" \
		--build-arg HOROVOD_PIP="0" \
		--build-arg WITH_MPICH=$(WITH_MPICH) \
		-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
		-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_ENVIRONMENT_NAME)-$(VERSION) \
		.




# TensorFlow ROCm 6.1 image.
# Default ROCM_61_TF_PREFIX to ROCM_60_TF_PREFIX unless it is already set: an
# unset prefix expands to nothing, which would make the -t arguments below
# "<registry>/-<hash>" and "<registry>/-<version>".
# NOTE(review): the target name says tf210 but the base image tag is tf2.15 —
# confirm which is intended.
ROCM_61_TF_PREFIX ?= $(ROCM_60_TF_PREFIX)
export ROCM61_TF_ENVIRONMENT_NAME := $(ROCM_61_TF_PREFIX)
.PHONY: build-tf210-rocm61
build-tf210-rocm61:
	docker build -f Dockerfile-tensorflow-rocm \
		--build-arg BASE_IMAGE="rocm/tensorflow:rocm6.1-py3.9-tf2.15-dev" \
		--build-arg HOROVOD_PIP="0" \
		--build-arg WITH_MPICH=$(WITH_MPICH) \
		-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
		-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TF_ENVIRONMENT_NAME)-$(VERSION) \
		.


# DeepSpeed release used in the deepspeed==$(DEEPSPEED_VERSION) build args and
# in the GPU DeepSpeed environment name.
DEEPSPEED_VERSION := 0.8.3
# Environment names for the CUDA 11.3 DeepSpeed images.
export GPU_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.10-deepspeed-$(DEEPSPEED_VERSION)$(GPU_SUFFIX)
export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.10-gpt-neox-deepspeed$(GPU_SUFFIX)
# pip arguments selecting the cu113 builds of torch, torchvision and torchaudio.
export TORCH_PIP_DEEPSPEED_GPU := torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio==0.10.2+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html

# ROCm 5.7 PyTorch + TensorFlow + DeepSpeed image, tagged with both the short
# git hash and VERSION.
# NOTE(review): TORCH_PIP is set from TORCH_PIP_DEEPSPEED_GPU, which lists
# cu113 (CUDA) wheels, and TORCH_CUDA_ARCH_LIST is passed to a ROCm base
# image — confirm how Dockerfile-default-rocm treats these arguments.
export ROCM57_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED := $(ROCM_57_PREFIX)pytorch-2.0-tf-2.10-rocm-deepspeed

.PHONY: build-pytorch20-tf210-rocm57-deepspeed
build-pytorch20-tf210-rocm57-deepspeed:
	docker build --shm-size='1gb' -f Dockerfile-default-rocm \
		--build-arg BASE_IMAGE="rocm/pytorch:rocm5.7_ubuntu20.04_py3.9_pytorch_2.1.1" \
		--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
		--build-arg HOROVOD_PIP="horovod==0.28.1" \
		--build-arg TORCH_PIP="$(TORCH_PIP_DEEPSPEED_GPU)" \
		--build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \
		--build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0" \
		--build-arg APEX_GIT="https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \
		--build-arg DEEPSPEED_PIP="deepspeed==$(DEEPSPEED_VERSION)" \
		--build-arg WITH_MPICH=$(WITH_MPICH) \
		-t $(DOCKERHUB_REGISTRY)/$(ROCM57_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(SHORT_GIT_HASH) \
		-t $(DOCKERHUB_REGISTRY)/$(ROCM57_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(VERSION) \
		.

# ROCm 6.1 PyTorch + TensorFlow + DeepSpeed image (Horovod passed as "0"),
# tagged with both the short git hash and VERSION.
# NOTE(review): DEEPSPEED_PIP uses DEEPSPEED_VERSION rather than
# ROCM_DEEPSPEED_VERSION, and TORCH_PIP is set from TORCH_PIP_DEEPSPEED_GPU,
# which lists cu113 (CUDA) wheels — confirm both are intended for a ROCm image.
export ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED := $(ROCM_61_PREFIX)pytorch-2.0-tf-2.10-rocm-deepspeed

.PHONY: build-pytorch20-tf210-rocm61-deepspeed
build-pytorch20-tf210-rocm61-deepspeed:
	docker build --shm-size='1gb' -f Dockerfile-default-rocm \
		--build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \
		--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
		--build-arg HOROVOD_PIP="0" \
		--build-arg TORCH_PIP="$(TORCH_PIP_DEEPSPEED_GPU)" \
		--build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \
		--build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0" \
		--build-arg APEX_GIT="https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \
		--build-arg DEEPSPEED_PIP="deepspeed==$(DEEPSPEED_VERSION)" \
		--build-arg WITH_MPICH=$(WITH_MPICH) \
		-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(SHORT_GIT_HASH) \
		-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(VERSION) \
		.



DEEPSPEED_VERSION := 0.8.3
export GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME := deepspeed-cuda-gpt-neox
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.33.1
0.35.1
16 changes: 11 additions & 5 deletions dockerfile_scripts/additional-requirements-rocm.txt
Original file line number Diff line number Diff line change
@@ -1,16 +1,22 @@
attrdict3
pandas
matplotlib
tensorflow-datasets==1.3.2
Keras-Preprocessing[image]
# TODO(DET-4259) Remove this when we fix the circular dependency with the main repo.
petname
azure-storage-blob
Pillow>=8.3.2,<=9.5.0
analytics-python
nvidia-ml-py
# google-api-python-client -> google-api-core -> googleapis-common-protos -> protobuf
# Horovod cannot build with protobuf > 3.20.x
# latest google-api-python-client requires protobuf >= 3.20.1
protobuf<=3.20.3
tensorboard==2.10.1
pynvml
tokenizers==0.13.0
huggingface-hub==0.16.4
huggingface-hub==0.16.4
# necessary for benchmarks, but really should go into startup-hook.sh for that workflow
accelerate>=0.12.0
datasets
sentencepiece
evaluate
scikit-learn
transformers
19 changes: 19 additions & 0 deletions dockerfile_scripts/install_deepspeed_rocm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env bash
#
# Build and install DeepSpeed from source with its ops precompiled.
#
# Environment:
#   DEEPSPEED_PIP  pip requirement to install (required, must be non-empty).

# Stop on the first failing command, on use of an unset variable, and on a
# failure anywhere in a pipeline.
set -euo pipefail

# Fail early with a clear message instead of letting pip run without a requirement.
: "${DEEPSPEED_PIP:?DEEPSPEED_PIP must be set to a pip requirement such as deepspeed==<version>}"

apt-get update -y && DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev

# Older versions of deepspeed require pinned pydantic version
python -m pip install --no-cache-dir pydantic==1.10.11 ninja cmake

# Precompile DeepSpeed ops at install time (DS_BUILD_OPS=1), opting out of
# async I/O, sparse attention, Evoformer attention, CUTLASS ops and the CCL
# communication backend.
export DS_BUILD_OPS=1
export DS_BUILD_AIO=0
export DS_BUILD_SPARSE_ATTN=0
export DS_BUILD_EVOFORMER_ATTN=0
export DS_BUILD_CUTLASS_OPS=0
export DS_BUILD_CCL_COMM=0

# --no-binary forces a source build so the DS_BUILD_* settings take effect.
# DEEPSPEED_PIP is left unquoted on purpose so a value carrying extra pip
# arguments is split into separate words.
# shellcheck disable=SC2086
python -m pip install --no-cache-dir $DEEPSPEED_PIP --no-binary deepspeed
# Print DeepSpeed's environment report; with `set -e` a failure here fails the build.
python -m deepspeed.env_report

0 comments on commit 9179301

Please sign in to comment.