-
Notifications
You must be signed in to change notification settings - Fork 40
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* feat: add infinityhub dockerfiles * remove duplicate rocm * move deepspeed rocm install to separate shell script --------- Co-authored-by: root <[email protected]>
- Loading branch information
1 parent
71c41c7
commit 9179301
Showing
7 changed files
with
280 additions
and
29 deletions.
There are no files selected for viewing
Validating CODEOWNERS rules …
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
# BASE_IMAGE has no default here, so it must be supplied with --build-arg. | ||
ARG BASE_IMAGE | ||
FROM ${BASE_IMAGE} | ||
|
||
|
||
# Refresh the package index and pass -y so the install cannot stop at a | ||
# confirmation prompt in a non-interactive build. apt-get is used because | ||
# apt's command-line interface is not intended for scripts. | ||
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y rocm-libs | ||
RUN apt-get remove -y openmpi ucx | ||
# Replace any existing /opt/ompi with a symlink to /container/ompi. | ||
RUN rm -rf /opt/ompi && ln -s /container/ompi /opt/ompi | ||
# Helper scripts used by the RUN steps that follow. | ||
COPY dockerfile_scripts /tmp/det_dockerfile_scripts | ||
|
||
# Build-time switches and install prefixes for the MPI/OFI/UCX stack. | ||
# WITH_MPICH has no default, so it is empty unless passed with --build-arg. | ||
ARG WITH_MPI=1 | ||
ARG WITH_OFI=1 | ||
ARG WITH_MPICH | ||
ARG UCX_INSTALL_DIR=/container/ucx | ||
ARG OMPI_INSTALL_DIR=/container/ompi | ||
ARG MPICH_INSTALL_DIR=/container/mpich | ||
ARG OFI_INSTALL_DIR=/container/ofi | ||
ARG OMPI_WITH_ROCM=1 | ||
# ompi_rocm.sh runs only when WITH_MPI is exactly "1". | ||
# NOTE(review): UBUNTU_VERSION is not declared as an ARG in this file; presumably it is exported by the base image -- confirm. | ||
RUN if [ "$WITH_MPI" = "1" ]; then /tmp/det_dockerfile_scripts/ompi_rocm.sh "$UBUNTU_VERSION" "$WITH_OFI" "$OMPI_WITH_ROCM" "$WITH_MPICH"; fi | ||
|
||
# Make sure OMPI/UCX show up in the right paths. | ||
# These ARGs derive lib and bin directories from the install prefixes declared above. | ||
ARG VERBS_LIB_DIR=/usr/lib/libibverbs | ||
ARG UCX_LIB_DIR=${UCX_INSTALL_DIR}/lib:${UCX_INSTALL_DIR}/lib64 | ||
ARG UCX_PATH_DIR=${UCX_INSTALL_DIR}/bin | ||
ARG OFI_LIB_DIR=${OFI_INSTALL_DIR}/lib:${OFI_INSTALL_DIR}/lib64 | ||
ARG OFI_PATH_DIR=${OFI_INSTALL_DIR}/bin | ||
ARG OMPI_LIB_DIR=${OMPI_INSTALL_DIR}/lib | ||
ARG OMPI_PATH_DIR=${OMPI_INSTALL_DIR}/bin | ||
ARG MPICH_LIB_DIR=${MPICH_INSTALL_DIR}/lib | ||
ARG MPICH_PATH_DIR=${MPICH_INSTALL_DIR}/bin | ||
|
||
# Set up UCX_LIBS and OFI_LIBS. Both end with ":" so they can be prefixed | ||
# directly onto an existing LD_LIBRARY_PATH. | ||
ENV UCX_LIBS="${VERBS_LIB_DIR}:${UCX_LIB_DIR}:${OMPI_LIB_DIR}:" | ||
ENV OFI_LIBS="${VERBS_LIB_DIR}:${OFI_LIB_DIR}:${MPICH_LIB_DIR}:" | ||
|
||
# If WITH_OFI is non-empty, set EXTRA_LIBS to the OFI libs, else to the empty string. | ||
# The ":+" form tests for a non-empty value, so WITH_OFI=0 also selects OFI here. | ||
ENV EXTRA_LIBS="${WITH_OFI:+${OFI_LIBS}}" | ||
|
||
# If EXTRA_LIBS is empty, set to UCX libs, else leave as OFI libs | ||
ENV EXTRA_LIBS="${EXTRA_LIBS:-${UCX_LIBS}}" | ||
|
||
# But, only add them if WITH_MPI is non-empty | ||
ENV LD_LIBRARY_PATH=${WITH_MPI:+$EXTRA_LIBS}$LD_LIBRARY_PATH | ||
|
||
# USING OFI: when WITH_OFI is non-empty, keep PATH and append the OFI and MPICH | ||
# bin dirs (those only if WITH_MPI is non-empty). When WITH_OFI is empty this | ||
# expands to an empty PATH, which the next instruction then replaces. | ||
ENV PATH=${WITH_OFI:+$PATH:${WITH_MPI:+$OFI_PATH_DIR:$MPICH_PATH_DIR}} | ||
|
||
# USING UCX: only takes effect when PATH is empty at this point; PATH then | ||
# becomes $CONDA plus, if WITH_MPI is non-empty, the UCX and OMPI bin dirs. | ||
# NOTE(review): CONDA is not defined anywhere in this file -- presumably it is | ||
# inherited from the base image; confirm. The previous PATH is not kept on this branch. | ||
ENV PATH=${PATH:-$CONDA:${WITH_MPI:+$UCX_PATH_DIR:$OMPI_PATH_DIR}} | ||
|
||
# Always prepend the OMPI bin dir and the OFI install prefix. | ||
# NOTE(review): this uses OFI_INSTALL_DIR (the prefix), not OFI_PATH_DIR (its bin/) -- confirm that is intended. | ||
ENV PATH=$OMPI_PATH_DIR:$OFI_INSTALL_DIR:$PATH | ||
|
||
# Enable running OMPI as root: both variables are set to 1 when WITH_MPI is | ||
# non-empty, and to the empty string otherwise. | ||
ENV OMPI_ALLOW_RUN_AS_ROOT=${WITH_MPI:+1} | ||
ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=${WITH_MPI:+1} | ||
|
||
|
||
# Build arguments for the AWS plugin build step. | ||
ARG AWS_PLUGIN_INSTALL_DIR=/container/aws | ||
ARG WITH_AWS_TRACE | ||
# NOTE(review): INTERNAL_AWS_DS and INTERNAL_AWS_PATH are not passed on the command line below; presumably build_aws_rocm.sh reads them from the environment -- confirm. | ||
ARG INTERNAL_AWS_DS | ||
ARG INTERNAL_AWS_PATH | ||
ARG ROCM_DIR=/opt/rocm | ||
# Persist ROCM_DIR into the image environment (key=value form; the space-separated ENV form is legacy). | ||
ENV ROCM_DIR=$ROCM_DIR | ||
# build_aws_rocm.sh runs only when WITH_OFI is exactly "1". | ||
RUN if [ "$WITH_OFI" = "1" ]; then /tmp/det_dockerfile_scripts/build_aws_rocm.sh "$WITH_OFI" "$WITH_AWS_TRACE" "$WITH_MPICH"; fi | ||
# Prepend the plugin install dir to LD_LIBRARY_PATH whenever WITH_OFI is non-empty. | ||
ENV LD_LIBRARY_PATH=${WITH_OFI:+$AWS_PLUGIN_INSTALL_DIR:}$LD_LIBRARY_PATH | ||
|
||
# Set an entrypoint that can scrape up the host libfabric.so and then | ||
# run the user command. This is intended to enable performant execution | ||
# on non-IB systems that have a proprietary libfabric. | ||
|
||
# Both switches default to 1 and are copied into the runtime environment. | ||
# NOTE(review): presumably they are read by scrape_libs.sh at container start -- confirm. | ||
ARG WITH_RCCL=1 | ||
ENV WITH_RCCL=$WITH_RCCL | ||
ARG WITH_NFS_WORKAROUND=1 | ||
ENV WITH_NFS_WORKAROUND=$WITH_NFS_WORKAROUND | ||
|
||
# Copy the entrypoint script out of /tmp before /tmp is emptied below. | ||
RUN mkdir -p /container/bin && cp /tmp/det_dockerfile_scripts/scrape_libs.sh /container/bin | ||
ENTRYPOINT ["/container/bin/scrape_libs.sh"] | ||
|
||
# Empty /tmp, including the copied helper scripts. The files still exist in the | ||
# earlier COPY layer, so this hides them but does not reduce the image size. | ||
RUN rm -r /tmp/* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# BASE_IMAGE has no default here, so it must be supplied with --build-arg. | ||
ARG BASE_IMAGE | ||
FROM ${BASE_IMAGE} | ||
# Python runtime settings baked into the image environment. | ||
ENV PYTHONUNBUFFERED=1 PYTHONFAULTHANDLER=1 PYTHONHASHSEED=0 | ||
ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python | ||
|
||
RUN mkdir -p /var/run/sshd | ||
# NOTE(review): rm is called without -f, so the build fails if the base image does not ship this file -- confirm every base image has it. | ||
RUN rm /etc/apt/sources.list.d/rocm.list | ||
RUN pip install --upgrade pip | ||
|
||
# Helper scripts and requirements files used by the RUN steps that follow. | ||
COPY dockerfile_scripts /tmp/det_dockerfile_scripts | ||
|
||
RUN /tmp/det_dockerfile_scripts/install_deb_packages.sh | ||
|
||
|
||
# Install fixed version of FFI package for Ubuntu 20.04. | ||
# This is done after above stuff to make sure we get right version. | ||
RUN /tmp/det_dockerfile_scripts/install_package_fixes.sh | ||
RUN /tmp/det_dockerfile_scripts/add_det_nobody_user.sh | ||
RUN /tmp/det_dockerfile_scripts/install_libnss_determined.sh | ||
|
||
|
||
# Install determined and then uninstall it again; pip leaves the packages that | ||
# were pulled in as its dependencies installed. Both steps go through | ||
# "python -m pip" so they act on the same interpreter, and --no-cache-dir | ||
# keeps the pip download cache out of the image layers. | ||
RUN python -m pip install --no-cache-dir determined && python -m pip uninstall -y determined | ||
|
||
RUN python -m pip install --no-cache-dir -r /tmp/det_dockerfile_scripts/additional-requirements-rocm.txt | ||
|
||
# Install the notebook requirements, then disable the JupyterLab announcements extension. | ||
RUN python -m pip install --no-cache-dir -r /tmp/det_dockerfile_scripts/notebook-requirements.txt && \ | ||
jupyter labextension disable "@jupyterlab/apputils-extension:announcements" | ||
|
||
# Point Jupyter's config, data and runtime directories at /run/determined/jupyter. | ||
ENV JUPYTER_CONFIG_DIR=/run/determined/jupyter/config | ||
ENV JUPYTER_DATA_DIR=/run/determined/jupyter/data | ||
ENV JUPYTER_RUNTIME_DIR=/run/determined/jupyter/runtime | ||
|
||
# DeepSpeed is optional: the install script runs only when DEEPSPEED_PIP is | ||
# non-empty. The script receives the value through the build-arg environment. | ||
ARG DEEPSPEED_PIP | ||
RUN if [ -n "$DEEPSPEED_PIP" ]; then /tmp/det_dockerfile_scripts/install_deepspeed_rocm.sh; fi | ||
|
||
# MIOPEN_DEBUG_SAVE_TEMP_DIR is required | ||
# NOTE(review): the reason it is required is not recorded here -- add it if known. | ||
ENV MIOPEN_DEBUG_SAVE_TEMP_DIR=1 | ||
|
||
# Default to an interactive shell; the image is left running as root. | ||
CMD ["/bin/bash"] | ||
USER root | ||
|
||
# Empty /tmp, including the copied helper scripts. The files still exist in the | ||
# earlier COPY layer, so this hides them but does not reduce the image size. | ||
RUN rm -r /tmp/* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
0.33.1 | ||
0.35.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,16 +1,22 @@ | ||
attrdict3 | ||
pandas | ||
matplotlib | ||
tensorflow-datasets==1.3.2 | ||
Keras-Preprocessing[image] | ||
# TODO(DET-4259) Remove this when we fix the circular dependency with the main repo. | ||
petname | ||
azure-storage-blob | ||
Pillow>=8.3.2,<=9.5.0 | ||
analytics-python | ||
nvidia-ml-py | ||
# google-api-python-client -> google-api-core -> googleapis-common-protos -> protobuf | ||
# Horovod cannot build with protobuf > 3.20.x | ||
# latest google-api-python-client requires protobuf >= 3.20.1 | ||
protobuf<=3.20.3 | ||
tensorboard==2.10.1 | ||
pynvml | ||
tokenizers==0.13.0 | ||
huggingface-hub==0.16.4 | ||
huggingface-hub==0.16.4 | ||
# necessary for benchmarks, but really should go into startup-hook.sh for that workflow | ||
accelerate>=0.12.0 | ||
datasets | ||
sentencepiece | ||
evaluate | ||
scikit-learn | ||
transformers |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
#!/usr/bin/env bash | ||
|
||
# Builds and installs DeepSpeed from source with its ops precompiled. | ||
# Input: DEEPSPEED_PIP (environment) -- the pip requirement for deepspeed. | ||
|
||
set -e | ||
|
||
# Fail early with a clear message instead of a generic pip usage error. | ||
: "${DEEPSPEED_PIP:?DEEPSPEED_PIP must be set to the pip requirement for deepspeed}" | ||
|
||
apt-get update -y && DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev | ||
|
||
#Older versions of deepspeed require pinned pydantic version | ||
python -m pip install --no-cache-dir pydantic==1.10.11 ninja cmake | ||
|
||
#Precompile deepspeed ops, with aio, sparse_attn, evoformer_attn, | ||
#cutlass_ops and ccl_comm switched off | ||
export DS_BUILD_OPS=1 | ||
export DS_BUILD_AIO=0 | ||
export DS_BUILD_SPARSE_ATTN=0 | ||
export DS_BUILD_EVOFORMER_ATTN=0 | ||
export DS_BUILD_CUTLASS_OPS=0 | ||
export DS_BUILD_CCL_COMM=0 | ||
|
||
#DEEPSPEED_PIP stays unquoted, as in the original, so a value holding several | ||
#pip arguments is still split into separate words | ||
python -m pip install --no-cache-dir $DEEPSPEED_PIP --no-binary deepspeed | ||
python -m deepspeed.env_report