From 917930157b75523ffec7df094ed4818586d392f6 Mon Sep 17 00:00:00 2001
From: Michael Kardash
Date: Tue, 13 Aug 2024 12:31:10 -0700
Subject: [PATCH] feat: Rocm dockerfile (#275)

* feat: add infinityhub dockerfiles

* remove duplicate rocm

* move deepspeed rocm install to separate shell script

---------

Co-authored-by: root
---
 .github/CODEOWNERS                            |   4 +
 Dockerfile-infinityhub-hpc                    |  80 ++++++++++
 Dockerfile-infinityhub-pytorch                |  42 +++++
 Makefile                                      | 146 +++++++++++++++---
 VERSION                                       |   2 +-
 .../additional-requirements-rocm.txt          |  16 +-
 dockerfile_scripts/install_deepspeed_rocm.sh  |  19 +++
 7 files changed, 280 insertions(+), 29 deletions(-)
 create mode 100644 Dockerfile-infinityhub-hpc
 create mode 100644 Dockerfile-infinityhub-pytorch
 create mode 100755 dockerfile_scripts/install_deepspeed_rocm.sh

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 2594610d..1a3759fa 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -4,3 +4,7 @@
 # CI and Cloud-specific images are infra
 /.circleci @determined-ai/infrastructure
 /cloud @determined-ai/infrastructure
+
+# Team-specific files
+Dockerfile-infinityhub-pytorch @determined-ai/pad
+Dockerfile-infinityhub-hpc @determined-ai/pad
diff --git a/Dockerfile-infinityhub-hpc b/Dockerfile-infinityhub-hpc
new file mode 100644
index 00000000..ce6cf0f4
--- /dev/null
+++ b/Dockerfile-infinityhub-hpc
@@ -0,0 +1,80 @@
+ARG BASE_IMAGE
+FROM ${BASE_IMAGE}
+
+# -y is required: the image build has no TTY to answer apt's confirmation prompt.
+RUN apt-get install -y rocm-libs
+RUN apt-get remove -y openmpi ucx
+# Remove the existing /opt/ompi and link it to our build in /container/ompi.
+RUN rm -rf /opt/ompi
+RUN ln -s /container/ompi /opt
+COPY dockerfile_scripts /tmp/det_dockerfile_scripts
+
+ARG WITH_MPI=1
+ARG WITH_OFI=1
+ARG WITH_MPICH
+ARG UCX_INSTALL_DIR=/container/ucx
+ARG OMPI_INSTALL_DIR=/container/ompi
+ARG MPICH_INSTALL_DIR=/container/mpich
+ARG OFI_INSTALL_DIR=/container/ofi
+ARG OMPI_WITH_ROCM=1
+RUN if [ "$WITH_MPI" = "1" ]; then /tmp/det_dockerfile_scripts/ompi_rocm.sh "$UBUNTU_VERSION" "$WITH_OFI" "$OMPI_WITH_ROCM" "$WITH_MPICH"; fi
+
+# Make sure OMPI/UCX show up in the right paths
+ARG VERBS_LIB_DIR=/usr/lib/libibverbs
+ARG UCX_LIB_DIR=${UCX_INSTALL_DIR}/lib:${UCX_INSTALL_DIR}/lib64
+ARG UCX_PATH_DIR=${UCX_INSTALL_DIR}/bin
+ARG OFI_LIB_DIR=${OFI_INSTALL_DIR}/lib:${OFI_INSTALL_DIR}/lib64
+ARG OFI_PATH_DIR=${OFI_INSTALL_DIR}/bin
+ARG OMPI_LIB_DIR=${OMPI_INSTALL_DIR}/lib
+ARG OMPI_PATH_DIR=${OMPI_INSTALL_DIR}/bin
+ARG MPICH_LIB_DIR=${MPICH_INSTALL_DIR}/lib
+ARG MPICH_PATH_DIR=${MPICH_INSTALL_DIR}/bin
+
+# Set up UCX_LIBS and OFI_LIBS
+ENV UCX_LIBS="${VERBS_LIB_DIR}:${UCX_LIB_DIR}:${OMPI_LIB_DIR}:"
+ENV OFI_LIBS="${VERBS_LIB_DIR}:${OFI_LIB_DIR}:${MPICH_LIB_DIR}:"
+
+# If WITH_OFI is non-empty, set EXTRA_LIBS to the OFI libs, else to the empty string
+ENV EXTRA_LIBS="${WITH_OFI:+${OFI_LIBS}}"
+
+# If EXTRA_LIBS is empty, set it to the UCX libs, else leave it as the OFI libs
+ENV EXTRA_LIBS="${EXTRA_LIBS:-${UCX_LIBS}}"
+
+# But only prepend them to LD_LIBRARY_PATH if WITH_MPI is non-empty
+ENV LD_LIBRARY_PATH=${WITH_MPI:+$EXTRA_LIBS}$LD_LIBRARY_PATH
+
+# Using OFI
+ENV PATH=${WITH_OFI:+$PATH:${WITH_MPI:+$OFI_PATH_DIR:$MPICH_PATH_DIR}}
+
+# Using UCX
+ENV PATH=${PATH:-$CONDA:${WITH_MPI:+$UCX_PATH_DIR:$OMPI_PATH_DIR}}
+
+ENV PATH=$OMPI_PATH_DIR:$OFI_INSTALL_DIR:$PATH
+
+# Enable running OMPI as root (key=value form; the space-separated ENV form is deprecated)
+ENV OMPI_ALLOW_RUN_AS_ROOT=${WITH_MPI:+1}
+ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=${WITH_MPI:+1}
+
+
+ARG AWS_PLUGIN_INSTALL_DIR=/container/aws
+ARG WITH_AWS_TRACE
+ARG INTERNAL_AWS_DS
+ARG INTERNAL_AWS_PATH
+ARG ROCM_DIR=/opt/rocm
+ENV ROCM_DIR=$ROCM_DIR
+RUN if [ "$WITH_OFI" = "1" ]; then /tmp/det_dockerfile_scripts/build_aws_rocm.sh "$WITH_OFI" "$WITH_AWS_TRACE" "$WITH_MPICH"; fi
+ENV LD_LIBRARY_PATH=${WITH_OFI:+$AWS_PLUGIN_INSTALL_DIR:}$LD_LIBRARY_PATH
+
+# Set an entrypoint that can scrape up the host libfabric.so and then
+# run the user command. This is intended to enable performant execution
+# on non-IB systems that have a proprietary libfabric.
+
+ARG WITH_RCCL=1
+ENV WITH_RCCL=$WITH_RCCL
+ARG WITH_NFS_WORKAROUND=1
+ENV WITH_NFS_WORKAROUND=$WITH_NFS_WORKAROUND
+
+RUN mkdir -p /container/bin && cp /tmp/det_dockerfile_scripts/scrape_libs.sh /container/bin
+ENTRYPOINT ["/container/bin/scrape_libs.sh"]
+
+RUN rm -r /tmp/*
diff --git a/Dockerfile-infinityhub-pytorch b/Dockerfile-infinityhub-pytorch
new file mode 100644
index 00000000..7d1f22d8
--- /dev/null
+++ b/Dockerfile-infinityhub-pytorch
@@ -0,0 +1,42 @@
+ARG BASE_IMAGE
+FROM ${BASE_IMAGE}
+ENV PYTHONUNBUFFERED=1 PYTHONFAULTHANDLER=1 PYTHONHASHSEED=0
+ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+
+RUN mkdir -p /var/run/sshd
+RUN rm -f /etc/apt/sources.list.d/rocm.list
+RUN pip install --upgrade pip
+
+COPY dockerfile_scripts /tmp/det_dockerfile_scripts
+
+RUN /tmp/det_dockerfile_scripts/install_deb_packages.sh
+
+
+# Install fixed version of FFI package for Ubuntu 20.04.
+# This is done after above stuff to make sure we get right version.
+RUN /tmp/det_dockerfile_scripts/install_package_fixes.sh
+RUN /tmp/det_dockerfile_scripts/add_det_nobody_user.sh
+RUN /tmp/det_dockerfile_scripts/install_libnss_determined.sh
+
+
+RUN python -m pip install determined && pip uninstall -y determined
+
+RUN python -m pip install -r /tmp/det_dockerfile_scripts/additional-requirements-rocm.txt
+
+RUN python -m pip install -r /tmp/det_dockerfile_scripts/notebook-requirements.txt && \
+    jupyter labextension disable "@jupyterlab/apputils-extension:announcements"
+
+ENV JUPYTER_CONFIG_DIR=/run/determined/jupyter/config
+ENV JUPYTER_DATA_DIR=/run/determined/jupyter/data
+ENV JUPYTER_RUNTIME_DIR=/run/determined/jupyter/runtime
+
+ARG DEEPSPEED_PIP
+RUN if [ -n "$DEEPSPEED_PIP" ]; then /tmp/det_dockerfile_scripts/install_deepspeed_rocm.sh; fi
+
+# MIOPEN_DEBUG_SAVE_TEMP_DIR is required
+ENV MIOPEN_DEBUG_SAVE_TEMP_DIR=1
+
+CMD ["/bin/bash"]
+USER root
+
+RUN rm -r /tmp/*
diff --git a/Makefile b/Makefile
index 3ec47d21..2ce61aee 100644
--- a/Makefile
+++ b/Makefile
@@ -13,6 +13,11 @@ CPU_PREFIX_310 := $(REGISTRY_REPO):py-3.10-
 CUDA_113_PREFIX := $(REGISTRY_REPO):cuda-11.3-
 CUDA_118_PREFIX := $(REGISTRY_REPO):cuda-11.8-
 ROCM_56_PREFIX := $(REGISTRY_REPO):rocm-5.6-
+ROCM_57_PREFIX := $(REGISTRY_REPO):rocm-5.7-
+ROCM_60_PREFIX := $(REGISTRY_REPO):rocm-6.0-
+ROCM_61_PREFIX := $(REGISTRY_REPO):rocm-6.1-
+ROCM_60_TF_PREFIX := tensorflow-infinity-hub:tensorflow-infinity-hub
+
 
 CPU_SUFFIX := -cpu
 CUDA_SUFFIX := -cuda
@@ -140,6 +145,9 @@ NGC_PYTORCH_HPC_REPO := pytorch-ngc-hpc-dev
 NGC_TF_REPO := tensorflow-ngc-dev
 NGC_TF_HPC_REPO := tensorflow-ngc-hpc-dev
 
+INFINITYHUB_PYTORCH_REPO := pytorch-infinityhub-dev
+INFINITYHUB_PYTORCH_HPC_REPO := pytorch-infinityhub-hpc-dev
+
 # build hpc together since hpc is dependent on the normal build
 .PHONY: build-pytorch-ngc
 build-pytorch-ngc:
@@ -163,39 +171,131 @@ build-tensorflow-ngc:
 		-t $(DOCKERHUB_REGISTRY)/$(NGC_TF_HPC_REPO):$(SHORT_GIT_HASH) \
 		.
 
+
+ROCM_DEEPSPEED_VERSION := 0.14.4
+WITH_MPICH=1
+export ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED := $(ROCM_61_PREFIX)pytorch-2.0-tf-2.10-rocm-deepspeed
+.PHONY: build-pytorch-infinityhub
+build-pytorch-infinityhub:
+	docker build --shm-size='1gb' -f Dockerfile-infinityhub-pytorch \
+		--build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \
+		--build-arg DEEPSPEED_PIP="deepspeed==$(ROCM_DEEPSPEED_VERSION)" \
+		-t $(DOCKERHUB_REGISTRY)/$(INFINITYHUB_PYTORCH_REPO):$(SHORT_GIT_HASH) \
+		.
+	docker build --shm-size='1gb' -f Dockerfile-infinityhub-hpc \
+		--build-arg BASE_IMAGE=$(DOCKERHUB_REGISTRY)/$(INFINITYHUB_PYTORCH_REPO):$(SHORT_GIT_HASH) \
+		--build-arg WITH_MPICH=$(WITH_MPICH) \
+		-t $(DOCKERHUB_REGISTRY)/$(INFINITYHUB_PYTORCH_HPC_REPO):$(SHORT_GIT_HASH) \
+		.
+
+
 ifeq ($(WITH_MPICH),1)
-ROCM56_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-mpich
+ROCM61_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-mpich
 else
-ROCM56_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-ompi
+ROCM61_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-ompi
 endif
-export ROCM56_TORCH13_TF_ENVIRONMENT_NAME := $(ROCM_56_PREFIX)$(ROCM56_TORCH13_MPI)
-.PHONY: build-pytorch13-tf210-rocm56
-build-pytorch13-tf210-rocm56:
+export ROCM61_TORCH13_TF_ENVIRONMENT_NAME := $(ROCM_60_PREFIX)$(ROCM61_TORCH13_MPI)
+.PHONY: build-pytorch13-tf210-rocm61
+build-pytorch13-tf210-rocm61:
 	docker build -f Dockerfile-default-rocm \
-		--build-arg BASE_IMAGE="rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_1.13.1"\
-		--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
-		--build-arg HOROVOD_PIP="horovod==0.28.1" \
-		--build-arg WITH_MPICH=$(WITH_MPICH) \
-		-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH13_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
-		-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH13_TF_ENVIRONMENT_NAME)-$(VERSION) \
-		.
+		--build-arg BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_1.13.1" \
+		--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
+		--build-arg HOROVOD_PIP="horovod==0.28.1" \
+		--build-arg WITH_MPICH=$(WITH_MPICH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH13_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH13_TF_ENVIRONMENT_NAME)-$(VERSION) \
+		.
+
+
 
 ifeq ($(WITH_MPICH),1)
-ROCM56_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-mpich
+ROCM61_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-mpich
 else
-ROCM56_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-ompi
+ROCM61_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-ompi
 endif
-export ROCM56_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_56_PREFIX)$(ROCM56_TORCH_MPI)
-.PHONY: build-pytorch20-tf210-rocm56
-build-pytorch20-tf210-rocm56:
+export ROCM61_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_61_PREFIX)$(ROCM61_TORCH_MPI)
+.PHONY: build-pytorch20-tf210-rocm61
+build-pytorch20-tf210-rocm61:
 	docker build -f Dockerfile-default-rocm \
-		--build-arg BASE_IMAGE="rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_2.0.1" \
-		--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
-		--build-arg HOROVOD_PIP="horovod==0.28.1" \
+		--build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \
+		--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
+		--build-arg HOROVOD_PIP="0" \
 		--build-arg WITH_MPICH=$(WITH_MPICH) \
-		-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
-		-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \
-		.
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \
+		.
+
+ifeq ($(WITH_MPICH),1)
+ROCM61_TORCH_MPI :=pytorch-3.10-rocm-mpich
+else
+ROCM61_TORCH_MPI :=pytorch-3.10-rocm-ompi
+endif
+export ROCM61_TORCH_ENVIRONMENT_NAME := $(ROCM_61_PREFIX)$(ROCM61_TORCH_MPI)
+.PHONY: build-pytorch20-rocm61
+build-pytorch20-rocm61:
+	docker build -f Dockerfile-default-rocm \
+		--build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \
+		--build-arg TENSORFLOW_PIP="0" \
+		--build-arg HOROVOD_PIP="0" \
+		--build-arg WITH_MPICH=$(WITH_MPICH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_ENVIRONMENT_NAME)-$(VERSION) \
+		.
+
+
+# This change defines ROCM_60_TF_PREFIX but no ROCM_61_TF_PREFIX; an undefined prefix expands to an empty image name.
+.PHONY: build-tf210-rocm61
+export ROCM61_TF_ENVIRONMENT_NAME := $(ROCM_60_TF_PREFIX)
+build-tf210-rocm61:
+	docker build -f Dockerfile-tensorflow-rocm \
+		--build-arg BASE_IMAGE="rocm/tensorflow:rocm6.1-py3.9-tf2.15-dev" \
+		--build-arg HOROVOD_PIP="0" \
+		--build-arg WITH_MPICH=$(WITH_MPICH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TF_ENVIRONMENT_NAME)-$(VERSION) \
+		.
+
+
+DEEPSPEED_VERSION := 0.8.3
+export GPU_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.10-deepspeed-$(DEEPSPEED_VERSION)$(GPU_SUFFIX)
+export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.10-gpt-neox-deepspeed$(GPU_SUFFIX)
+export TORCH_PIP_DEEPSPEED_GPU := torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio==0.10.2+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
+
+export ROCM57_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED := $(ROCM_57_PREFIX)pytorch-2.0-tf-2.10-rocm-deepspeed
+.PHONY: build-pytorch20-tf210-rocm57-deepspeed
+build-pytorch20-tf210-rocm57-deepspeed:
+	docker build --shm-size='1gb' -f Dockerfile-default-rocm \
+		--build-arg BASE_IMAGE="rocm/pytorch:rocm5.7_ubuntu20.04_py3.9_pytorch_2.1.1" \
+		--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
+		--build-arg HOROVOD_PIP="horovod==0.28.1" \
+		--build-arg TORCH_PIP="$(TORCH_PIP_DEEPSPEED_GPU)" \
+		--build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \
+		--build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0" \
+		--build-arg APEX_GIT="https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \
+		--build-arg DEEPSPEED_PIP="deepspeed==$(DEEPSPEED_VERSION)" \
+		--build-arg WITH_MPICH=$(WITH_MPICH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM57_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(SHORT_GIT_HASH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM57_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(VERSION) \
+		.
+
+export ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED := $(ROCM_61_PREFIX)pytorch-2.0-tf-2.10-rocm-deepspeed
+.PHONY: build-pytorch20-tf210-rocm61-deepspeed
+build-pytorch20-tf210-rocm61-deepspeed:
+	docker build --shm-size='1gb' -f Dockerfile-default-rocm \
+		--build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \
+		--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
+		--build-arg HOROVOD_PIP="0" \
+		--build-arg TORCH_PIP="$(TORCH_PIP_DEEPSPEED_GPU)" \
+		--build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \
+		--build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0" \
+		--build-arg APEX_GIT="https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \
+		--build-arg DEEPSPEED_PIP="deepspeed==$(DEEPSPEED_VERSION)" \
+		--build-arg WITH_MPICH=$(WITH_MPICH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(SHORT_GIT_HASH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(VERSION) \
+		.
+
+
 
 DEEPSPEED_VERSION := 0.8.3
 export GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME := deepspeed-cuda-gpt-neox
diff --git a/VERSION b/VERSION
index 8df3f459..731b95d7 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.33.1
+0.35.1
diff --git a/dockerfile_scripts/additional-requirements-rocm.txt b/dockerfile_scripts/additional-requirements-rocm.txt
index 27585b8e..95207da6 100644
--- a/dockerfile_scripts/additional-requirements-rocm.txt
+++ b/dockerfile_scripts/additional-requirements-rocm.txt
@@ -1,16 +1,22 @@
 attrdict3
 pandas
 matplotlib
-tensorflow-datasets==1.3.2
-Keras-Preprocessing[image]
 # TODO(DET-4259) Remove this when we fix the circular dependency with the main repo.
petname azure-storage-blob Pillow>=8.3.2,<=9.5.0 analytics-python -nvidia-ml-py +# google-api-python-client -> google-api-core -> googleapis-common-protos -> protobuf +# Horovod cannot build with protobuf > 3.20.x +# latest google-api-python-client requires protobuf >= 3.20.1 protobuf<=3.20.3 tensorboard==2.10.1 -pynvml tokenizers==0.13.0 -huggingface-hub==0.16.4 +huggingface-hub==0.16.4 +# necessary for benchmarks, but really should go into startup-hook.sh for that workflow +accelerate>=0.12.0 +datasets +sentencepiece +evaluate +scikit-learn +transformers diff --git a/dockerfile_scripts/install_deepspeed_rocm.sh b/dockerfile_scripts/install_deepspeed_rocm.sh new file mode 100755 index 00000000..6ddbf1ec --- /dev/null +++ b/dockerfile_scripts/install_deepspeed_rocm.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +set -e + +apt-get update -y && DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev + +#Older versions of deepspeed require pinned pydantic version +python -m pip install pydantic==1.10.11 ninja cmake + +#Precompile supported deepspeed ops except sparse_attn +export DS_BUILD_OPS=1 +export DS_BUILD_AIO=0 +export DS_BUILD_SPARSE_ATTN=0 +export DS_BUILD_EVOFORMER_ATTN=0 +export DS_BUILD_CUTLASS_OPS=0 +export DS_BUILD_CCL_COMM=0 + +python -m pip install $DEEPSPEED_PIP --no-binary deepspeed +python -m deepspeed.env_report