# Dockerfile.gpu
# Forked from determined-ai/environments.
# CUDA selects the tag of the NVIDIA base image. It has no default, so it is
# expected to be passed with `--build-arg CUDA=<version>`.
ARG CUDA
FROM nvidia/cuda:${CUDA}-cudnn7-devel-ubuntu18.04

# Remove every apt source list dropped into sources.list.d by the base image.
# NOTE(review): presumably this keeps `apt-get update` from depending on the
# NVIDIA package repositories -- confirm.
RUN rm -f /etc/apt/sources.list.d/*

# Use a UTF-8 locale for everything that runs in the image.
ENV LANG=C.UTF-8 \
    LC_ALL=C.UTF-8

# Pre-create directories used later: the Determined conda.d directory and
# /var/run/sshd (presumably the runtime directory openssh-server expects).
RUN mkdir -p /etc/determined/conda.d /var/run/sshd
# System packages: build toolchain, Kerberos/SSL headers, NCCL, SSH client and
# server, and download tools. The list is kept in alphabetical order, one
# package per line, so that diffs stay small. Package lists are removed in the
# same layer to keep them out of the image.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        autoconf \
        automake \
        autotools-dev \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        g++-4.8 \
        git \
        krb5-user \
        libkrb5-dev \
        libnccl-dev \
        libnccl2 \
        libssl-dev \
        libtool \
        make \
        openssh-client \
        openssh-server \
        pkg-config \
        wget \
    && rm -rf /var/lib/apt/lists/*
# Miniconda installation.
#   CONDA_DIR       - installation prefix (also exported at runtime via ENV).
#   CONDA_INSTALLER - installer file name appended to CONDA_URL.
#   CONDA_MD5       - expected MD5 digest of the installer.
#   CONDA_URL       - base URL the installer is downloaded from.
ARG CONDA_DIR="/opt/conda"
ARG CONDA_INSTALLER="Miniconda3-4.7.10-Linux-x86_64.sh"
ARG CONDA_MD5="1c945f2b3335c7b2b15130b1b2dc5cf4"
ARG CONDA_URL="https://repo.continuum.io/miniconda"

# Put conda's bin directory first on PATH so `conda`, `python` and `pip`
# resolve to the conda installation in all later steps and at runtime.
ENV CONDA_DIR="${CONDA_DIR}" PATH="${CONDA_DIR}/bin:${PATH}"

# Download the installer, verify its checksum, run it in batch mode (-b),
# updating the existing prefix (-u), and delete the installer in the same layer.
# The checksum line uses the canonical "<digest>  <file>" form (two spaces)
# expected by `md5sum -c`, rather than relying on lenient single-space parsing.
RUN cd /tmp \
    && mkdir -p "${CONDA_DIR}" \
    && curl --retry 3 -fsSL -O "${CONDA_URL}/${CONDA_INSTALLER}" \
    && echo "${CONDA_MD5}  ${CONDA_INSTALLER}" | md5sum -c - \
    && bash "./${CONDA_INSTALLER}" -u -b -p "${CONDA_DIR}" \
    && rm -f "./${CONDA_INSTALLER}"
# Python runtime settings: unbuffered stdio, fault handler enabled, and a
# fixed hash seed.
ENV PYTHONUNBUFFERED=1 PYTHONFAULTHANDLER=1 PYTHONHASHSEED=0

# Switch the conda environment to Python 3.6. `-y` answers the confirmation
# prompt explicitly instead of relying on how conda behaves when stdin is
# closed, and `conda clean` drops downloaded package archives and index caches
# in the same layer so they do not end up in the image.
RUN conda install -y python=3.6 \
    && conda clean --all --yes
COPY profiler /opt/determined/profiler

# Stock pyflame writes stack traces only at the end of the sampling period.
# Patch pyflame to have a new option that periodically dumps stack traces to
# facilitate profiling long running programs.
#
# The source archive is downloaded to a file (not piped into tar) so that a
# failed download stops the build at the curl step, and the /build tree is
# removed in the same layer once `make install` has finished so the sources
# and object files are not shipped in the image.
RUN mkdir -p /build \
    && cd /build \
    && curl -fsSL -o pyflame-1.6.7.tar.gz https://github.com/uber/pyflame/archive/v1.6.7.tar.gz \
    && tar xzf pyflame-1.6.7.tar.gz \
    && cd /build/pyflame-1.6.7 \
    && patch -p1 < /opt/determined/profiler/pyflame-add-output-rate.patch \
    && patch -p1 < /opt/determined/profiler/pyflame-fix-for-conda.patch \
    && ./autogen.sh \
    && PY36_CFLAGS='-I/opt/conda/include/python3.6m' \
       PY36_LIBS='-L/opt/conda/lib -lpython3.6m' \
       ./configure \
    && make install \
    && cd / \
    && rm -rf /build
# Install the Google Cloud SDK from Google's apt repository.
#
# All steps run in a single layer so that:
#   * `apt-get update` and the `apt-get install` calls that depend on it can
#     never be split across a stale cached layer;
#   * the repository signing key is in place before the repository is added;
#   * the downloaded key file and the apt package lists are removed before the
#     layer is committed.
# The key is fetched to a file with `curl -f` instead of being piped into
# apt-key, so an HTTP or network failure aborts the build.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        apt-transport-https \
        ca-certificates \
        gnupg \
    && curl -fsSL -o /tmp/cloud.google.gpg.key https://packages.cloud.google.com/apt/doc/apt-key.gpg \
    && apt-key --keyring /usr/share/keyrings/cloud.google.gpg add /tmp/cloud.google.gpg.key \
    && rm -f /tmp/cloud.google.gpg.key \
    && echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \
        > /etc/apt/sources.list.d/google-cloud-sdk.list \
    && apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y google-cloud-sdk \
    && rm -rf /var/lib/apt/lists/*
# Optional deep-learning framework packages. Each build argument holds the pip
# requirement(s) to install; when an argument is empty or unset the
# corresponding install step is skipped.
ARG TENSORFLOW_PIP
ARG TORCH_PIP
ARG TORCHVISION_PIP
ARG TENSORPACK_PIP

# The *_PIP variables are expanded unquoted, so the shell splits them on
# whitespace (presumably so one argument can carry several requirements or
# extra pip options -- confirm). `--no-cache-dir` matches the other pip
# invocations in this file and keeps pip's download cache out of the image.
RUN if [ "$TENSORFLOW_PIP" ]; then ${CONDA_DIR}/bin/python3.6 -m pip install --no-cache-dir $TENSORFLOW_PIP; fi
RUN if [ "$TORCH_PIP" ]; then ${CONDA_DIR}/bin/python3.6 -m pip install --no-cache-dir $TORCH_PIP; fi
RUN if [ "$TORCHVISION_PIP" ]; then ${CONDA_DIR}/bin/python3.6 -m pip install --no-cache-dir $TORCHVISION_PIP; fi
RUN if [ "$TENSORPACK_PIP" ]; then ${CONDA_DIR}/bin/python3.6 -m pip install --no-cache-dir $TENSORPACK_PIP; fi

# GPUtil is always installed, regardless of which frameworks were selected.
RUN ${CONDA_DIR}/bin/python3.6 -m pip install --no-cache-dir GPUtil
# CUDA compute capabilities exposed to the build steps below.
# NOTE(review): presumably read by the apex extension build -- confirm.
ARG TORCH_CUDA_ARCH_LIST="3.7;6.0;6.1;6.2;7.0;7.5"

# When a PyTorch package was requested, build apex (Determined AI fork, pinned
# to a fixed commit) with its C++ and CUDA extensions enabled.
RUN if [ "$TORCH_PIP" ]; then \
        pip install --no-cache-dir \
            --global-option="--cpp_ext" \
            --global-option="--cuda_ext" \
            git+https://github.com/determined-ai/apex.git@37cdaf4ad57ab4e7dd9ef13dbed7b29aa939d061; \
    fi
# Horovod build configuration, passed to the build step below as environment
# variables: TensorFlow and PyTorch support on, MXNet and MPI off, NCCL used
# for GPU allreduce and linked statically from the NCCL tree built below.
ARG HOROVOD_WITH_TENSORFLOW=1
ARG HOROVOD_WITH_PYTORCH=1
ARG HOROVOD_WITHOUT_MXNET=1
ARG HOROVOD_GPU_ALLREDUCE=NCCL
ARG HOROVOD_WITHOUT_MPI=1
ARG HOROVOD_NCCL_HOME=/tmp/det_nccl/build
ARG HOROVOD_NCCL_LINK=STATIC

# Build the pinned NCCL fork, install the pinned Horovod fork against it, and
# delete the NCCL checkout in the same layer. The CUDA stub libraries are added
# to the linker cache only for the duration of the Horovod build; the plain
# `ldconfig` afterwards rebuilds the cache without them.
RUN git clone https://github.com/determined-ai/nccl.git /tmp/det_nccl \
    && git -C /tmp/det_nccl checkout -q 136b9edbaa8eb5af36db2e0d125fd3e83eec4017 \
    && make -C /tmp/det_nccl -j 4 \
    && ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs \
    && pip install --no-cache-dir git+https://github.com/determined-ai/horovod.git@b04a363dd53a890d38c8a6762628e14822cc0362 \
    && ldconfig \
    && rm -rf /tmp/det_nccl
# Install the Determined packages and then uninstall them again. This ensures
# that we can successfully install these packages into containers running as
# non-root: `pip` does not uninstall dependencies, so all of their dependencies
# remain installed.
#
# Both steps run in one layer so the removed packages are not preserved in an
# earlier image layer, and `--no-cache-dir` keeps pip's download cache out of
# the image.
RUN ${CONDA_DIR}/bin/python3.6 -m pip install --no-cache-dir determined determined-cli \
    && ${CONDA_DIR}/bin/python3.6 -m pip uninstall -y determined determined-cli determined-common
# Extra Python dependencies. Each requirements file is copied and installed in
# its own pair of steps, so editing one file does not invalidate the cached
# layer of the other. `--no-cache-dir` matches the other pip invocations in
# this file and keeps pip's download cache out of the image.
COPY files/additional-requirements.txt files/additional-requirements.txt
RUN ${CONDA_DIR}/bin/python3.6 -m pip install --no-cache-dir -r files/additional-requirements.txt

COPY files/notebook-requirements.txt files/notebook-requirements.txt
RUN ${CONDA_DIR}/bin/python3.6 -m pip install --no-cache-dir -r files/notebook-requirements.txt

# Point Jupyter's config, data and runtime directories at /run/determined.
ENV JUPYTER_CONFIG_DIR=/run/determined/jupyter/config JUPYTER_DATA_DIR=/run/determined/jupyter/data JUPYTER_RUNTIME_DIR=/run/determined/jupyter/runtime
# Create the det-nobody account (uid and gid 65533). In contrast to the
# conventional nobody user, det-nobody owns a real HOME directory
# (/tmp/det-nobody), which tools such as gsutil need. It is meant as a
# ready-made identity for running workloads in unprivileged containers.
RUN groupadd -g 65533 det-nobody \
    && useradd -m -d /tmp/det-nobody -u 65533 -g 65533 -s /bin/bash det-nobody
# Install an NSS plugin that allows the set of users visible inside the
# container to be extended at runtime. Non-root shells depend on this, and
# non-root distributed training depends on non-root shells.
COPY libnss_determined /tmp/libnss_determined

# Build and install the plugin, remove its sources, then append the
# `determined` service to the passwd, shadow and group entries of
# /etc/nsswitch.conf.
RUN set -e; \
    make -C /tmp/libnss_determined libnss_determined.so.2 install; \
    rm -rf /tmp/libnss_determined; \
    sed -E -i -e 's/^((passwd|shadow|group):.*)/\1 determined/' /etc/nsswitch.conf