-
Notifications
You must be signed in to change notification settings - Fork 0
/
Dockerfile
51 lines (38 loc) · 1.55 KB
/
Dockerfile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# Build stage: Python 3.10 plus a JDK, with the Spark ("without hadoop" build)
# and Hadoop distributions unpacked into the same directory (/opt/spark).
#
# The Debian release is pinned to bullseye on purpose: the floating
# `python:3.10` tag follows the newest Debian release, and releases after
# bullseye no longer package openjdk-11, which the apt step and JAVA_HOME
# below depend on.
FROM python:3.10-bullseye AS builder
LABEL python_version=python3.10

# Add dependencies for PySpark (JDK, download tools, ssh/rsync and a few
# troubleshooting utilities). Recommended packages are skipped and the apt
# package lists are removed in the same layer to keep the image smaller.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        net-tools \
        openjdk-11-jdk-headless \
        rsync \
        software-properties-common \
        ssh \
        vim \
        wget \
    && rm -rf /var/lib/apt/lists/*

# SPARK_VERSION and HADOOP_VERSION are the release directory names used to
# build the download URLs below; SPARK_HOME is the install prefix.
# Fix the value of PYTHONHASHSEED.
# Note: this is needed when you use Python 3.3 or greater, where string
# hashing is randomized per process unless a seed is set.
ENV SPARK_VERSION=spark-3.3.0 \
    HADOOP_VERSION=hadoop-3.3.4 \
    SPARK_HOME=/opt/spark \
    PYTHONHASHSEED=1

# Download Spark, unpack it directly into SPARK_HOME (dropping the top-level
# directory of the archive) and delete the tarball in the same layer.
RUN wget --no-verbose -O apache-spark.tgz "https://archive.apache.org/dist/spark/${SPARK_VERSION}/${SPARK_VERSION}-bin-without-hadoop.tgz" \
    && mkdir -p "${SPARK_HOME}" \
    && tar -xf apache-spark.tgz -C "${SPARK_HOME}" --strip-components=1 \
    && rm apache-spark.tgz

# Download Hadoop and unpack it over the same directory as Spark.
# archive.apache.org is used (as for Spark above) because it keeps every
# release, whereas the dlcdn.apache.org CDN only serves current releases and
# drops older pinned versions.
RUN wget --no-verbose -O hadoop.tgz "https://archive.apache.org/dist/hadoop/common/${HADOOP_VERSION}/${HADOOP_VERSION}.tar.gz" \
    && mkdir -p "${SPARK_HOME}" \
    && tar -xf hadoop.tgz -C "${SPARK_HOME}" --strip-components=1 \
    && rm hadoop.tgz

# Location of the JDK installed by apt above.
# NOTE(review): the path is architecture-specific (amd64); builds for other
# architectures would need a different directory name - confirm if needed.
ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
# Runtime stage: extends the builder stage with the Spark runtime
# configuration, log wiring and the start-up script.
FROM builder AS apache-spark

WORKDIR /opt/spark

# Default ports, log locations, master URL and workload type.
# NOTE(review): presumably these are read by start-spark.sh (not visible
# here); SPARK_WORKLOAD defaults to "master" - confirm the accepted values
# against that script.
# The master and worker web UIs are both configured for port 8080.
ENV SPARK_MASTER_PORT=7077 \
    SPARK_MASTER_WEBUI_PORT=8080 \
    SPARK_LOG_DIR=/opt/spark/logs \
    SPARK_MASTER_LOG=/opt/spark/logs/spark-master.out \
    SPARK_WORKER_LOG=/opt/spark/logs/spark-worker.out \
    SPARK_WORKER_WEBUI_PORT=8080 \
    SPARK_WORKER_PORT=7000 \
    SPARK_MASTER="spark://spark-master:7077" \
    SPARK_WORKLOAD="master"

# Documented ports: web UI (8080), master (7077) and worker (7000).
EXPOSE 8080 7077 7000

# Create the log directory and point both log files at the container's
# stdout. `ln -sf` creates (or replaces) the target itself, so no prior
# `touch` of the files is needed.
RUN mkdir -p "$SPARK_LOG_DIR" \
    && ln -sf /dev/stdout "$SPARK_MASTER_LOG" \
    && ln -sf /dev/stdout "$SPARK_WORKER_LOG"

# Start-up script from the build context, run with bash as the default command.
COPY start-spark.sh /
CMD ["/bin/bash", "/start-spark.sh"]