Commit c1c5025
committed: add
1 parent 092cc6e commit c1c5025
Note: this is a large commit and part of its content is hidden by default, so only a subset of the changed files appears below.

44 files changed, +3432 -0 lines

spark-cluster/base/Dockerfile

Lines changed: 65 additions & 0 deletions
FROM ubuntu:latest

# must match the JDK installed below (openjdk-17-jdk)
ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
ENV HDFS_NAMENODE_USER=root
ENV HDFS_DATANODE_USER=root
ENV HDFS_SECONDARYNAMENODE_USER=root
ENV YARN_RESOURCEMANAGER_USER=root
ENV YARN_NODEMANAGER_USER=root
ENV YARN_PROXYSERVER_USER=root
ENV HADOOP_HOME=/usr/local/hadoop
ENV HADOOP_YARN_HOME=${HADOOP_HOME}
ENV HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop
ENV HADOOP_LOG_DIR=${HADOOP_YARN_HOME}/logs
ENV HADOOP_IDENT_STRING=root
ENV HADOOP_MAPRED_IDENT_STRING=root
ENV HADOOP_MAPRED_HOME=${HADOOP_HOME}
ENV SPARK_HOME=/usr/local/spark
ENV CONDA_HOME=/usr/local/conda
ENV PYSPARK_MASTER=yarn
ENV PATH=${CONDA_HOME}/bin:${SPARK_HOME}/bin:${HADOOP_HOME}/bin:${PATH}

# setup ubuntu
RUN apt-get update -y \
    && apt-get upgrade -y \
    && apt-get -y install openjdk-17-jdk wget ssh openssh-server sshpass supervisor \
    && apt-get -y install nano net-tools lynx \
    && apt-get clean

# setup ssh (passwordless root login between cluster nodes)
RUN ssh-keygen -t rsa -P "" -f /root/.ssh/id_rsa \
    && cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys \
    && chmod 0600 /root/.ssh/authorized_keys
COPY ubuntu/root/.ssh/config /root/.ssh/config

# setup hadoop
RUN wget -q https://dlcdn.apache.org/hadoop/common/hadoop-3.4.1/hadoop-3.4.1.tar.gz -O /tmp/hadoop.tar.gz \
    && tar -xzf /tmp/hadoop.tar.gz -C /usr/local/ \
    && ln -s /usr/local/hadoop-3.4.1 /usr/local/hadoop \
    && rm -fr /usr/local/hadoop/etc/hadoop/* \
    && mkdir /usr/local/hadoop/extras \
    && mkdir /var/hadoop \
    && mkdir /var/hadoop/hadoop-datanode \
    && mkdir /var/hadoop/hadoop-namenode \
    && mkdir /var/hadoop/mr-history \
    && mkdir /var/hadoop/mr-history/done \
    && mkdir /var/hadoop/mr-history/tmp

# setup spark
RUN wget -q https://dlcdn.apache.org/spark/spark-3.5.5/spark-3.5.5-bin-hadoop3.tgz -O /tmp/spark.tgz \
    && tar -xzf /tmp/spark.tgz -C /usr/local/ \
    && ln -s /usr/local/spark-3.5.5-bin-hadoop3 /usr/local/spark \
    && rm /usr/local/spark/conf/*.template

# setup conda
COPY ubuntu/root/environment.yml /tmp/environment.yml
RUN wget -q https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-x86_64.sh -O /tmp/anaconda.sh \
    && /bin/bash /tmp/anaconda.sh -b -p $CONDA_HOME \
    && $CONDA_HOME/bin/conda env update -n base --file /tmp/environment.yml \
    && $CONDA_HOME/bin/conda update -n root conda -y \
    && $CONDA_HOME/bin/conda update --all -y \
    && $CONDA_HOME/bin/pip install --upgrade pip

# clean up
RUN rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
    && mkdir /tmp/spark-events
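
As a quick sanity check of the base image, something like the following could be run after building it. This is a sketch, not part of the commit: the spark-base tag comes from the Makefile below, and the exact versions printed depend on what the mirrors served at build time.

docker run --rm --platform=linux/amd64 spark-base bash -c '
    java -version                # OpenJDK 17 from apt
    hadoop version | head -n 1   # Hadoop 3.4.1
    spark-submit --version       # Spark 3.5.5
    python --version             # Anaconda Python from /usr/local/conda
'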

spark-cluster/base/Makefile

Lines changed: 12 additions & 0 deletions
# Name of the Docker image
IMAGE_NAME=spark-base
PLATFORM=linux/amd64

build:
	docker build --platform=$(PLATFORM) -t $(IMAGE_NAME) .

interactive:
	docker run -it --rm --platform=$(PLATFORM) $(IMAGE_NAME)

purge:
	docker rmi $(IMAGE_NAME) || true
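
Typical usage from this directory, assuming Docker is installed on the host:

make build         # build the spark-base image for linux/amd64
make interactive   # start a throwaway container for manual testing
make purge         # remove the image; the trailing '|| true' keeps it from failing if the image is absent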
Lines changed: 3 additions & 0 deletions
Host *
    StrictHostKeyChecking no
    UserKnownHostsFile=/dev/null
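
This is the ssh client config the Dockerfile copies to /root/.ssh/config; it disables host-key prompting so the Hadoop and Spark start scripts can ssh between freshly created containers non-interactively. A quick manual check from inside a running node might look like this (worker1 is a placeholder hostname, not defined anywhere in this commit):

ssh root@worker1 hostname    # should print the worker's hostname without a host-key prompt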
Lines changed: 8 additions & 0 deletions
name: base
channels:
  - defaults
  - anaconda
  - crogoz
dependencies:
  - graphframes
  - networkx
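
The Dockerfile above applies this file to the base conda environment. A simple way to confirm the packages landed, assuming the spark-base tag from the Makefile, is:

docker run --rm spark-base bash -c 'conda list | grep -E "graphframes|networkx"'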
Lines changed: 5 additions & 0 deletions
[program:all]
command=/bin/bash /usr/local/bin/start-all.sh
exitcodes=0
startsecs=0
priority=999
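
With this program definition picked up by supervisord (presumably via the conf.d include in the main config below), start-all.sh is launched once at container start; startsecs=0 and exitcodes=0 mean a quick, clean exit is not treated as a crash. From inside the container its state can be checked with:

supervisorctl status all    # state of the 'all' program
supervisorctl tail all      # captured stdout of start-all.sh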
Lines changed: 31 additions & 0 deletions
; supervisor config file

[unix_http_server]
file=/var/run/supervisor.sock   ; (the path to the socket file)
chmod=0700                      ; socket file mode (default 0700)

[supervisord]
logfile=/var/log/supervisor/supervisord.log ; (main log file; default $CWD/supervisord.log)
pidfile=/var/run/supervisord.pid            ; (supervisord pidfile; default supervisord.pid)
childlogdir=/var/log/supervisor             ; ('AUTO' child log dir, default $TEMP)
loglevel=debug
nodaemon=true
user=root

; the section below must remain in the config file for RPC
; (supervisorctl/web interface) to work; additional interfaces may be
; added by defining them in separate rpcinterface: sections
[rpcinterface:supervisor]
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface

[supervisorctl]
serverurl=unix:///var/run/supervisor.sock ; use a unix:// URL for a unix socket

; The [include] section can just contain the "files" setting. This
; setting can list multiple files (separated by whitespace or
; newlines). It can also contain wildcards. The filenames are
; interpreted as relative to this file. Included files *cannot*
; include files themselves.

[include]
files = /etc/supervisor/conf.d/*.conf
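
nodaemon=true keeps supervisord in the foreground, which is what a container entrypoint needs. The entrypoint itself is not part of this excerpt, but a derived image would typically start it roughly like this (the config path is the Ubuntu package default, assumed here):

/usr/bin/supervisord -c /etc/supervisor/supervisord.conf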
Lines changed: 35 additions & 0 deletions
#!/bin/bash

service ssh start
echo "started ssh"

$HADOOP_HOME/sbin/start-all.sh
echo "started hadoop"

$SPARK_HOME/sbin/start-all.sh
echo "started spark"

$SPARK_HOME/sbin/start-history-server.sh
echo "started spark history"

# pyspark --master spark://localhost:7077 > /tmp/jupyter.log 2>&1 &
# options: https://gerardnico.com/db/spark/pyspark/pyspark
# NOTE: graphframes 0.8.1-spark2.4-s_2.11 targets Spark 2.4 / Scala 2.11; a build matching
# the installed Spark 3.5.x (Scala 2.12) is likely needed here.
$SPARK_HOME/bin/pyspark \
    --packages graphframes:graphframes:0.8.1-spark2.4-s_2.11 \
    --repositories https://repos.spark-packages.org \
    --master $PYSPARK_MASTER > /tmp/jupyter.log 2>&1 &
echo "started pyspark"

if [ -d "/root/ipynb/data" ]; then
    for entry in /root/ipynb/data/*
    do
        hdfs dfs -copyFromLocal -f "$entry" "/$(basename "$entry")"
        echo "copied $entry to hdfs"
    done
else
    echo "/root/ipynb/data does not exist"
fi

echo "done!"

exit 0
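
After the script runs, a reasonable way to confirm the stack came up is to check the redirected PySpark log and the HDFS root; the web UI ports listed are the stock Hadoop/Spark defaults, nothing in this diff changes them:

tail -n 50 /tmp/jupyter.log    # PySpark startup output
hdfs dfs -ls /                 # files copied in from /root/ipynb/data
# Default web UIs (assumption): YARN ResourceManager :8088, HDFS NameNode :9870, Spark history server :18080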
