
Commit fcf30cf

Update Hadoop and Python
1 parent 873903d commit fcf30cf

File tree

- .gitignore
- Dockerfile
- README.md
- docker-compose.yml

4 files changed: +62 −24 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -1,2 +1,3 @@
 *.tsv
 *.csv
+*.py

Dockerfile

Lines changed: 40 additions & 16 deletions
@@ -2,13 +2,30 @@ FROM debian:jessie
 MAINTAINER Getty Images "https://github.com/gettyimages"
 
 RUN apt-get update \
- && apt-get install -y curl net-tools unzip \
+ && apt-get install -y locales \
+ && dpkg-reconfigure -f noninteractive locales \
+ && locale-gen C.UTF-8 \
+ && /usr/sbin/update-locale LANG=C.UTF-8 \
+ && echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen \
+ && locale-gen \
+ && apt-get clean \
+ && rm -rf /var/lib/apt/lists/*
+
+# Users with other locales should set this in their derivative image
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US:en
+ENV LC_ALL en_US.UTF-8
+
+RUN apt-get update \
+ && apt-get install -y curl unzip \
     python3 python3-setuptools \
-    python python-setuptools \
- && easy_install3 pip py4j \
- && easy_install pip py4j \
- && apt-get clean \
- && rm -rf /var/lib/apt/lists/*
+ && easy_install3 pip py4j \
+ && apt-get clean \
+ && rm -rf /var/lib/apt/lists/*
+
+# http://blog.stuart.axelbrooke.com/python-3-on-spark-return-of-the-pythonhashseed
+ENV PYTHONHASHSEED 0
+ENV PYTHONIOENCODING UTF-8
 
 # JAVA
 ENV JAVA_HOME /usr/jdk1.8.0_31
@@ -21,23 +38,30 @@ RUN curl -sL --retry 3 --insecure \
  && ln -s $JAVA_HOME /usr/java \
  && rm -rf $JAVA_HOME/man
 
+# HADOOP
+ENV HADOOP_VERSION 2.6.3
+ENV HADOOP_HOME /usr/hadoop-$HADOOP_VERSION
+ENV HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
+ENV PATH $PATH:$HADOOP_HOME/bin
+RUN curl -sL --retry 3 \
+  "http://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz" \
+  | gunzip \
+  | tar -x -C /usr/ \
+ && rm -rf $HADOOP_HOME/share/doc
+
 # SPARK
 ENV SPARK_VERSION 1.6.0
-ENV HADOOP_VERSION 2.6
-ENV SPARK_PACKAGE spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION
-ENV SPARK_HOME /usr/$SPARK_PACKAGE
+ENV SPARK_PACKAGE spark-$SPARK_VERSION-bin-without-hadoop
+ENV SPARK_HOME /usr/spark-$SPARK_VERSION
+ENV PYSPARK_PYTHON python3
+ENV SPARK_DIST_CLASSPATH="$HADOOP_HOME/etc/hadoop/*:$HADOOP_HOME/share/hadoop/common/lib/*:$HADOOP_HOME/share/hadoop/common/*:$HADOOP_HOME/share/hadoop/hdfs/*:$HADOOP_HOME/share/hadoop/hdfs/lib/*:$HADOOP_HOME/share/hadoop/hdfs/*:$HADOOP_HOME/share/hadoop/yarn/lib/*:$HADOOP_HOME/share/hadoop/yarn/*:$HADOOP_HOME/share/hadoop/mapreduce/lib/*:$HADOOP_HOME/share/hadoop/mapreduce/*:$HADOOP_HOME/share/hadoop/tools/lib/*"
 ENV PATH $PATH:$SPARK_HOME/bin
 RUN curl -sL --retry 3 \
   "http://d3kbcqa49mib13.cloudfront.net/$SPARK_PACKAGE.tgz" \
   | gunzip \
   | tar x -C /usr/ \
- && ln -s $SPARK_HOME /usr/spark
-
-# HADOOP/S3
-RUN curl -sL --retry 3 "http://central.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.6.0/hadoop-aws-2.6.0.jar" -o $SPARK_HOME/lib/hadoop-aws-2.6.0.jar \
- && curl -sL --retry 3 "http://central.maven.org/maven2/com/amazonaws/aws-java-sdk/1.7.14/aws-java-sdk-1.7.14.jar" -o $SPARK_HOME/lib/aws-java-sdk-1.7.14.jar \
- && curl -sL --retry 3 "http://central.maven.org/maven2/com/google/collections/google-collections/1.0/google-collections-1.0.jar" -o $SPARK_HOME/lib/google-collections-1.0.jar \
- && curl -sL --retry 3 "http://central.maven.org/maven2/joda-time/joda-time/2.8.2/joda-time-2.8.2.jar" -o $SPARK_HOME/lib/joda-time-2.8.2.jar
+ && mv /usr/$SPARK_PACKAGE $SPARK_HOME \
+ && rm -rf $SPARK_HOME/examples $SPARK_HOME/ec2
 
 WORKDIR $SPARK_HOME
 CMD ["bin/spark-class", "org.apache.spark.deploy.master.Master"]
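Two of these changes fail silently when misconfigured, so a quick smoke test after building is worth it: Python 3 randomizes string hashing per process unless `PYTHONHASHSEED` is pinned (the subject of the linked blog post), and the `-bin-without-hadoop` Spark package only finds Hadoop classes through `SPARK_DIST_CLASSPATH`. A minimal sketch, assuming the image is built locally as `gettyimages/spark` (the tag is an assumption, not part of the commit):

    # build from this Dockerfile
    docker build -t gettyimages/spark .

    # confirm the locale and Python settings baked in by the ENV lines above
    docker run --rm gettyimages/spark bash -c 'echo "$LANG $PYTHONHASHSEED $PYTHONIOENCODING"'

    # confirm the Hadoop-free Spark build resolves Hadoop jars via SPARK_DIST_CLASSPATH
    docker run --rm gettyimages/spark bin/run-example SparkPi 10

If the classpath wiring were wrong, the `run-example` invocation would typically fail at launch with class-loading errors rather than printing an approximation of pi.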

README.md

Lines changed: 19 additions & 4 deletions
@@ -3,21 +3,36 @@
 
 A `debian:jessie` based [Spark](http://spark.apache.org) container. Use it in a standalone cluster with the accompanying `docker-compose.yml`, or as a base for more complex recipes.
 
-## example
+## docker example
+
+To run `SparkPi`, run the image with Docker:
+
+    docker run --rm -it gettyimages/spark bin/run-example SparkPi 10
+
+To start `spark-shell` with your AWS credentials:
+
+    docker run --rm -it -e "AWS_ACCESS_KEY_ID=YOURKEY" -e "AWS_SECRET_ACCESS_KEY=YOURSECRET" gettyimages/spark bin/spark-shell
+
+To run a Python script with PySpark:
+
+    echo "import pyspark\nprint(pyspark.SparkContext().parallelize(range(0, 10)).count())" > count.py
+    docker run --rm -it -v $(pwd)/count.py:/count.py gettyimages/spark bin/pyspark /count.py
+
+## docker-compose example
 
 To create a standalone cluster with [docker-compose](http://docs.docker.com/compose):
 
     docker-compose up
 
-The SparkUI will be running at `http://${YOUR_DOCKER_HOST}:8080` with one worker listed. To run `spark-shell`, exec into a container:
+The SparkUI will be running at `http://${YOUR_DOCKER_HOST}:8080` with one worker listed. To run `pyspark`, exec into a container:
 
     docker exec -it dockerspark_master_1 /bin/bash
-    /usr/spark/bin/spark-shell
+    bin/pyspark
 
 To run `SparkPi`, exec into a container:
 
     docker exec -it dockerspark_master_1 /bin/bash
-    /usr/spark/bin/run-example SparkPi 10
+    bin/run-example SparkPi 10
 
 ## license
 
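One caveat on the PySpark snippet added above: the `echo "...\n..."` form relies on the shell expanding `\n`, which bash's builtin `echo` does not do by default, so `count.py` can end up as a single malformed line. A more portable variant of the same two-line example, swapping in `printf` (the only change):

    printf 'import pyspark\nprint(pyspark.SparkContext().parallelize(range(0, 10)).count())\n' > count.py
    docker run --rm -it -v $(pwd)/count.py:/count.py gettyimages/spark bin/pyspark /count.py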

docker-compose.yml

Lines changed: 2 additions & 4 deletions
@@ -1,11 +1,10 @@
 master:
   image: gettyimages/spark
-  command: /usr/spark/bin/spark-class org.apache.spark.deploy.master.Master -h master
+  command: bin/spark-class org.apache.spark.deploy.master.Master -h master
   hostname: master
   environment:
     MASTER: spark://master:7077
     SPARK_CONF_DIR: /conf
-    PYSPARK_PYTHON: python3
   expose:
     - 7001
     - 7002
@@ -26,15 +25,14 @@ master:
 
 worker:
   image: gettyimages/spark
-  command: /usr/spark/bin/spark-class org.apache.spark.deploy.worker.Worker spark://master:7077
+  command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://master:7077
   hostname: worker
   environment:
     SPARK_CONF_DIR: /conf
     SPARK_WORKER_CORES: 2
     SPARK_WORKER_MEMORY: 1g
     SPARK_WORKER_PORT: 8881
     SPARK_WORKER_WEBUI_PORT: 8081
-    PYSPARK_PYTHON: python3
   links:
     - master
   expose:
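The two `PYSPARK_PYTHON: python3` deletions are safe because the Dockerfile now bakes the variable into the image via `ENV PYSPARK_PYTHON python3`, so compose no longer needs to repeat it per service. A quick way to confirm against a running cluster, reusing the container name from the README (which assumes the default `dockerspark` compose project name):

    docker-compose up -d
    docker exec dockerspark_master_1 bash -c 'echo "$PYSPARK_PYTHON"'   # expect: python3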
