specify spark.yarn.jar instead of ENV SPARK_JAR, prettify README, set up empty metrics.properties

lukeforehand · lukeforehand · commit 5ccab3fa2953 · 2015-06-14T13:31:58.000-05:00
diff --git a/Dockerfile b/Dockerfile
@@ -11,7 +11,6 @@ ADD yarn-remote-client $SPARK_HOME/yarn-remote-client
 RUN $BOOTSTRAP && $HADOOP_PREFIX/bin/hadoop dfsadmin -safemode leave && $HADOOP_PREFIX/bin/hdfs dfs -put $SPARK_HOME-1.4.0-bin-hadoop2.6/lib /spark
 
 ENV YARN_CONF_DIR $HADOOP_PREFIX/etc/hadoop
-ENV SPARK_JAR hdfs:///spark/spark-assembly-1.4.0-hadoop2.6.0.jar
 ENV PATH $PATH:$SPARK_HOME/bin:$HADOOP_PREFIX/bin
 # update boot script
 COPY bootstrap.sh /etc/bootstrap.sh
diff --git a/README.md b/README.md
@@ -18,9 +18,13 @@ docker build --rm -t sequenceiq/spark:1.4.0 .
 
 * if using boot2docker make sure your VM has more than 2GB memory
 * in your /etc/hosts file add $(boot2docker ip) as host 'sandbox' to make it easier to access your sandbox UI
-* open UI ports when starting image
+* open the YARN UI ports when running the container
 ```
-docker run -it -p 8088:8088 -p 8042:8042 -p 8080:8080 -p 7077:7077 -p 6066:6066 -h sandbox sequenceiq/spark:1.4.0 bash
+docker run -it -p 8088:8088 -p 8042:8042 -h sandbox sequenceiq/spark:1.4.0 bash
+```
+or
+```
+docker run -d -h sandbox sequenceiq/spark:1.4.0 -d
 ```
 
 ## Versions
@@ -38,8 +42,11 @@ In yarn-client mode, the driver runs in the client process, and the application
 
 ```
 # run the spark shell
-# set memory limits appropriately or spark context will not be able to start
-spark-shell --master yarn-client --driver-memory 1g --executor-memory 1g --executor-cores 1
+spark-shell \
+--master yarn-client \
+--driver-memory 1g \
+--executor-memory 1g \
+--executor-cores 1
 
 # execute the following command, which should return 1000
 scala> sc.parallelize(1 to 1000).count()
@@ -52,12 +59,26 @@ Estimating Pi (yarn-cluster mode):
 
 ```
 # execute the following command, which should write "Pi is roughly 3.1418" into the logs
-spark-submit --class org.apache.spark.examples.SparkPi --master yarn-cluster --driver-memory 1g --executor-memory 1g --executor-cores 1 $SPARK_HOME/lib/spark-examples-1.4.0-hadoop2.6.0.jar
+# note: you must specify the --files argument in cluster mode to enable metrics
+spark-submit \
+--class org.apache.spark.examples.SparkPi \
+--files $SPARK_HOME/conf/metrics.properties \
+--master yarn-cluster \
+--driver-memory 1g \
+--executor-memory 1g \
+--executor-cores 1 \
+$SPARK_HOME/lib/spark-examples-1.4.0-hadoop2.6.0.jar
 ```
 
 Estimating Pi (yarn-client mode):
 
 ```
 # execute the following command, which should print "Pi is roughly 3.1418" to the screen
-spark-submit --class org.apache.spark.examples.SparkPi --master yarn-client --driver-memory 1g --executor-memory 1g --executor-cores 1 $SPARK_HOME/lib/spark-examples-1.4.0-hadoop2.6.0.jar
+spark-submit \
+--class org.apache.spark.examples.SparkPi \
+--master yarn-client \
+--driver-memory 1g \
+--executor-memory 1g \
+--executor-cores 1 \
+$SPARK_HOME/lib/spark-examples-1.4.0-hadoop2.6.0.jar
 ```
diff --git a/bootstrap.sh b/bootstrap.sh
@@ -12,6 +12,9 @@ cd $HADOOP_PREFIX/share/hadoop/common ; for cp in ${ACP//,/ }; do  echo == $cp;
 # altering the core-site configuration
 sed s/HOSTNAME/$HOSTNAME/ /usr/local/hadoop/etc/hadoop/core-site.xml.template > /usr/local/hadoop/etc/hadoop/core-site.xml
 
+# setting spark defaults
+echo spark.yarn.jar hdfs:///spark/spark-assembly-1.4.0-hadoop2.6.0.jar > $SPARK_HOME/conf/spark-defaults.conf
+cp $SPARK_HOME/conf/metrics.properties.template $SPARK_HOME/conf/metrics.properties
 
 service sshd start
 $HADOOP_PREFIX/sbin/start-dfs.sh