author     Kevin W Monroe <kevin.monroe@canonical.com>  2017-06-01 20:57:30 +0000
committer  Kevin W Monroe <kevin.monroe@canonical.com>  2017-06-04 23:42:18 -0500
commit     ff963b529603e683ee0cbd8ade54cb4496469f5b (patch)
tree       55cef657ded5200bd91cb1ea2d5f3a3fc64fc292
parent     32722ac3886c07fabd2b4c16b6fa9d93443893ab (diff)
BIGTOP-2795: spark charm improvements
Closes #229
-rw-r--r--               bigtop-deploy/juju/hadoop-spark/README.md                                     | 12
-rw-r--r--               bigtop-deploy/juju/hadoop-spark/bundle-dev.yaml                               |  6
-rw-r--r--               bigtop-deploy/juju/hadoop-spark/bundle-local.yaml                             |  6
-rw-r--r--               bigtop-deploy/juju/hadoop-spark/bundle.yaml                                   |  6
-rw-r--r--               bigtop-packages/src/charm/spark/layer-spark/README.md                         | 60
-rw-r--r--               bigtop-packages/src/charm/spark/layer-spark/actions.yaml                      | 34
-rwxr-xr-x               bigtop-packages/src/charm/spark/layer-spark/actions/list-jobs                 | 11
-rwxr-xr-x               bigtop-packages/src/charm/spark/layer-spark/actions/spark-submit              | 44
l--------- [-rwxr-xr-x]  bigtop-packages/src/charm/spark/layer-spark/actions/submit                    | 59
-rw-r--r--               bigtop-packages/src/charm/spark/layer-spark/layer.yaml                        | 13
-rw-r--r-- [-rwxr-xr-x]  bigtop-packages/src/charm/spark/layer-spark/lib/charms/layer/bigtop_spark.py  | 58
-rwxr-xr-x               bigtop-packages/src/charm/spark/layer-spark/scripts/sparkpi.sh                |  7
12 files changed, 190 insertions(+), 126 deletions(-)
diff --git a/bigtop-deploy/juju/hadoop-spark/README.md b/bigtop-deploy/juju/hadoop-spark/README.md
index 13905b3d..edd04634 100644
--- a/bigtop-deploy/juju/hadoop-spark/README.md
+++ b/bigtop-deploy/juju/hadoop-spark/README.md
@@ -23,16 +23,16 @@ using a simple programming model.
Hadoop is designed to scale from a few servers to thousands of machines,
each offering local computation and storage. Rather than rely on hardware
to deliver high-availability, Hadoop can detect and handle failures at the
-application layer. This provides a highly-available service on top of a cluster
-of machines, each of which may be prone to failure.
+application layer. This provides a highly-available service on top of a
+cluster of machines, each of which may be prone to failure.
Apache Spark is a fast and general engine for large-scale data processing.
Learn more at [spark.apache.org][].
-This bundle provides a complete deployment of Hadoop and Spark components from
-[Apache Bigtop][] that performs distributed data processing at scale. Ganglia
-and rsyslog applications are also provided to monitor cluster health and syslog
-activity.
+This bundle provides a complete deployment of Hadoop and Spark components
+from [Apache Bigtop][] that performs distributed data processing at scale.
+Ganglia and rsyslog applications are also provided to monitor cluster health
+and syslog activity.
[spark.apache.org]: http://spark.apache.org/
[Apache Bigtop]: http://bigtop.apache.org/
diff --git a/bigtop-deploy/juju/hadoop-spark/bundle-dev.yaml b/bigtop-deploy/juju/hadoop-spark/bundle-dev.yaml
index 66136c61..99678f13 100644
--- a/bigtop-deploy/juju/hadoop-spark/bundle-dev.yaml
+++ b/bigtop-deploy/juju/hadoop-spark/bundle-dev.yaml
@@ -1,3 +1,8 @@
+series: xenial
+description: >
+ This is a five unit big data cluster that includes Hadoop 2.7.3 and Spark 2.1
+ from Apache Bigtop. Use it to analyse batch data with MapReduce or streaming
+ data with Spark. It will run on 5 machines in your cloud.
services:
namenode:
charm: "cs:~bigdata-dev/xenial/hadoop-namenode"
@@ -79,7 +84,6 @@ services:
annotations:
gui-x: "750"
gui-y: "400"
-series: xenial
relations:
- [resourcemanager, namenode]
- [namenode, slave]
diff --git a/bigtop-deploy/juju/hadoop-spark/bundle-local.yaml b/bigtop-deploy/juju/hadoop-spark/bundle-local.yaml
index 6e08871d..297779a5 100644
--- a/bigtop-deploy/juju/hadoop-spark/bundle-local.yaml
+++ b/bigtop-deploy/juju/hadoop-spark/bundle-local.yaml
@@ -1,3 +1,8 @@
+series: xenial
+description: >
+ This is a five unit big data cluster that includes Hadoop 2.7.3 and Spark 2.1
+ from Apache Bigtop. Use it to analyse batch data with MapReduce or streaming
+ data with Spark. It will run on 5 machines in your cloud.
services:
namenode:
charm: "/home/ubuntu/charms/xenial/hadoop-namenode"
@@ -79,7 +84,6 @@ services:
annotations:
gui-x: "750"
gui-y: "400"
-series: xenial
relations:
- [resourcemanager, namenode]
- [namenode, slave]
diff --git a/bigtop-deploy/juju/hadoop-spark/bundle.yaml b/bigtop-deploy/juju/hadoop-spark/bundle.yaml
index 9458b232..8b5d5a99 100644
--- a/bigtop-deploy/juju/hadoop-spark/bundle.yaml
+++ b/bigtop-deploy/juju/hadoop-spark/bundle.yaml
@@ -1,3 +1,8 @@
+series: xenial
+description: >
+ This is a five unit big data cluster that includes Hadoop 2.7.3 and Spark 2.1
+ from Apache Bigtop. Use it to analyse batch data with MapReduce or streaming
+ data with Spark. It will run on 5 machines in your cloud.
services:
namenode:
charm: "cs:xenial/hadoop-namenode-19"
@@ -79,7 +84,6 @@ services:
annotations:
gui-x: "750"
gui-y: "400"
-series: xenial
relations:
- [resourcemanager, namenode]
- [namenode, slave]
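For orientation, any of these bundles can be deployed straight from its yaml file when the tree is checked out locally; a minimal sketch, where the path is illustrative:

    juju deploy ./bigtop-deploy/juju/hadoop-spark/bundle.yaml
    juju status    # watch the 5 machines come up and settle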
diff --git a/bigtop-packages/src/charm/spark/layer-spark/README.md b/bigtop-packages/src/charm/spark/layer-spark/README.md
index c482ea93..a5ac2c0b 100644
--- a/bigtop-packages/src/charm/spark/layer-spark/README.md
+++ b/bigtop-packages/src/charm/spark/layer-spark/README.md
@@ -122,7 +122,7 @@ more information about a specific smoke test with:
## Spark Master web UI
Spark provides a web console that can be used to verify information about
-the cluster. To access it, find the `PUBLIC-ADDRESS` of the spark application
+the cluster. To access it, find the `Public address` of the spark application
and expose it:
juju status spark
@@ -142,10 +142,33 @@ address. The job history web interface will be available at the following URL:
# Using
-Once deployment is verified, Spark batch or streaming jobs can be run in a
-variety of ways:
+## Actions
+Once Spark is ready, there are a number of actions available in this charm.
-### Spark shell
+Run a benchmark (as described in the **Benchmarking** section):
+
+ juju run-action spark/0 pagerank
+ juju show-action-output <id> # <-- id from above command
+
+Run a smoke test (as described in the **Verifying** section):
+
+ juju run-action spark/0 smoke-test
+ juju show-action-output <id> # <-- id from above command
+
+Start/Stop/Restart the Spark Job History service:
+
+ juju run-action spark/0 [start|stop|restart]-spark-job-history-server
+ juju show-action-output <id> # <-- id from above command
+
+Submit a Spark job:
+
+ juju run-action spark/0 spark-submit \
+ options='--class org.apache.spark.examples.SparkPi' \
+ job='/usr/lib/spark/examples/jars/spark-examples.jar' \
+ job-args='10'
+ juju show-action-output <id> # <-- id from above command
+
+## Spark shell
Spark shell provides a simple way to learn the API, as well as a powerful
tool to analyze data interactively. It is available in either Scala or Python
and can be run from the Spark unit as follows:
@@ -154,14 +177,14 @@ and can be run from the Spark unit as follows:
spark-shell # for interaction using scala
pyspark # for interaction using python
-### Command line
+## Command line
SSH to the Spark unit and manually run a spark-submit job, for example:
juju ssh spark/0
spark-submit --class org.apache.spark.examples.SparkPi \
- --master yarn-client /usr/lib/spark/lib/spark-examples*.jar 10
+ /usr/lib/spark/examples/jars/spark-examples.jar 10
-### Apache Zeppelin
+## Apache Zeppelin
Apache Zeppelin is a web-based notebook that enables interactive data
analytics. Make beautiful data-driven, interactive, and collaborative documents
with SQL, Scala and more. Deploy Zeppelin and relate it to Spark:
@@ -169,7 +192,7 @@ with SQL, Scala and more. Deploy Zeppelin and relate it to Spark:
juju deploy zeppelin
juju add-relation spark zeppelin
-To access the web console, find the `PUBLIC-ADDRESS` of the zeppelin
+To access the web console, find the `Public address` of the zeppelin
application and expose it:
juju status zeppelin
@@ -182,6 +205,25 @@ The web interface will be available at the following URL:
# Configuring
+Charm configuration can be changed at runtime with `juju config`. This charm
+supports the following config parameters.
+
+## driver_memory
+Amount of memory available for the Spark driver process (1g by default).
+Set a different value with:
+
+ juju config spark driver_memory=4096m
+
+## executor_memory
+Amount of memory available for each Spark executor process (1g by default).
+Set a different value with:
+
+ juju config spark executor_memory=2g
+
+> **Note**: When Spark is in YARN mode, ensure the configured executor memory
+does not exceed the NodeManager maximum (defined on each nodemanager as
+`yarn.nodemanager.resource.memory-mb` in `yarn-default.xml`).
+
## spark_bench_enabled
Install the SparkBench benchmarking suite. If `true` (the default), this charm
@@ -192,7 +234,7 @@ or `spark_bench_x86_64`, depending on the unit's architecture.
Spark has four modes of execution: local, standalone, yarn-client, and
yarn-cluster. The default mode is `standalone` and can be changed by setting
-the `spark_execution_mode` config variable.
+the `spark_execution_mode` config option.
* **Local**
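Putting the config section above to work: switching the execution mode and resizing memory are both runtime `juju config` calls; a small sketch with illustrative values (yarn-client mode assumes the charm is related to a YARN resourcemanager):

    juju config spark spark_execution_mode=yarn-client
    juju config spark driver_memory=2g executor_memory=2g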
diff --git a/bigtop-packages/src/charm/spark/layer-spark/actions.yaml b/bigtop-packages/src/charm/spark/layer-spark/actions.yaml
index 7f0961d8..d0be216c 100644
--- a/bigtop-packages/src/charm/spark/layer-spark/actions.yaml
+++ b/bigtop-packages/src/charm/spark/layer-spark/actions.yaml
@@ -46,40 +46,32 @@ start-spark-job-history-server:
description: Start the Spark job history server.
stop-spark-job-history-server:
description: Stop the Spark job history server.
-submit:
- description: Submit a job to Spark.
+spark-submit:
+ description: Submit a job to run with the 'spark-submit' utility.
required: ['job']
params:
job:
description: >
URL to a JAR or Python file. This can be any URL supported by
- spark-submit, such as a remote URL, an hdfs:// path (if
- connected to HDFS), etc.
- type: string
- class:
- description: >
- If a JAR is given, this should be the name of the class within
- the JAR to run.
+ spark-submit, such as a local path, remote URL, an hdfs:// path
+ (if connected to HDFS), etc.
type: string
job-args:
- description: Arguments for the job.
- packages:
- description: Comma-separated list of packages to include.
- type: string
- py-files:
- description: Comma-separated list of Python packages to include.
- type: string
- extra-params:
+ description: Arguments required by the job.
+ options:
description: >
- Additional params to pass to spark-submit.
- For example: "--executor-memory 1000M --supervise"
+ Options to pass to spark-submit.
+ For example, "--driver-memory 1000M --class org.foo.MyApp"
type: string
cron:
description: >
Schedule the job to be run periodically, according to the
- given cron rule. For example: "*/5 * * * *" will run the
- job every 5 minutes.
+ given cron rule. For example, "'*/5 * * * *'" will run the
+ job every 5 minutes (note the use of double and single quotes
+ is required to parse this value as a string).
type: string
+submit:
+ description: DEPRECATED, use the spark-submit action instead.
list-jobs:
description: List scheduled periodic jobs.
remove-job:
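Taken together, a one-off submit and a scheduled submit against the reworked actions could look roughly like this; `<id>` stands for whatever id `juju run-action` reports:

    juju run-action spark/0 spark-submit \
        options='--class org.apache.spark.examples.SparkPi' \
        job='/usr/lib/spark/examples/jars/spark-examples.jar' \
        job-args='10'
    juju show-action-output <id>

    # schedule the same job every 5 minutes, then list what is scheduled
    juju run-action spark/0 spark-submit \
        options='--class org.apache.spark.examples.SparkPi' \
        job='/usr/lib/spark/examples/jars/spark-examples.jar' \
        job-args='10' cron="'*/5 * * * *'"
    juju run-action spark/0 list-jobs
    juju show-action-output <id>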
diff --git a/bigtop-packages/src/charm/spark/layer-spark/actions/list-jobs b/bigtop-packages/src/charm/spark/layer-spark/actions/list-jobs
index b6fdf182..31da9db5 100755
--- a/bigtop-packages/src/charm/spark/layer-spark/actions/list-jobs
+++ b/bigtop-packages/src/charm/spark/layer-spark/actions/list-jobs
@@ -15,10 +15,15 @@
# limitations under the License.
set -e
-for line in "$(crontab -lu ubuntu | grep '# mapreduce job: ')"; do
+for line in "$(crontab -lu ubuntu | grep '# action: ')"; do
if [[ -n "$line" ]]; then
- action_id=$(echo "$line" | sed -e 's/.* # mapreduce job: //')
- job_code=$(echo "$line" | sed -e 's/ # mapreduce job: .*//')
+ # the action uuid is after the 'action:' comment
+ action_id=$(echo "$line" | sed -e 's/.* # action: //')
+
+ # the actual job is everything before the 'action:' comment
+ job_code=$(echo "$line" | sed -e 's/ # action: .*//')
+
+ # show the people what they want
action-set job.$action_id="$job_code"
fi
done
diff --git a/bigtop-packages/src/charm/spark/layer-spark/actions/spark-submit b/bigtop-packages/src/charm/spark/layer-spark/actions/spark-submit
new file mode 100755
index 00000000..90bd68ef
--- /dev/null
+++ b/bigtop-packages/src/charm/spark/layer-spark/actions/spark-submit
@@ -0,0 +1,44 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+if ! charms.reactive is_state 'spark.started'; then
+ action-set outcome="failure"
+ action-fail 'Spark not yet ready'
+ exit 1
+fi
+
+job="$(action-get job)"
+job_args="$(action-get job-args)"
+options="$(action-get options)"
+cron="$(action-get cron)"
+
+job_code=". /etc/environment ; spark-submit ${options} ${job} ${job_args}"
+action-set job-code="$job_code"
+
+if [[ -z "$cron" ]]; then
+ su ubuntu -c "$job_code"
+else
+ juju-log "Scheduling job with ID $JUJU_ACTION_UUID"
+ action-set action-id="$JUJU_ACTION_UUID"
+ job_line="$cron $job_code # action: $JUJU_ACTION_UUID"
+
+  # if we don't have a crontab, make an empty one
+ crontab -lu ubuntu > /dev/null || echo -n | crontab -u ubuntu -
+
+  # append our job to the end of any existing crontab
+ (crontab -lu ubuntu; echo "$job_line") | crontab -u ubuntu -
+fi
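For a concrete picture of the cron branch: with `cron` set to `*/5 * * * *` and the SparkPi example from the README, the line appended to the ubuntu user's crontab would look roughly like this (the trailing uuid is a placeholder for $JUJU_ACTION_UUID):

    */5 * * * * . /etc/environment ; spark-submit --class org.apache.spark.examples.SparkPi /usr/lib/spark/examples/jars/spark-examples.jar 10 # action: 6c9a2f4e-...

The `list-jobs` action later greps for the '# action: ' comment to map each scheduled entry back to its action id.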
diff --git a/bigtop-packages/src/charm/spark/layer-spark/actions/submit b/bigtop-packages/src/charm/spark/layer-spark/actions/submit
index ece0b206..3604288b 100755..120000
--- a/bigtop-packages/src/charm/spark/layer-spark/actions/submit
+++ b/bigtop-packages/src/charm/spark/layer-spark/actions/submit
@@ -1,58 +1 @@
-#!/bin/bash
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-if ! charms.reactive is_state 'spark.started'; then
- action-set outcome="failure"
- action-fail 'Spark not yet ready'
- exit 1
-fi
-
-py_files="$(action-get py-files)"
-packages="$(action-get packages)"
-extra_params="$(action-get extra-params)"
-class="$(action-get class)"
-job="$(action-get job)"
-job_args="$(action-get job-args)"
-cron="$(action-get cron)"
-
-submit_args='--deploy-mode cluster'
-if [[ -n "$packages" ]]; then
- submit_args="$submit_args --packages $packages"
-fi
-if [[ -n "$py_files" ]]; then
- submit_args="$submit_args --py-files $py_files"
-fi
-if [[ -n "$extra_params" ]]; then
- submit_args="$submit_args $extra_params"
-fi
-if [[ -n "$class" ]]; then
- submit_args="$submit_args --class $class"
-fi
-submit_args="$submit_args $job"
-
-job_code=". /etc/environment ; spark-submit ${submit_args} ${job_args}"
-action-set job-code="$job_code"
-
-if [[ -z "$cron" ]]; then
- su ubuntu -c "$job_code"
-else
- juju-log "Scheduling job with ID $JUJU_ACTION_UUID"
- action-set action-id="$JUJU_ACTION_UUID"
- job_line="$cron $job_code # $JUJU_ACTION_UUID"
- crontab -lu ubuntu > /dev/null || echo -n | crontab -u ubuntu -
- (crontab -lu ubuntu; echo "$job_line") | crontab -u ubuntu -
-fi
+spark-submit
\ No newline at end of file
diff --git a/bigtop-packages/src/charm/spark/layer-spark/layer.yaml b/bigtop-packages/src/charm/spark/layer-spark/layer.yaml
index 11fc1355..4ee763d3 100644
--- a/bigtop-packages/src/charm/spark/layer-spark/layer.yaml
+++ b/bigtop-packages/src/charm/spark/layer-spark/layer.yaml
@@ -21,12 +21,15 @@ options:
path: '/var/log/spark/apps'
ports:
# Ports that need to be exposed, overridden, or manually specified.
- # Only expose ports serving a UI or external API (i.e., namenode and
- # resourcemanager). Communication among units within the cluster does
- # not need ports to be explicitly opened.
- spark-history:
+ # Only expose ports serving a UI or external API (e.g. spark history
+ # server). IPC among units within a deployment does not typically
+ # require a port to be exposed.
+ spark-history-ui:
port: 18080
exposed_on: 'spark'
- spark-webui:
+ spark-master-ui:
port: 8080
exposed_on: 'spark'
+ spark-worker-ui:
+ port: 8081
+ exposed_on: 'spark'
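With these definitions, the three UIs are reachable once the spark application is exposed; a quick sketch, where `<public-address>` is taken from `juju status spark`:

    juju expose spark
    juju status spark
    # Spark master UI:    http://<public-address>:8080
    # Spark worker UI:    http://<public-address>:8081
    # Job history server: http://<public-address>:18080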
diff --git a/bigtop-packages/src/charm/spark/layer-spark/lib/charms/layer/bigtop_spark.py b/bigtop-packages/src/charm/spark/layer-spark/lib/charms/layer/bigtop_spark.py
index fa8b5e76..990c2903 100755..100644
--- a/bigtop-packages/src/charm/spark/layer-spark/lib/charms/layer/bigtop_spark.py
+++ b/bigtop-packages/src/charm/spark/layer-spark/lib/charms/layer/bigtop_spark.py
@@ -147,7 +147,6 @@ class Spark(object):
self.dist_config.add_users()
self.dist_config.add_dirs()
self.install_demo()
- self.open_ports()
def setup_hdfs_logs(self):
# create hdfs storage space for history server
@@ -326,29 +325,50 @@ class Spark(object):
Path(demo_target).chown('ubuntu', 'hadoop')
def start(self):
- # always start the history server, start master/worker if we're standalone
+ '''
+ Always start the Spark History Server. Start other services as
+ required by our execution mode. Open related ports as appropriate.
+ '''
host.service_start('spark-history-server')
+ hookenv.open_port(self.dist_config.port('spark-history-ui'))
+
+ # Spark master/worker is only started in standalone mode
if hookenv.config()['spark_execution_mode'] == 'standalone':
if host.service_start('spark-master'):
- # If the master started, wait 2m for recovery before starting
- # the worker.
- hookenv.status_set('maintenance',
- 'waiting for spark master recovery')
- hookenv.log("Waiting 2m to ensure spark master is ALIVE")
- time.sleep(120)
+ hookenv.log("Spark Master started")
+ hookenv.open_port(self.dist_config.port('spark-master-ui'))
+ # If the master started and we have peers, wait 2m for recovery
+ # before starting the worker. This ensures the worker binds
+ # to the correct master.
+ if unitdata.kv().get('sparkpeer.units'):
+ hookenv.status_set('maintenance',
+ 'waiting for spark master recovery')
+ hookenv.log("Waiting 2m to ensure spark master is ALIVE")
+ time.sleep(120)
else:
- hookenv.log("Master did not start")
- host.service_start('spark-worker')
+ hookenv.log("Spark Master did not start; this is normal "
+ "for non-leader units in standalone mode")
+
+ # NB: Start the worker even if the master process on this unit
+ # fails to start. In non-HA mode, spark master only runs on the
+ # leader. On non-leader units, we still want a worker bound to
+ # the leader.
+ if host.service_start('spark-worker'):
+ hookenv.log("Spark Worker started")
+ hookenv.open_port(self.dist_config.port('spark-worker-ui'))
+ else:
+ hookenv.log("Spark Worker did not start")
def stop(self):
+ '''
+ Stop all services (and close associated ports). Stopping a service
+ that is not currently running does no harm.
+ '''
host.service_stop('spark-history-server')
- host.service_stop('spark-master')
- host.service_stop('spark-worker')
-
- def open_ports(self):
- for port in self.dist_config.exposed_ports('spark'):
- hookenv.open_port(port)
+ hookenv.close_port(self.dist_config.port('spark-history-ui'))
- def close_ports(self):
- for port in self.dist_config.exposed_ports('spark'):
- hookenv.close_port(port)
+ # Stop the worker before the master
+ host.service_stop('spark-worker')
+ hookenv.close_port(self.dist_config.port('spark-worker-ui'))
+ host.service_stop('spark-master')
+ hookenv.close_port(self.dist_config.port('spark-master-ui'))
diff --git a/bigtop-packages/src/charm/spark/layer-spark/scripts/sparkpi.sh b/bigtop-packages/src/charm/spark/layer-spark/scripts/sparkpi.sh
index 3a345496..d003bbe8 100755
--- a/bigtop-packages/src/charm/spark/layer-spark/scripts/sparkpi.sh
+++ b/bigtop-packages/src/charm/spark/layer-spark/scripts/sparkpi.sh
@@ -15,6 +15,9 @@
# limitations under the License.
set -eu
+JAR_PATH=`find /usr/lib/spark -name spark-examples.jar`
+CMD="spark-submit --class org.apache.spark.examples.SparkPi ${JAR_PATH} 10"
+
echo "Running SparkPi"
-spark-submit --class org.apache.spark.examples.SparkPi /usr/lib/spark/lib/spark-examples-*.jar 10
-echo ""
+echo "Command: ${CMD}"
+${CMD} 2>&1 | grep "Pi is"
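With the final grep in place, a successful smoke run of this script prints only its own echoes plus the result line from the driver output, roughly as follows (the jar path is whatever `find` resolves, and the digits will vary):

    Running SparkPi
    Command: spark-submit --class org.apache.spark.examples.SparkPi /usr/lib/spark/examples/jars/spark-examples.jar 10
    Pi is roughly 3.14...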