From ff963b529603e683ee0cbd8ade54cb4496469f5b Mon Sep 17 00:00:00 2001 From: Kevin W Monroe Date: Thu, 1 Jun 2017 20:57:30 +0000 Subject: BIGTOP-2795: spark charm improvements Closes #229 --- bigtop-deploy/juju/hadoop-spark/README.md | 12 ++--- bigtop-deploy/juju/hadoop-spark/bundle-dev.yaml | 6 ++- bigtop-deploy/juju/hadoop-spark/bundle-local.yaml | 6 ++- bigtop-deploy/juju/hadoop-spark/bundle.yaml | 6 ++- .../src/charm/spark/layer-spark/README.md | 60 ++++++++++++++++++---- .../src/charm/spark/layer-spark/actions.yaml | 34 +++++------- .../src/charm/spark/layer-spark/actions/list-jobs | 11 ++-- .../charm/spark/layer-spark/actions/spark-submit | 44 ++++++++++++++++ .../src/charm/spark/layer-spark/actions/submit | 59 +-------------------- .../src/charm/spark/layer-spark/layer.yaml | 13 +++-- .../layer-spark/lib/charms/layer/bigtop_spark.py | 58 ++++++++++++++------- .../src/charm/spark/layer-spark/scripts/sparkpi.sh | 7 ++- 12 files changed, 190 insertions(+), 126 deletions(-) create mode 100755 bigtop-packages/src/charm/spark/layer-spark/actions/spark-submit mode change 100755 => 120000 bigtop-packages/src/charm/spark/layer-spark/actions/submit mode change 100755 => 100644 bigtop-packages/src/charm/spark/layer-spark/lib/charms/layer/bigtop_spark.py diff --git a/bigtop-deploy/juju/hadoop-spark/README.md b/bigtop-deploy/juju/hadoop-spark/README.md index 13905b3d..edd04634 100644 --- a/bigtop-deploy/juju/hadoop-spark/README.md +++ b/bigtop-deploy/juju/hadoop-spark/README.md @@ -23,16 +23,16 @@ using a simple programming model. Hadoop is designed to scale from a few servers to thousands of machines, each offering local computation and storage. Rather than rely on hardware to deliver high-availability, Hadoop can detect and handle failures at the -application layer. This provides a highly-available service on top of a cluster -of machines, each of which may be prone to failure. +application layer. This provides a highly-available service on top of a +cluster of machines, each of which may be prone to failure. Apache Spark is a fast and general engine for large-scale data processing. Learn more at [spark.apache.org][]. -This bundle provides a complete deployment of Hadoop and Spark components from -[Apache Bigtop][] that performs distributed data processing at scale. Ganglia -and rsyslog applications are also provided to monitor cluster health and syslog -activity. +This bundle provides a complete deployment of Hadoop and Spark components +from [Apache Bigtop][] that performs distributed data processing at scale. +Ganglia and rsyslog applications are also provided to monitor cluster health +and syslog activity. [spark.apache.org]: http://spark.apache.org/ [Apache Bigtop]: http://bigtop.apache.org/ diff --git a/bigtop-deploy/juju/hadoop-spark/bundle-dev.yaml b/bigtop-deploy/juju/hadoop-spark/bundle-dev.yaml index 66136c61..99678f13 100644 --- a/bigtop-deploy/juju/hadoop-spark/bundle-dev.yaml +++ b/bigtop-deploy/juju/hadoop-spark/bundle-dev.yaml @@ -1,3 +1,8 @@ +series: xenial +description: > + This is a five unit big data cluster that includes Hadoop 2.7.3 and Spark 2.1 + from Apache Bigtop. Use it to analyse batch data with MapReduce or streaming + data with Spark. It will run on 5 machines in your cloud. 
services: namenode: charm: "cs:~bigdata-dev/xenial/hadoop-namenode" @@ -79,7 +84,6 @@ services: annotations: gui-x: "750" gui-y: "400" -series: xenial relations: - [resourcemanager, namenode] - [namenode, slave] diff --git a/bigtop-deploy/juju/hadoop-spark/bundle-local.yaml b/bigtop-deploy/juju/hadoop-spark/bundle-local.yaml index 6e08871d..297779a5 100644 --- a/bigtop-deploy/juju/hadoop-spark/bundle-local.yaml +++ b/bigtop-deploy/juju/hadoop-spark/bundle-local.yaml @@ -1,3 +1,8 @@ +series: xenial +description: > + This is a five unit big data cluster that includes Hadoop 2.7.3 and Spark 2.1 + from Apache Bigtop. Use it to analyse batch data with MapReduce or streaming + data with Spark. It will run on 5 machines in your cloud. services: namenode: charm: "/home/ubuntu/charms/xenial/hadoop-namenode" @@ -79,7 +84,6 @@ services: annotations: gui-x: "750" gui-y: "400" -series: xenial relations: - [resourcemanager, namenode] - [namenode, slave] diff --git a/bigtop-deploy/juju/hadoop-spark/bundle.yaml b/bigtop-deploy/juju/hadoop-spark/bundle.yaml index 9458b232..8b5d5a99 100644 --- a/bigtop-deploy/juju/hadoop-spark/bundle.yaml +++ b/bigtop-deploy/juju/hadoop-spark/bundle.yaml @@ -1,3 +1,8 @@ +series: xenial +description: > + This is a five unit big data cluster that includes Hadoop 2.7.3 and Spark 2.1 + from Apache Bigtop. Use it to analyse batch data with MapReduce or streaming + data with Spark. It will run on 5 machines in your cloud. services: namenode: charm: "cs:xenial/hadoop-namenode-19" @@ -79,7 +84,6 @@ services: annotations: gui-x: "750" gui-y: "400" -series: xenial relations: - [resourcemanager, namenode] - [namenode, slave] diff --git a/bigtop-packages/src/charm/spark/layer-spark/README.md b/bigtop-packages/src/charm/spark/layer-spark/README.md index c482ea93..a5ac2c0b 100644 --- a/bigtop-packages/src/charm/spark/layer-spark/README.md +++ b/bigtop-packages/src/charm/spark/layer-spark/README.md @@ -122,7 +122,7 @@ more information about a specific smoke test with: ## Spark Master web UI Spark provides a web console that can be used to verify information about -the cluster. To access it, find the `PUBLIC-ADDRESS` of the spark application +the cluster. To access it, find the `Public address` of the spark application and expose it: juju status spark @@ -142,10 +142,33 @@ address. The job history web interface will be available at the following URL: # Using -Once deployment is verified, Spark batch or streaming jobs can be run in a -variety of ways: +## Actions +Once Spark is ready, there are a number of actions available in this charm. -### Spark shell +Run a benchmark (as described in the **Benchmarking** section): + + juju run-action spark/0 pagerank + juju show-action-output # <-- id from above command + +Run a smoke test (as described in the **Verifying** section): + + juju run-action spark/0 smoke-test + juju show-action-output # <-- id from above command + +Start/Stop/Restart the Spark Job History service: + + juju run-action spark/0 [start|stop|restart]-spark-job-history-server + juju show-action-output # <-- id from above command + +Submit a Spark job: + + juju run-action spark/0 spark-submit \ + options='--class org.apache.spark.examples.SparkPi' \ + job='/usr/lib/spark/examples/jars/spark-examples.jar' \ + job-args='10' + juju show-action-output # <-- id from above command + +## Spark shell Spark shell provides a simple way to learn the API, as well as a powerful tool to analyze data interactively. 
It is available in either Scala or Python and can be run from the Spark unit as follows: @@ -154,14 +177,14 @@ and can be run from the Spark unit as follows: spark-shell # for interaction using scala pyspark # for interaction using python -### Command line +## Command line SSH to the Spark unit and manually run a spark-submit job, for example: juju ssh spark/0 spark-submit --class org.apache.spark.examples.SparkPi \ - --master yarn-client /usr/lib/spark/lib/spark-examples*.jar 10 + /usr/lib/spark/examples/jars/spark-examples.jar 10 -### Apache Zeppelin +## Apache Zeppelin Apache Zeppelin is a web-based notebook that enables interactive data analytics. Make beautiful data-driven, interactive, and collaborative documents with SQL, Scala and more. Deploy Zeppelin and relate it to Spark: @@ -169,7 +192,7 @@ with SQL, Scala and more. Deploy Zeppelin and relate it to Spark: juju deploy zeppelin juju add-relation spark zeppelin -To access the web console, find the `PUBLIC-ADDRESS` of the zeppelin +To access the web console, find the `Public address` of the zeppelin application and expose it: juju status zeppelin @@ -182,6 +205,25 @@ The web interface will be available at the following URL: # Configuring +Charm configuration can be changed at runtime with `juju config`. This charm +supports the following config parameters. + +## driver_memory +Amount of memory available for the Spark driver process (1g by default). +Set a different value with: + + juju config spark driver_memory=4096m + +## executor_memory +Amount of memory available for each Spark executor process (1g by default). +Set a different value with: + + juju config spark executor_memory=2g + +> **Note**: When Spark is in YARN mode, ensure the configured executor memory +does not exceed the NodeManager maximum (defined on each nodemanager as +`yarn.nodemanager.resource.memory-mb` in `yarn-default.xml`). + ## spark_bench_enabled Install the SparkBench benchmarking suite. If `true` (the default), this charm @@ -192,7 +234,7 @@ or `spark_bench_x86_64`, depending on the unit's architecture. Spark has four modes of execution: local, standalone, yarn-client, and yarn-cluster. The default mode is `standalone` and can be changed by setting -the `spark_execution_mode` config variable. +the `spark_execution_mode` config option. * **Local** diff --git a/bigtop-packages/src/charm/spark/layer-spark/actions.yaml b/bigtop-packages/src/charm/spark/layer-spark/actions.yaml index 7f0961d8..d0be216c 100644 --- a/bigtop-packages/src/charm/spark/layer-spark/actions.yaml +++ b/bigtop-packages/src/charm/spark/layer-spark/actions.yaml @@ -46,40 +46,32 @@ start-spark-job-history-server: description: Start the Spark job history server. stop-spark-job-history-server: description: Stop the Spark job history server. -submit: - description: Submit a job to Spark. +spark-submit: + description: Submit a job to run with the 'spark-submit' utility. required: ['job'] params: job: description: > URL to a JAR or Python file. This can be any URL supported by - spark-submit, such as a remote URL, an hdfs:// path (if - connected to HDFS), etc. - type: string - class: - description: > - If a JAR is given, this should be the name of the class within - the JAR to run. + spark-submit, such as a local path, remote URL, an hdfs:// path + (if connected to HDFS), etc. type: string job-args: - description: Arguments for the job. - packages: - description: Comma-separated list of packages to include. 
- type: string - py-files: - description: Comma-separated list of Python packages to include. - type: string - extra-params: + description: Arguments required by the job. + options: description: > - Additional params to pass to spark-submit. - For example: "--executor-memory 1000M --supervise" + Options to pass to spark-submit. + For example, "--driver-memory 1000M --class org.foo.MyApp" type: string cron: description: > Schedule the job to be run periodically, according to the - given cron rule. For example: "*/5 * * * *" will run the - job every 5 minutes. + given cron rule. For example, "'*/5 * * * *'" will run the + job every 5 minutes (note the use of double and single quotes + is required to parse this value as a string). type: string +submit: + description: DEPRECATED, use the spark-submit action instead. list-jobs: description: List scheduled periodic jobs. remove-job: diff --git a/bigtop-packages/src/charm/spark/layer-spark/actions/list-jobs b/bigtop-packages/src/charm/spark/layer-spark/actions/list-jobs index b6fdf182..31da9db5 100755 --- a/bigtop-packages/src/charm/spark/layer-spark/actions/list-jobs +++ b/bigtop-packages/src/charm/spark/layer-spark/actions/list-jobs @@ -15,10 +15,15 @@ # limitations under the License. set -e -for line in "$(crontab -lu ubuntu | grep '# mapreduce job: ')"; do +for line in "$(crontab -lu ubuntu | grep '# action: ')"; do if [[ -n "$line" ]]; then - action_id=$(echo "$line" | sed -e 's/.* # mapreduce job: //') - job_code=$(echo "$line" | sed -e 's/ # mapreduce job: .*//') + # the action uuid is after the 'action:' comment + action_id=$(echo "$line" | sed -e 's/.* # action: //') + + # the actual job is everything before the 'action:' comment + job_code=$(echo "$line" | sed -e 's/ # action: .*//') + + # show the people what they want action-set job.$action_id="$job_code" fi done diff --git a/bigtop-packages/src/charm/spark/layer-spark/actions/spark-submit b/bigtop-packages/src/charm/spark/layer-spark/actions/spark-submit new file mode 100755 index 00000000..90bd68ef --- /dev/null +++ b/bigtop-packages/src/charm/spark/layer-spark/actions/spark-submit @@ -0,0 +1,44 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e + +if ! charms.reactive is_state 'spark.started'; then + action-set outcome="failure" + action-fail 'Spark not yet ready' + exit 1 +fi + +job="$(action-get job)" +job_args="$(action-get job-args)" +options="$(action-get options)" +cron="$(action-get cron)" + +job_code=". 
/etc/environment ; spark-submit ${options} ${job} ${job_args}" +action-set job-code="$job_code" + +if [[ -z "$cron" ]]; then + su ubuntu -c "$job_code" +else + juju-log "Scheduling job with ID $JUJU_ACTION_UUID" + action-set action-id="$JUJU_ACTION_UUID" + job_line="$cron $job_code # action: $JUJU_ACTION_UUID" + + # if we dont have a crontab, make an empty one + crontab -lu ubuntu > /dev/null || echo -n | crontab -u ubuntu - + + # append our job to then end of any existing crontab + (crontab -lu ubuntu; echo "$job_line") | crontab -u ubuntu - +fi diff --git a/bigtop-packages/src/charm/spark/layer-spark/actions/submit b/bigtop-packages/src/charm/spark/layer-spark/actions/submit deleted file mode 100755 index ece0b206..00000000 --- a/bigtop-packages/src/charm/spark/layer-spark/actions/submit +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/bash -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -if ! charms.reactive is_state 'spark.started'; then - action-set outcome="failure" - action-fail 'Spark not yet ready' - exit 1 -fi - -py_files="$(action-get py-files)" -packages="$(action-get packages)" -extra_params="$(action-get extra-params)" -class="$(action-get class)" -job="$(action-get job)" -job_args="$(action-get job-args)" -cron="$(action-get cron)" - -submit_args='--deploy-mode cluster' -if [[ -n "$packages" ]]; then - submit_args="$submit_args --packages $packages" -fi -if [[ -n "$py_files" ]]; then - submit_args="$submit_args --py-files $py_files" -fi -if [[ -n "$extra_params" ]]; then - submit_args="$submit_args $extra_params" -fi -if [[ -n "$class" ]]; then - submit_args="$submit_args --class $class" -fi -submit_args="$submit_args $job" - -job_code=". 
/etc/environment ; spark-submit ${submit_args} ${job_args}" -action-set job-code="$job_code" - -if [[ -z "$cron" ]]; then - su ubuntu -c "$job_code" -else - juju-log "Scheduling job with ID $JUJU_ACTION_UUID" - action-set action-id="$JUJU_ACTION_UUID" - job_line="$cron $job_code # $JUJU_ACTION_UUID" - crontab -lu ubuntu > /dev/null || echo -n | crontab -u ubuntu - - (crontab -lu ubuntu; echo "$job_line") | crontab -u ubuntu - -fi diff --git a/bigtop-packages/src/charm/spark/layer-spark/actions/submit b/bigtop-packages/src/charm/spark/layer-spark/actions/submit new file mode 120000 index 00000000..3604288b --- /dev/null +++ b/bigtop-packages/src/charm/spark/layer-spark/actions/submit @@ -0,0 +1 @@ +spark-submit \ No newline at end of file diff --git a/bigtop-packages/src/charm/spark/layer-spark/layer.yaml b/bigtop-packages/src/charm/spark/layer-spark/layer.yaml index 11fc1355..4ee763d3 100644 --- a/bigtop-packages/src/charm/spark/layer-spark/layer.yaml +++ b/bigtop-packages/src/charm/spark/layer-spark/layer.yaml @@ -21,12 +21,15 @@ options: path: '/var/log/spark/apps' ports: # Ports that need to be exposed, overridden, or manually specified. - # Only expose ports serving a UI or external API (i.e., namenode and - # resourcemanager). Communication among units within the cluster does - # not need ports to be explicitly opened. - spark-history: + # Only expose ports serving a UI or external API (e.g. spark history + # server). IPC among units within a deployment does not typically + # require a port to be exposed. + spark-history-ui: port: 18080 exposed_on: 'spark' - spark-webui: + spark-master-ui: port: 8080 exposed_on: 'spark' + spark-worker-ui: + port: 8081 + exposed_on: 'spark' diff --git a/bigtop-packages/src/charm/spark/layer-spark/lib/charms/layer/bigtop_spark.py b/bigtop-packages/src/charm/spark/layer-spark/lib/charms/layer/bigtop_spark.py old mode 100755 new mode 100644 index fa8b5e76..990c2903 --- a/bigtop-packages/src/charm/spark/layer-spark/lib/charms/layer/bigtop_spark.py +++ b/bigtop-packages/src/charm/spark/layer-spark/lib/charms/layer/bigtop_spark.py @@ -147,7 +147,6 @@ class Spark(object): self.dist_config.add_users() self.dist_config.add_dirs() self.install_demo() - self.open_ports() def setup_hdfs_logs(self): # create hdfs storage space for history server @@ -326,29 +325,50 @@ class Spark(object): Path(demo_target).chown('ubuntu', 'hadoop') def start(self): - # always start the history server, start master/worker if we're standalone + ''' + Always start the Spark History Server. Start other services as + required by our execution mode. Open related ports as appropriate. + ''' host.service_start('spark-history-server') + hookenv.open_port(self.dist_config.port('spark-history-ui')) + + # Spark master/worker is only started in standalone mode if hookenv.config()['spark_execution_mode'] == 'standalone': if host.service_start('spark-master'): - # If the master started, wait 2m for recovery before starting - # the worker. - hookenv.status_set('maintenance', - 'waiting for spark master recovery') - hookenv.log("Waiting 2m to ensure spark master is ALIVE") - time.sleep(120) + hookenv.log("Spark Master started") + hookenv.open_port(self.dist_config.port('spark-master-ui')) + # If the master started and we have peers, wait 2m for recovery + # before starting the worker. This ensures the worker binds + # to the correct master. 
+ if unitdata.kv().get('sparkpeer.units'): + hookenv.status_set('maintenance', + 'waiting for spark master recovery') + hookenv.log("Waiting 2m to ensure spark master is ALIVE") + time.sleep(120) else: - hookenv.log("Master did not start") - host.service_start('spark-worker') + hookenv.log("Spark Master did not start; this is normal " + "for non-leader units in standalone mode") + + # NB: Start the worker even if the master process on this unit + # fails to start. In non-HA mode, spark master only runs on the + # leader. On non-leader units, we still want a worker bound to + # the leader. + if host.service_start('spark-worker'): + hookenv.log("Spark Worker started") + hookenv.open_port(self.dist_config.port('spark-worker-ui')) + else: + hookenv.log("Spark Worker did not start") def stop(self): + ''' + Stop all services (and close associated ports). Stopping a service + that is not currently running does no harm. + ''' host.service_stop('spark-history-server') - host.service_stop('spark-master') - host.service_stop('spark-worker') - - def open_ports(self): - for port in self.dist_config.exposed_ports('spark'): - hookenv.open_port(port) + hookenv.close_port(self.dist_config.port('spark-history-ui')) - def close_ports(self): - for port in self.dist_config.exposed_ports('spark'): - hookenv.close_port(port) + # Stop the worker before the master + host.service_stop('spark-worker') + hookenv.close_port(self.dist_config.port('spark-worker-ui')) + host.service_stop('spark-master') + hookenv.close_port(self.dist_config.port('spark-master-ui')) diff --git a/bigtop-packages/src/charm/spark/layer-spark/scripts/sparkpi.sh b/bigtop-packages/src/charm/spark/layer-spark/scripts/sparkpi.sh index 3a345496..d003bbe8 100755 --- a/bigtop-packages/src/charm/spark/layer-spark/scripts/sparkpi.sh +++ b/bigtop-packages/src/charm/spark/layer-spark/scripts/sparkpi.sh @@ -15,6 +15,9 @@ # limitations under the License. set -eu +JAR_PATH=`find /usr/lib/spark -name spark-examples.jar` +CMD="spark-submit --class org.apache.spark.examples.SparkPi ${JAR_PATH} 10" + echo "Running SparkPi" -spark-submit --class org.apache.spark.examples.SparkPi /usr/lib/spark/lib/spark-examples-*.jar 10 -echo "" +echo "Command: ${CMD}" +${CMD} 2>&1 | grep "Pi is" -- cgit v1.2.3
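
As a usage sketch of the scheduling flow described above (the action id below is a made-up placeholder; the invocation mirrors the README example, with the cron quoting noted in actions.yaml), submitting a job with the `cron` option results in a tagged crontab entry on the spark unit:

    # Schedule SparkPi to run every 5 minutes:
    juju run-action spark/0 spark-submit \
        options='--class org.apache.spark.examples.SparkPi' \
        job='/usr/lib/spark/examples/jars/spark-examples.jar' \
        job-args='10' \
        cron="'*/5 * * * *'"

    # The spark-submit action appends an entry of this shape to the 'ubuntu'
    # user's crontab, tagged with the action uuid (placeholder shown here):
    #   */5 * * * * . /etc/environment ; spark-submit --class org.apache.spark.examples.SparkPi /usr/lib/spark/examples/jars/spark-examples.jar 10 # action: 5d3c9e2a-...
    #
    # 'list-jobs' reports each tagged entry keyed by its action id, and
    # 'remove-job' removes a previously scheduled entry.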