author    Kevin W Monroe <kevin.monroe@canonical.com>  2017-06-07 16:21:50 +0000
committer Kevin W Monroe <kevin.monroe@canonical.com>  2017-06-28 12:00:08 -0500
commit    b12aa51067b323fb0001aa7f57d9170b643fe073 (patch)
tree      5bdf0056b4d6e62ff8d902ec8fb678e697aaa344
parent    e14693842b0ccadcbed37a08f6a1c4077ee8a406 (diff)
BIGTOP-2822: spark charm: leverage puppet config, gpu enablement
Closes #239
-rw-r--r--  bigtop-packages/src/charm/spark/layer-spark/README.md                         | 17
-rw-r--r--  bigtop-packages/src/charm/spark/layer-spark/lib/charms/layer/bigtop_spark.py  | 89
-rw-r--r--  bigtop-packages/src/charm/spark/layer-spark/reactive/spark.py                 | 61
3 files changed, 116 insertions(+), 51 deletions(-)
diff --git a/bigtop-packages/src/charm/spark/layer-spark/README.md b/bigtop-packages/src/charm/spark/layer-spark/README.md
index a5ac2c0b..f7feaab7 100644
--- a/bigtop-packages/src/charm/spark/layer-spark/README.md
+++ b/bigtop-packages/src/charm/spark/layer-spark/README.md
@@ -62,7 +62,7 @@ the unit acting as master, query Zookeeper as follows:
juju run --unit zookeeper/0 'echo "get /spark/master_status" | /usr/lib/zookeeper/bin/zkCli.sh'
-### YARN-client and YARN-cluster
+### YARN
This charm leverages our pluggable Hadoop model with the `hadoop-plugin`
interface. This means that this charm can be related to an Apache Hadoop
cluster to run Spark jobs there. The suggested deployment method is to use the
@@ -224,14 +224,25 @@ Set a different value with:
does not exceed the NodeManager maximum (defined on each nodemanager as
`yarn.nodemanager.resource.memory-mb` in `yarn-default.xml`).
-## spark_bench_enabled
+## install-cuda
+Provided by `layer-nvidia-cuda`, this option controls the installation
+of NVIDIA CUDA packages if capable GPU hardware is present. When `false` (the
+default), CUDA will not be installed or configured regardless of hardware
+support. Set this to `true` to fetch and install CUDA-related packages from
+the NVIDIA developer repository.
+
+ juju config spark install-cuda=true
+
+> **Note**: This option requires external network access to
+http://developer.download.nvidia.com/. Ensure appropriate proxies are
+configured if needed.
+## spark_bench_enabled
Install the SparkBench benchmarking suite. If `true` (the default), this charm
will download spark bench from the URL specified by `spark_bench_ppc64le`
or `spark_bench_x86_64`, depending on the unit's architecture.
## spark_execution_mode
-
Spark has four modes of execution: local, standalone, yarn-client, and
yarn-cluster. The default mode is `standalone` and can be changed by setting
the `spark_execution_mode` config option.
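
For example, to run jobs on a related Hadoop cluster in client mode (assuming the
application is deployed under the name `spark`, as in the examples above):

    juju config spark spark_execution_mode=yarn-client
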
diff --git a/bigtop-packages/src/charm/spark/layer-spark/lib/charms/layer/bigtop_spark.py b/bigtop-packages/src/charm/spark/layer-spark/lib/charms/layer/bigtop_spark.py
index 990c2903..91fcbf79 100644
--- a/bigtop-packages/src/charm/spark/layer-spark/lib/charms/layer/bigtop_spark.py
+++ b/bigtop-packages/src/charm/spark/layer-spark/lib/charms/layer/bigtop_spark.py
@@ -36,7 +36,7 @@ class Spark(object):
mode = hookenv.config()['spark_execution_mode']
zk_units = unitdata.kv().get('zookeeper.units', [])
master = None
- if mode.startswith('local') or mode == 'yarn-cluster':
+ if mode.startswith('local') or mode.startswith('yarn'):
master = mode
elif mode == 'standalone' and not zk_units:
master = 'spark://{}:7077'.format(spark_master_host)
@@ -47,8 +47,6 @@ class Spark(object):
nodes.append('{}:7077'.format(ip))
nodes_str = ','.join(nodes)
master = 'spark://{}'.format(nodes_str)
- elif mode.startswith('yarn'):
- master = 'yarn-client'
return master
def install_benchmark(self):
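
To make the effect of this change concrete, here is a minimal sketch of the mapping
(a hypothetical stand-alone helper, not the charm's method; the Zookeeper/HA branch
shown above is omitted): yarn-client and yarn-cluster now pass through as the master
string instead of being collapsed to `yarn-client`.

    # Hypothetical stand-alone sketch of the master-URL mapping above.
    def master_url(mode, master_host):
        if mode.startswith('local') or mode.startswith('yarn'):
            # 'local', 'local[*]', 'yarn-client', and 'yarn-cluster' are now
            # returned verbatim rather than coerced to 'yarn-client'.
            return mode
        # Standalone (non-HA) still points at the elected master.
        return 'spark://{}:7077'.format(master_host)

    print(master_url('yarn-cluster', '10.0.0.5'))  # -> yarn-cluster
    print(master_url('standalone', '10.0.0.5'))    # -> spark://10.0.0.5:7077
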
@@ -149,7 +147,8 @@ class Spark(object):
self.install_demo()
def setup_hdfs_logs(self):
- # create hdfs storage space for history server
+ # Create hdfs storage space for history server and return the name
+ # of the created directory.
dc = self.dist_config
events_dir = dc.path('spark_events')
events_dir = 'hdfs://{}'.format(events_dir)
@@ -159,18 +158,14 @@ class Spark(object):
events_dir)
return events_dir
- def configure(self, available_hosts, zk_units, peers):
+ def configure(self, available_hosts, zk_units, peers, extra_libs):
"""
This is the core logic of setting up spark.
- Two flags are needed:
-
- * Namenode exists aka HDFS is ready
- * Resource manager exists aka YARN is ready
-
- both flags are infered from the available hosts.
-
:param dict available_hosts: Hosts that Spark should know about.
+ :param list zk_units: List of Zookeeper dicts with host/port info.
+ :param list peers: List of Spark peer tuples (unit name, IP).
+ :param list extra_libs: List of extra lib paths for driver/executors.
"""
# Bootstrap spark
if not unitdata.kv().get('spark.bootstrapped', False):
@@ -188,6 +183,33 @@ class Spark(object):
mode = hookenv.config()['spark_execution_mode']
master_ip = utils.resolve_private_address(available_hosts['spark-master'])
master_url = self.get_master_url(master_ip)
+ req_driver_mem = hookenv.config()['driver_memory']
+ req_executor_mem = hookenv.config()['executor_memory']
+
+ # handle tuning options that may be set as percentages
+ driver_mem = '1g'
+ executor_mem = '1g'
+ if req_driver_mem.endswith('%'):
+ if mode == 'standalone' or mode.startswith('local'):
+ mem_mb = host.get_total_ram() / 1024 / 1024
+ req_percentage = float(req_driver_mem.strip('%')) / 100
+ driver_mem = str(int(mem_mb * req_percentage)) + 'm'
+ else:
+ hookenv.log("driver_memory percentage in non-local mode. Using 1g default.",
+ level=None)
+ else:
+ driver_mem = req_driver_mem
+
+ if req_executor_mem.endswith('%'):
+ if mode == 'standalone' or mode.startswith('local'):
+ mem_mb = host.get_total_ram() / 1024 / 1024
+ req_percentage = float(req_executor_mem.strip('%')) / 100
+ executor_mem = str(int(mem_mb * req_percentage)) + 'm'
+ else:
+ hookenv.log("executor_memory percentage in non-local mode. Using 1g default.",
+ level=None)
+ else:
+ executor_mem = req_executor_mem
# Setup hosts dict
hosts = {
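
The percentage handling above reduces to the following stand-alone sketch
(hypothetical helper; per the `/ 1024 / 1024` in the diff, `host.get_total_ram()` is
assumed to return bytes): a value such as `50%` becomes an absolute megabyte figure
in local and standalone modes, while YARN modes fall back to the 1g default.

    # Hypothetical helper mirroring the percentage logic above.
    def resolve_mem(requested, mode, total_ram_bytes, default='1g'):
        if not requested.endswith('%'):
            return requested                      # absolute value, e.g. '2g'
        if mode == 'standalone' or mode.startswith('local'):
            mem_mb = total_ram_bytes / 1024 / 1024
            fraction = float(requested.strip('%')) / 100
            return str(int(mem_mb * fraction)) + 'm'
        return default                            # percentages ignored on YARN

    print(resolve_mem('50%', 'standalone', 8 * 1024 ** 3))  # -> 4096m
    print(resolve_mem('2g', 'yarn-client', 8 * 1024 ** 3))  # -> 2g
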
@@ -196,7 +218,11 @@ class Spark(object):
if 'namenode' in available_hosts:
hosts['namenode'] = available_hosts['namenode']
events_log_dir = self.setup_hdfs_logs()
-
+ else:
+ # Bigtop includes a default hadoop_head_node if we do not specify
+ # any namenode info. To ensure spark standalone doesn't get
+ # invalid hadoop config, set our NN to an empty string.
+ hosts['namenode'] = ''
if 'resourcemanager' in available_hosts:
hosts['resourcemanager'] = available_hosts['resourcemanager']
@@ -215,6 +241,10 @@ class Spark(object):
'spark::common::master_url': master_url,
'spark::common::event_log_dir': events_log_dir,
'spark::common::history_log_dir': events_log_dir,
+ 'spark::common::extra_lib_dirs':
+ ':'.join(extra_libs) if extra_libs else None,
+ 'spark::common::driver_mem': driver_mem,
+ 'spark::common::executor_mem': executor_mem,
}
if zk_units:
zks = []
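
For illustration, with CUDA installed the new overrides might look like the
following (all values here are made up; only the hiera keys come from the diff):
the extra library paths collapse into one colon-separated string, or `None` when
the list is empty.

    # Hypothetical example values; only the keys are taken from the diff above.
    extra_libs = ['/usr/lib/hadoop/lib/native', '/usr/local/cuda/lib64']
    override = {
        'spark::common::master_url': 'spark://10.0.0.5:7077',
        'spark::common::extra_lib_dirs':
            ':'.join(extra_libs) if extra_libs else None,
        'spark::common::driver_mem': '2048m',
        'spark::common::executor_mem': '2048m',
    }
    # override['spark::common::extra_lib_dirs'] ==
    #     '/usr/lib/hadoop/lib/native:/usr/local/cuda/lib64'
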
@@ -242,39 +272,6 @@ class Spark(object):
self.patch_worker_master_url(master_ip, master_url)
- # handle tuning options that may be set as percentages
- driver_mem = '1g'
- req_driver_mem = hookenv.config()['driver_memory']
- executor_mem = '1g'
- req_executor_mem = hookenv.config()['executor_memory']
- if req_driver_mem.endswith('%'):
- if mode == 'standalone' or mode.startswith('local'):
- mem_mb = host.get_total_ram() / 1024 / 1024
- req_percentage = float(req_driver_mem.strip('%')) / 100
- driver_mem = str(int(mem_mb * req_percentage)) + 'm'
- else:
- hookenv.log("driver_memory percentage in non-local mode. Using 1g default.",
- level=None)
- else:
- driver_mem = req_driver_mem
-
- if req_executor_mem.endswith('%'):
- if mode == 'standalone' or mode.startswith('local'):
- mem_mb = host.get_total_ram() / 1024 / 1024
- req_percentage = float(req_executor_mem.strip('%')) / 100
- executor_mem = str(int(mem_mb * req_percentage)) + 'm'
- else:
- hookenv.log("executor_memory percentage in non-local mode. Using 1g default.",
- level=None)
- else:
- executor_mem = req_executor_mem
-
- spark_env = '/etc/spark/conf/spark-env.sh'
- utils.re_edit_in_place(spark_env, {
- r'.*SPARK_DRIVER_MEMORY.*': 'export SPARK_DRIVER_MEMORY={}'.format(driver_mem),
- r'.*SPARK_EXECUTOR_MEMORY.*': 'export SPARK_EXECUTOR_MEMORY={}'.format(executor_mem),
- }, append_non_matches=True)
-
# Install SB (subsequent calls will reconfigure existing install)
# SparkBench looks for the spark master in /etc/environment
with utils.environment_edit_in_place('/etc/environment') as env:
diff --git a/bigtop-packages/src/charm/spark/layer-spark/reactive/spark.py b/bigtop-packages/src/charm/spark/layer-spark/reactive/spark.py
index b6b0ca7a..fc74fa10 100644
--- a/bigtop-packages/src/charm/spark/layer-spark/reactive/spark.py
+++ b/bigtop-packages/src/charm/spark/layer-spark/reactive/spark.py
@@ -38,6 +38,9 @@ def report_status():
elif mode == 'standalone' and is_state('leadership.is_leader'):
mode = mode + " - master"
+ if is_state('spark.cuda.configured'):
+ mode = mode + " with CUDA"
+
if is_state('spark.started'):
hookenv.status_set('active', 'ready ({})'.format(mode))
else:
@@ -74,8 +77,15 @@ def install_spark_standalone(zks, peers):
hookenv.log("Waiting 2m to ensure zk ensemble has settled: {}".format(zks))
time.sleep(120)
+ # Let spark know if we have cuda libs installed.
+ # NB: spark packages prereq hadoop (boo), so even in standalone mode, we'll
+ # have hadoop libs installed. May as well include them in our lib path.
+ extra_libs = ["/usr/lib/hadoop/lib/native"]
+ if is_state('cuda.installed'):
+ extra_libs.append("/usr/local/cuda/lib64")
+
spark = Spark()
- spark.configure(hosts, zks, peers)
+ spark.configure(hosts, zk_units=zks, peers=peers, extra_libs=extra_libs)
set_deployment_mode_state('spark.standalone.installed')
@@ -98,8 +108,13 @@ def install_spark_yarn():
nns = hadoop.namenodes()
hosts['namenode'] = nns[0]
+ # Always include native hadoop libs in yarn mode; add cuda libs if present.
+ extra_libs = ["/usr/lib/hadoop/lib/native"]
+ if is_state('cuda.installed'):
+ extra_libs.append("/usr/local/cuda/lib64")
+
spark = Spark()
- spark.configure(hosts, zk_units=None, peers=None)
+ spark.configure(hosts, zk_units=None, peers=None, extra_libs=extra_libs)
set_deployment_mode_state('spark.yarn.installed')
@@ -160,6 +175,7 @@ def reinstall_spark():
# If neither config nor our matrix is changing, there is nothing to do.
if not (is_state('config.changed') or
data_changed('deployment_matrix', deployment_matrix)):
+ report_status()
return
# (Re)install based on our execution mode
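
The guard above hinges on `data_changed` from `charms.reactive.helpers`, which
persists a hash of the value it is given and returns True only when that value
differs from the previous call. A rough sketch (the matrix keys below are assumed,
not the charm's actual ones):

    from charms.reactive.helpers import data_changed

    # Assumed example; the real deployment_matrix is built earlier in
    # reinstall_spark() and is not visible in this hunk.
    matrix = {'hdfs_ready': True, 'yarn_ready': False, 'zk_units': []}

    if data_changed('deployment_matrix', matrix):
        print('matrix changed (or first run): reconfigure spark')
    else:
        print('no change: just refresh the unit status')
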
@@ -197,6 +213,47 @@ def leader_elected():
set_state("master.elected")
+@when('spark.started', 'cuda.installed')
+@when_not('spark.cuda.configured')
+def configure_cuda():
+ """
+ Ensure cuda bits are configured.
+
+ We can't be sure that the config.changed handler in the nvidia-cuda
+ layer will fire before the handler in this layer. We might call
+ reinstall_spark on config-changed before the cuda.installed state is set,
+ thereby missing the cuda lib path configuration. Deal with this by
+ explicitly calling reinstall_spark after we *know* cuda.installed is set.
+ This may result in 2 calls to reinstall_spark when cuda-related config
+ changes, but it ensures our spark lib config is accurate.
+ """
+ hookenv.log("Configuring spark with CUDA library paths")
+ reinstall_spark()
+ set_state('spark.cuda.configured')
+ report_status()
+
+
+@when('spark.started', 'spark.cuda.configured')
+@when_not('cuda.installed')
+def unconfigure_cuda():
+ """
+ Ensure cuda bits are unconfigured.
+
+ Similar to the configure_cuda method, we can't be sure that the
+ config.changed handler in the nvidia-cuda layer will fire before the
+ handler in this layer. We might call reinstall_spark on config-changed
+ before the cuda.installed state is removed, thereby configuring spark with
+ a cuda lib path when the user wanted cuda config removed. Deal with this by
+ explicitly calling reinstall_spark after we *know* cuda.installed is gone.
+ This may result in 2 calls to reinstall_spark when cuda-related config
+ changes, but it ensures our spark lib config is accurate.
+ """
+ hookenv.log("Removing CUDA library paths from spark configuration")
+ reinstall_spark()
+ remove_state('spark.cuda.configured')
+ report_status()
+
+
@when('spark.started', 'client.joined')
def client_present(client):
if is_state('leadership.is_leader'):