107 files changed, 10755 insertions, 1782 deletions
diff --git a/drivers/acpi/ec_sys.c b/drivers/acpi/ec_sys.c
index b4c216bab22b..bea8e425a8de 100644
--- a/drivers/acpi/ec_sys.c
+++ b/drivers/acpi/ec_sys.c
@@ -128,7 +128,7 @@ static int acpi_ec_add_debugfs(struct acpi_ec *ec, unsigned int ec_device_count)
 	if (!debugfs_create_x32("gpe", 0444, dev_dir, (u32 *)&first_ec->gpe))
 		goto error;
 	if (!debugfs_create_bool("use_global_lock", 0444, dev_dir,
-				 (u32 *)&first_ec->global_lock))
+				 &first_ec->global_lock))
 		goto error;
 
 	if (write_support)
diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h
index 7db7f9dd7c47..136b7d5159d4 100644
--- a/drivers/acpi/internal.h
+++ b/drivers/acpi/internal.h
@@ -135,7 +135,7 @@ struct acpi_ec {
 	unsigned long gpe;
 	unsigned long command_addr;
 	unsigned long data_addr;
-	unsigned long global_lock;
+	bool global_lock;
 	unsigned long flags;
 	unsigned long reference_count;
 	struct mutex mutex;
diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c
index d24fa1964eb8..6d4e44ea74ac 100644
--- a/drivers/acpi/thermal.c
+++ b/drivers/acpi/thermal.c
@@ -800,7 +800,8 @@ static int acpi_thermal_cooling_device_cb(struct thermal_zone_device *thermal,
 				result =
 					thermal_zone_bind_cooling_device
 					(thermal, trip, cdev,
-					 THERMAL_NO_LIMIT, THERMAL_NO_LIMIT);
+					 THERMAL_NO_LIMIT, THERMAL_NO_LIMIT,
+					 THERMAL_WEIGHT_DEFAULT);
 			else
 				result =
 					thermal_zone_unbind_cooling_device
@@ -824,7 +825,8 @@ static int acpi_thermal_cooling_device_cb(struct thermal_zone_device *thermal,
 			if (bind)
 				result = thermal_zone_bind_cooling_device
 					(thermal, trip, cdev,
-					 THERMAL_NO_LIMIT, THERMAL_NO_LIMIT);
+					 THERMAL_NO_LIMIT, THERMAL_NO_LIMIT,
+					 THERMAL_WEIGHT_DEFAULT);
 			else
 				result = thermal_zone_unbind_cooling_device
 					(thermal, trip, cdev);
@@ -841,7 +843,8 @@ static int acpi_thermal_cooling_device_cb(struct thermal_zone_device *thermal,
 				result = thermal_zone_bind_cooling_device
 						(thermal, THERMAL_TRIPS_NONE,
 						 cdev, THERMAL_NO_LIMIT,
-						 THERMAL_NO_LIMIT);
+						 THERMAL_NO_LIMIT,
+						 THERMAL_WEIGHT_DEFAULT);
 			else
 				result = thermal_zone_unbind_cooling_device
 						(thermal, THERMAL_TRIPS_NONE,
diff --git a/drivers/base/power/Makefile b/drivers/base/power/Makefile
index 1cb8544598d5..49fddd1964cc 100644
--- a/drivers/base/power/Makefile
+++ b/drivers/base/power/Makefile
@@ -1,7 +1,7 @@
 obj-$(CONFIG_PM)	+= sysfs.o generic_ops.o common.o qos.o runtime.o
 obj-$(CONFIG_PM_SLEEP)	+= main.o wakeup.o
 obj-$(CONFIG_PM_TRACE_RTC)	+= trace.o
-obj-$(CONFIG_PM_OPP)	+= opp.o
+obj-$(CONFIG_PM_OPP)	+= opp/
 obj-$(CONFIG_PM_GENERIC_DOMAINS)	+=  domain.o domain_governor.o
 obj-$(CONFIG_HAVE_CLK)	+= clock_ops.o
 
diff --git a/drivers/base/power/opp.c b/drivers/base/power/opp.c
deleted file mode 100644
index 677fb2843553..000000000000
--- a/drivers/base/power/opp.c
+++ /dev/null
@@ -1,927 +0,0 @@
-/*
- * Generic OPP Interface
- *
- * Copyright (C) 2009-2010 Texas Instruments Incorporated.
- *	Nishanth Menon
- *	Romit Dasgupta
- *	Kevin Hilman
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-#include <linux/device.h>
-#include <linux/list.h>
-#include <linux/rculist.h>
-#include <linux/rcupdate.h>
-#include <linux/pm_opp.h>
-#include <linux/of.h>
-#include <linux/export.h>
-
-/*
- * Internal data structure organization with the OPP layer library is as
- * follows:
- * dev_opp_list (root)
- *	|- device 1 (represents voltage domain 1)
- *	|	|- opp 1 (availability, freq, voltage)
- *	|	|- opp 2 ..
- *	...	...
- *	|	`- opp n ..
- *	|- device 2 (represents the next voltage domain)
- *	...
- *	`- device m (represents mth voltage domain)
- * device 1, 2.. are represented by dev_opp structure while each opp
- * is represented by the opp structure.
- */
-
-/**
- * struct dev_pm_opp - Generic OPP description structure
- * @node:	opp list node. The nodes are maintained throughout the lifetime
- *		of boot. It is expected only an optimal set of OPPs are
- *		added to the library by the SoC framework.
- *		RCU usage: opp list is traversed with RCU locks. node
- *		modification is possible realtime, hence the modifications
- *		are protected by the dev_opp_list_lock for integrity.
- *		IMPORTANT: the opp nodes should be maintained in increasing
- *		order.
- * @dynamic:	not-created from static DT entries.
- * @available:	true/false - marks if this OPP as available or not
- * @rate:	Frequency in hertz
- * @u_volt:	Nominal voltage in microvolts corresponding to this OPP
- * @dev_opp:	points back to the device_opp struct this opp belongs to
- * @rcu_head:	RCU callback head used for deferred freeing
- *
- * This structure stores the OPP information for a given device.
- */
-struct dev_pm_opp {
-	struct list_head node;
-
-	bool available;
-	bool dynamic;
-	unsigned long rate;
-	unsigned long u_volt;
-
-	struct device_opp *dev_opp;
-	struct rcu_head rcu_head;
-};
-
-/**
- * struct device_opp - Device opp structure
- * @node:	list node - contains the devices with OPPs that
- *		have been registered. Nodes once added are not modified in this
- *		list.
- *		RCU usage: nodes are not modified in the list of device_opp,
- *		however addition is possible and is secured by dev_opp_list_lock
- * @dev:	device pointer
- * @srcu_head:	notifier head to notify the OPP availability changes.
- * @rcu_head:	RCU callback head used for deferred freeing
- * @opp_list:	list of opps
- *
- * This is an internal data structure maintaining the link to opps attached to
- * a device. This structure is not meant to be shared to users as it is
- * meant for book keeping and private to OPP library.
- *
- * Because the opp structures can be used from both rcu and srcu readers, we
- * need to wait for the grace period of both of them before freeing any
- * resources. And so we have used kfree_rcu() from within call_srcu() handlers.
- */
-struct device_opp {
-	struct list_head node;
-
-	struct device *dev;
-	struct srcu_notifier_head srcu_head;
-	struct rcu_head rcu_head;
-	struct list_head opp_list;
-};
-
-/*
- * The root of the list of all devices. All device_opp structures branch off
- * from here, with each device_opp containing the list of opp it supports in
- * various states of availability.
- */
-static LIST_HEAD(dev_opp_list);
-/* Lock to allow exclusive modification to the device and opp lists */
-static DEFINE_MUTEX(dev_opp_list_lock);
-
-#define opp_rcu_lockdep_assert()					\
-do {									\
-	rcu_lockdep_assert(rcu_read_lock_held() ||			\
-				lockdep_is_held(&dev_opp_list_lock),	\
-			   "Missing rcu_read_lock() or "		\
-			   "dev_opp_list_lock protection");		\
-} while (0)
-
-/**
- * _find_device_opp() - find device_opp struct using device pointer
- * @dev:	device pointer used to lookup device OPPs
- *
- * Search list of device OPPs for one containing matching device. Does a RCU
- * reader operation to grab the pointer needed.
- *
- * Return: pointer to 'struct device_opp' if found, otherwise -ENODEV or
- * -EINVAL based on type of error.
- *
- * Locking: This function must be called under rcu_read_lock(). device_opp
- * is a RCU protected pointer. This means that device_opp is valid as long
- * as we are under RCU lock.
- */
-static struct device_opp *_find_device_opp(struct device *dev)
-{
-	struct device_opp *tmp_dev_opp, *dev_opp = ERR_PTR(-ENODEV);
-
-	if (unlikely(IS_ERR_OR_NULL(dev))) {
-		pr_err("%s: Invalid parameters\n", __func__);
-		return ERR_PTR(-EINVAL);
-	}
-
-	list_for_each_entry_rcu(tmp_dev_opp, &dev_opp_list, node) {
-		if (tmp_dev_opp->dev == dev) {
-			dev_opp = tmp_dev_opp;
-			break;
-		}
-	}
-
-	return dev_opp;
-}
-
-/**
- * dev_pm_opp_get_voltage() - Gets the voltage corresponding to an available opp
- * @opp:	opp for which voltage has to be returned for
- *
- * Return: voltage in micro volt corresponding to the opp, else
- * return 0
- *
- * Locking: This function must be called under rcu_read_lock(). opp is a rcu
- * protected pointer. This means that opp which could have been fetched by
- * opp_find_freq_{exact,ceil,floor} functions is valid as long as we are
- * under RCU lock. The pointer returned by the opp_find_freq family must be
- * used in the same section as the usage of this function with the pointer
- * prior to unlocking with rcu_read_unlock() to maintain the integrity of the
- * pointer.
- */
-unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp)
-{
-	struct dev_pm_opp *tmp_opp;
-	unsigned long v = 0;
-
-	opp_rcu_lockdep_assert();
-
-	tmp_opp = rcu_dereference(opp);
-	if (unlikely(IS_ERR_OR_NULL(tmp_opp)) || !tmp_opp->available)
-		pr_err("%s: Invalid parameters\n", __func__);
-	else
-		v = tmp_opp->u_volt;
-
-	return v;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_voltage);
-
-/**
- * dev_pm_opp_get_freq() - Gets the frequency corresponding to an available opp
- * @opp:	opp for which frequency has to be returned for
- *
- * Return: frequency in hertz corresponding to the opp, else
- * return 0
- *
- * Locking: This function must be called under rcu_read_lock(). opp is a rcu
- * protected pointer. This means that opp which could have been fetched by
- * opp_find_freq_{exact,ceil,floor} functions is valid as long as we are
- * under RCU lock. The pointer returned by the opp_find_freq family must be
- * used in the same section as the usage of this function with the pointer
- * prior to unlocking with rcu_read_unlock() to maintain the integrity of the
- * pointer.
- */
-unsigned long dev_pm_opp_get_freq(struct dev_pm_opp *opp)
-{
-	struct dev_pm_opp *tmp_opp;
-	unsigned long f = 0;
-
-	opp_rcu_lockdep_assert();
-
-	tmp_opp = rcu_dereference(opp);
-	if (unlikely(IS_ERR_OR_NULL(tmp_opp)) || !tmp_opp->available)
-		pr_err("%s: Invalid parameters\n", __func__);
-	else
-		f = tmp_opp->rate;
-
-	return f;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_freq);
-
-/**
- * dev_pm_opp_get_opp_count() - Get number of opps available in the opp list
- * @dev:	device for which we do this operation
- *
- * Return: This function returns the number of available opps if there are any,
- * else returns 0 if none or the corresponding error value.
- *
- * Locking: This function takes rcu_read_lock().
- */
-int dev_pm_opp_get_opp_count(struct device *dev)
-{
-	struct device_opp *dev_opp;
-	struct dev_pm_opp *temp_opp;
-	int count = 0;
-
-	rcu_read_lock();
-
-	dev_opp = _find_device_opp(dev);
-	if (IS_ERR(dev_opp)) {
-		count = PTR_ERR(dev_opp);
-		dev_err(dev, "%s: device OPP not found (%d)\n",
-			__func__, count);
-		goto out_unlock;
-	}
-
-	list_for_each_entry_rcu(temp_opp, &dev_opp->opp_list, node) {
-		if (temp_opp->available)
-			count++;
-	}
-
-out_unlock:
-	rcu_read_unlock();
-	return count;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_opp_count);
-
-/**
- * dev_pm_opp_find_freq_exact() - search for an exact frequency
- * @dev:		device for which we do this operation
- * @freq:		frequency to search for
- * @available:		true/false - match for available opp
- *
- * Return: Searches for exact match in the opp list and returns pointer to the
- * matching opp if found, else returns ERR_PTR in case of error and should
- * be handled using IS_ERR. Error return values can be:
- * EINVAL:	for bad pointer
- * ERANGE:	no match found for search
- * ENODEV:	if device not found in list of registered devices
- *
- * Note: available is a modifier for the search. if available=true, then the
- * match is for exact matching frequency and is available in the stored OPP
- * table. if false, the match is for exact frequency which is not available.
- *
- * This provides a mechanism to enable an opp which is not available currently
- * or the opposite as well.
- *
- * Locking: This function must be called under rcu_read_lock(). opp is a rcu
- * protected pointer. The reason for the same is that the opp pointer which is
- * returned will remain valid for use with opp_get_{voltage, freq} only while
- * under the locked area. The pointer returned must be used prior to unlocking
- * with rcu_read_unlock() to maintain the integrity of the pointer.
- */
-struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
-					      unsigned long freq,
-					      bool available)
-{
-	struct device_opp *dev_opp;
-	struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
-
-	opp_rcu_lockdep_assert();
-
-	dev_opp = _find_device_opp(dev);
-	if (IS_ERR(dev_opp)) {
-		int r = PTR_ERR(dev_opp);
-		dev_err(dev, "%s: device OPP not found (%d)\n", __func__, r);
-		return ERR_PTR(r);
-	}
-
-	list_for_each_entry_rcu(temp_opp, &dev_opp->opp_list, node) {
-		if (temp_opp->available == available &&
-				temp_opp->rate == freq) {
-			opp = temp_opp;
-			break;
-		}
-	}
-
-	return opp;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_exact);
-
-/**
- * dev_pm_opp_find_freq_ceil() - Search for an rounded ceil freq
- * @dev:	device for which we do this operation
- * @freq:	Start frequency
- *
- * Search for the matching ceil *available* OPP from a starting freq
- * for a device.
- *
- * Return: matching *opp and refreshes *freq accordingly, else returns
- * ERR_PTR in case of error and should be handled using IS_ERR. Error return
- * values can be:
- * EINVAL:	for bad pointer
- * ERANGE:	no match found for search
- * ENODEV:	if device not found in list of registered devices
- *
- * Locking: This function must be called under rcu_read_lock(). opp is a rcu
- * protected pointer. The reason for the same is that the opp pointer which is
- * returned will remain valid for use with opp_get_{voltage, freq} only while
- * under the locked area. The pointer returned must be used prior to unlocking
- * with rcu_read_unlock() to maintain the integrity of the pointer.
- */
-struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev,
-					     unsigned long *freq)
-{
-	struct device_opp *dev_opp;
-	struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
-
-	opp_rcu_lockdep_assert();
-
-	if (!dev || !freq) {
-		dev_err(dev, "%s: Invalid argument freq=%p\n", __func__, freq);
-		return ERR_PTR(-EINVAL);
-	}
-
-	dev_opp = _find_device_opp(dev);
-	if (IS_ERR(dev_opp))
-		return ERR_CAST(dev_opp);
-
-	list_for_each_entry_rcu(temp_opp, &dev_opp->opp_list, node) {
-		if (temp_opp->available && temp_opp->rate >= *freq) {
-			opp = temp_opp;
-			*freq = opp->rate;
-			break;
-		}
-	}
-
-	return opp;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_ceil);
-
-/**
- * dev_pm_opp_find_freq_floor() - Search for a rounded floor freq
- * @dev:	device for which we do this operation
- * @freq:	Start frequency
- *
- * Search for the matching floor *available* OPP from a starting freq
- * for a device.
- *
- * Return: matching *opp and refreshes *freq accordingly, else returns
- * ERR_PTR in case of error and should be handled using IS_ERR. Error return
- * values can be:
- * EINVAL:	for bad pointer
- * ERANGE:	no match found for search
- * ENODEV:	if device not found in list of registered devices
- *
- * Locking: This function must be called under rcu_read_lock(). opp is a rcu
- * protected pointer. The reason for the same is that the opp pointer which is
- * returned will remain valid for use with opp_get_{voltage, freq} only while
- * under the locked area. The pointer returned must be used prior to unlocking
- * with rcu_read_unlock() to maintain the integrity of the pointer.
- */
-struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev,
-					      unsigned long *freq)
-{
-	struct device_opp *dev_opp;
-	struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
-
-	opp_rcu_lockdep_assert();
-
-	if (!dev || !freq) {
-		dev_err(dev, "%s: Invalid argument freq=%p\n", __func__, freq);
-		return ERR_PTR(-EINVAL);
-	}
-
-	dev_opp = _find_device_opp(dev);
-	if (IS_ERR(dev_opp))
-		return ERR_CAST(dev_opp);
-
-	list_for_each_entry_rcu(temp_opp, &dev_opp->opp_list, node) {
-		if (temp_opp->available) {
-			/* go to the next node, before choosing prev */
-			if (temp_opp->rate > *freq)
-				break;
-			else
-				opp = temp_opp;
-		}
-	}
-	if (!IS_ERR(opp))
-		*freq = opp->rate;
-
-	return opp;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_floor);
-
-/**
- * _add_device_opp() - Allocate a new device OPP table
- * @dev:	device for which we do this operation
- *
- * New device node which uses OPPs - used when multiple devices with OPP tables
- * are maintained.
- *
- * Return: valid device_opp pointer if success, else NULL.
- */
-static struct device_opp *_add_device_opp(struct device *dev)
-{
-	struct device_opp *dev_opp;
-
-	/*
-	 * Allocate a new device OPP table. In the infrequent case where a new
-	 * device is needed to be added, we pay this penalty.
-	 */
-	dev_opp = kzalloc(sizeof(*dev_opp), GFP_KERNEL);
-	if (!dev_opp)
-		return NULL;
-
-	dev_opp->dev = dev;
-	srcu_init_notifier_head(&dev_opp->srcu_head);
-	INIT_LIST_HEAD(&dev_opp->opp_list);
-
-	/* Secure the device list modification */
-	list_add_rcu(&dev_opp->node, &dev_opp_list);
-	return dev_opp;
-}
-
-/**
- * _opp_add_dynamic() - Allocate a dynamic OPP.
- * @dev:	device for which we do this operation
- * @freq:	Frequency in Hz for this OPP
- * @u_volt:	Voltage in uVolts for this OPP
- * @dynamic:	Dynamically added OPPs.
- *
- * This function adds an opp definition to the opp list and returns status.
- * The opp is made available by default and it can be controlled using
- * dev_pm_opp_enable/disable functions and may be removed by dev_pm_opp_remove.
- *
- * NOTE: "dynamic" parameter impacts OPPs added by the of_init_opp_table and
- * freed by of_free_opp_table.
- *
- * Locking: The internal device_opp and opp structures are RCU protected.
- * Hence this function internally uses RCU updater strategy with mutex locks
- * to keep the integrity of the internal data structures. Callers should ensure
- * that this function is *NOT* called under RCU protection or in contexts where
- * mutex cannot be locked.
- *
- * Return:
- * 0		On success OR
- *		Duplicate OPPs (both freq and volt are same) and opp->available
- * -EEXIST	Freq are same and volt are different OR
- *		Duplicate OPPs (both freq and volt are same) and !opp->available
- * -ENOMEM	Memory allocation failure
- */
-static int _opp_add_dynamic(struct device *dev, unsigned long freq,
-			    long u_volt, bool dynamic)
-{
-	struct device_opp *dev_opp = NULL;
-	struct dev_pm_opp *opp, *new_opp;
-	struct list_head *head;
-	int ret;
-
-	/* allocate new OPP node */
-	new_opp = kzalloc(sizeof(*new_opp), GFP_KERNEL);
-	if (!new_opp)
-		return -ENOMEM;
-
-	/* Hold our list modification lock here */
-	mutex_lock(&dev_opp_list_lock);
-
-	/* populate the opp table */
-	new_opp->rate = freq;
-	new_opp->u_volt = u_volt;
-	new_opp->available = true;
-	new_opp->dynamic = dynamic;
-
-	/* Check for existing list for 'dev' */
-	dev_opp = _find_device_opp(dev);
-	if (IS_ERR(dev_opp)) {
-		dev_opp = _add_device_opp(dev);
-		if (!dev_opp) {
-			ret = -ENOMEM;
-			goto free_opp;
-		}
-
-		head = &dev_opp->opp_list;
-		goto list_add;
-	}
-
-	/*
-	 * Insert new OPP in order of increasing frequency
-	 * and discard if already present
-	 */
-	head = &dev_opp->opp_list;
-	list_for_each_entry_rcu(opp, &dev_opp->opp_list, node) {
-		if (new_opp->rate <= opp->rate)
-			break;
-		else
-			head = &opp->node;
-	}
-
-	/* Duplicate OPPs ? */
-	if (new_opp->rate == opp->rate) {
-		ret = opp->available && new_opp->u_volt == opp->u_volt ?
-			0 : -EEXIST;
-
-		dev_warn(dev, "%s: duplicate OPPs detected. Existing: freq: %lu, volt: %lu, enabled: %d. New: freq: %lu, volt: %lu, enabled: %d\n",
-			 __func__, opp->rate, opp->u_volt, opp->available,
-			 new_opp->rate, new_opp->u_volt, new_opp->available);
-		goto free_opp;
-	}
-
-list_add:
-	new_opp->dev_opp = dev_opp;
-	list_add_rcu(&new_opp->node, head);
-	mutex_unlock(&dev_opp_list_lock);
-
-	/*
-	 * Notify the changes in the availability of the operable
-	 * frequency/voltage list.
-	 */
-	srcu_notifier_call_chain(&dev_opp->srcu_head, OPP_EVENT_ADD, new_opp);
-	return 0;
-
-free_opp:
-	mutex_unlock(&dev_opp_list_lock);
-	kfree(new_opp);
-	return ret;
-}
-
-/**
- * dev_pm_opp_add()  - Add an OPP table from a table definitions
- * @dev:	device for which we do this operation
- * @freq:	Frequency in Hz for this OPP
- * @u_volt:	Voltage in uVolts for this OPP
- *
- * This function adds an opp definition to the opp list and returns status.
- * The opp is made available by default and it can be controlled using
- * dev_pm_opp_enable/disable functions.
- *
- * Locking: The internal device_opp and opp structures are RCU protected.
- * Hence this function internally uses RCU updater strategy with mutex locks
- * to keep the integrity of the internal data structures. Callers should ensure
- * that this function is *NOT* called under RCU protection or in contexts where
- * mutex cannot be locked.
- *
- * Return:
- * 0		On success OR
- *		Duplicate OPPs (both freq and volt are same) and opp->available
- * -EEXIST	Freq are same and volt are different OR
- *		Duplicate OPPs (both freq and volt are same) and !opp->available
- * -ENOMEM	Memory allocation failure
- */
-int dev_pm_opp_add(struct device *dev, unsigned long freq, unsigned long u_volt)
-{
-	return _opp_add_dynamic(dev, freq, u_volt, true);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_add);
-
-/**
- * _kfree_opp_rcu() - Free OPP RCU handler
- * @head:	RCU head
- */
-static void _kfree_opp_rcu(struct rcu_head *head)
-{
-	struct dev_pm_opp *opp = container_of(head, struct dev_pm_opp, rcu_head);
-
-	kfree_rcu(opp, rcu_head);
-}
-
-/**
- * _kfree_device_rcu() - Free device_opp RCU handler
- * @head:	RCU head
- */
-static void _kfree_device_rcu(struct rcu_head *head)
-{
-	struct device_opp *device_opp = container_of(head, struct device_opp, rcu_head);
-
-	kfree_rcu(device_opp, rcu_head);
-}
-
-/**
- * _opp_remove()  - Remove an OPP from a table definition
- * @dev_opp:	points back to the device_opp struct this opp belongs to
- * @opp:	pointer to the OPP to remove
- *
- * This function removes an opp definition from the opp list.
- *
- * Locking: The internal device_opp and opp structures are RCU protected.
- * It is assumed that the caller holds required mutex for an RCU updater
- * strategy.
- */
-static void _opp_remove(struct device_opp *dev_opp,
-			struct dev_pm_opp *opp)
-{
-	/*
-	 * Notify the changes in the availability of the operable
-	 * frequency/voltage list.
-	 */
-	srcu_notifier_call_chain(&dev_opp->srcu_head, OPP_EVENT_REMOVE, opp);
-	list_del_rcu(&opp->node);
-	call_srcu(&dev_opp->srcu_head.srcu, &opp->rcu_head, _kfree_opp_rcu);
-
-	if (list_empty(&dev_opp->opp_list)) {
-		list_del_rcu(&dev_opp->node);
-		call_srcu(&dev_opp->srcu_head.srcu, &dev_opp->rcu_head,
-			  _kfree_device_rcu);
-	}
-}
-
-/**
- * dev_pm_opp_remove()  - Remove an OPP from OPP list
- * @dev:	device for which we do this operation
- * @freq:	OPP to remove with matching 'freq'
- *
- * This function removes an opp from the opp list.
- *
- * Locking: The internal device_opp and opp structures are RCU protected.
- * Hence this function internally uses RCU updater strategy with mutex locks
- * to keep the integrity of the internal data structures. Callers should ensure
- * that this function is *NOT* called under RCU protection or in contexts where
- * mutex cannot be locked.
- */
-void dev_pm_opp_remove(struct device *dev, unsigned long freq)
-{
-	struct dev_pm_opp *opp;
-	struct device_opp *dev_opp;
-	bool found = false;
-
-	/* Hold our list modification lock here */
-	mutex_lock(&dev_opp_list_lock);
-
-	dev_opp = _find_device_opp(dev);
-	if (IS_ERR(dev_opp))
-		goto unlock;
-
-	list_for_each_entry(opp, &dev_opp->opp_list, node) {
-		if (opp->rate == freq) {
-			found = true;
-			break;
-		}
-	}
-
-	if (!found) {
-		dev_warn(dev, "%s: Couldn't find OPP with freq: %lu\n",
-			 __func__, freq);
-		goto unlock;
-	}
-
-	_opp_remove(dev_opp, opp);
-unlock:
-	mutex_unlock(&dev_opp_list_lock);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_remove);
-
-/**
- * _opp_set_availability() - helper to set the availability of an opp
- * @dev:		device for which we do this operation
- * @freq:		OPP frequency to modify availability
- * @availability_req:	availability status requested for this opp
- *
- * Set the availability of an OPP with an RCU operation, opp_{enable,disable}
- * share a common logic which is isolated here.
- *
- * Return: -EINVAL for bad pointers, -ENOMEM if no memory available for the
- * copy operation, returns 0 if no modifcation was done OR modification was
- * successful.
- *
- * Locking: The internal device_opp and opp structures are RCU protected.
- * Hence this function internally uses RCU updater strategy with mutex locks to
- * keep the integrity of the internal data structures. Callers should ensure
- * that this function is *NOT* called under RCU protection or in contexts where
- * mutex locking or synchronize_rcu() blocking calls cannot be used.
- */
-static int _opp_set_availability(struct device *dev, unsigned long freq,
-				 bool availability_req)
-{
-	struct device_opp *dev_opp;
-	struct dev_pm_opp *new_opp, *tmp_opp, *opp = ERR_PTR(-ENODEV);
-	int r = 0;
-
-	/* keep the node allocated */
-	new_opp = kmalloc(sizeof(*new_opp), GFP_KERNEL);
-	if (!new_opp)
-		return -ENOMEM;
-
-	mutex_lock(&dev_opp_list_lock);
-
-	/* Find the device_opp */
-	dev_opp = _find_device_opp(dev);
-	if (IS_ERR(dev_opp)) {
-		r = PTR_ERR(dev_opp);
-		dev_warn(dev, "%s: Device OPP not found (%d)\n", __func__, r);
-		goto unlock;
-	}
-
-	/* Do we have the frequency? */
-	list_for_each_entry(tmp_opp, &dev_opp->opp_list, node) {
-		if (tmp_opp->rate == freq) {
-			opp = tmp_opp;
-			break;
-		}
-	}
-	if (IS_ERR(opp)) {
-		r = PTR_ERR(opp);
-		goto unlock;
-	}
-
-	/* Is update really needed? */
-	if (opp->available == availability_req)
-		goto unlock;
-	/* copy the old data over */
-	*new_opp = *opp;
-
-	/* plug in new node */
-	new_opp->available = availability_req;
-
-	list_replace_rcu(&opp->node, &new_opp->node);
-	mutex_unlock(&dev_opp_list_lock);
-	call_srcu(&dev_opp->srcu_head.srcu, &opp->rcu_head, _kfree_opp_rcu);
-
-	/* Notify the change of the OPP availability */
-	if (availability_req)
-		srcu_notifier_call_chain(&dev_opp->srcu_head, OPP_EVENT_ENABLE,
-					 new_opp);
-	else
-		srcu_notifier_call_chain(&dev_opp->srcu_head, OPP_EVENT_DISABLE,
-					 new_opp);
-
-	return 0;
-
-unlock:
-	mutex_unlock(&dev_opp_list_lock);
-	kfree(new_opp);
-	return r;
-}
-
-/**
- * dev_pm_opp_enable() - Enable a specific OPP
- * @dev:	device for which we do this operation
- * @freq:	OPP frequency to enable
- *
- * Enables a provided opp. If the operation is valid, this returns 0, else the
- * corresponding error value. It is meant to be used for users an OPP available
- * after being temporarily made unavailable with dev_pm_opp_disable.
- *
- * Locking: The internal device_opp and opp structures are RCU protected.
- * Hence this function indirectly uses RCU and mutex locks to keep the
- * integrity of the internal data structures. Callers should ensure that
- * this function is *NOT* called under RCU protection or in contexts where
- * mutex locking or synchronize_rcu() blocking calls cannot be used.
- *
- * Return: -EINVAL for bad pointers, -ENOMEM if no memory available for the
- * copy operation, returns 0 if no modifcation was done OR modification was
- * successful.
- */
-int dev_pm_opp_enable(struct device *dev, unsigned long freq)
-{
-	return _opp_set_availability(dev, freq, true);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_enable);
-
-/**
- * dev_pm_opp_disable() - Disable a specific OPP
- * @dev:	device for which we do this operation
- * @freq:	OPP frequency to disable
- *
- * Disables a provided opp. If the operation is valid, this returns
- * 0, else the corresponding error value. It is meant to be a temporary
- * control by users to make this OPP not available until the circumstances are
- * right to make it available again (with a call to dev_pm_opp_enable).
- *
- * Locking: The internal device_opp and opp structures are RCU protected.
- * Hence this function indirectly uses RCU and mutex locks to keep the
- * integrity of the internal data structures. Callers should ensure that
- * this function is *NOT* called under RCU protection or in contexts where
- * mutex locking or synchronize_rcu() blocking calls cannot be used.
- *
- * Return: -EINVAL for bad pointers, -ENOMEM if no memory available for the
- * copy operation, returns 0 if no modifcation was done OR modification was
- * successful.
- */
-int dev_pm_opp_disable(struct device *dev, unsigned long freq)
-{
-	return _opp_set_availability(dev, freq, false);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_disable);
-
-/**
- * dev_pm_opp_get_notifier() - find notifier_head of the device with opp
- * @dev:	device pointer used to lookup device OPPs.
- *
- * Return: pointer to  notifier head if found, otherwise -ENODEV or
- * -EINVAL based on type of error casted as pointer. value must be checked
- *  with IS_ERR to determine valid pointer or error result.
- *
- * Locking: This function must be called under rcu_read_lock(). dev_opp is a RCU
- * protected pointer. The reason for the same is that the opp pointer which is
- * returned will remain valid for use with opp_get_{voltage, freq} only while
- * under the locked area. The pointer returned must be used prior to unlocking
- * with rcu_read_unlock() to maintain the integrity of the pointer.
- */
-struct srcu_notifier_head *dev_pm_opp_get_notifier(struct device *dev)
-{
-	struct device_opp *dev_opp = _find_device_opp(dev);
-
-	if (IS_ERR(dev_opp))
-		return ERR_CAST(dev_opp); /* matching type */
-
-	return &dev_opp->srcu_head;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_notifier);
-
-#ifdef CONFIG_OF
-/**
- * of_init_opp_table() - Initialize opp table from device tree
- * @dev:	device pointer used to lookup device OPPs.
- *
- * Register the initial OPP table with the OPP library for given device.
- *
- * Locking: The internal device_opp and opp structures are RCU protected.
- * Hence this function indirectly uses RCU updater strategy with mutex locks
- * to keep the integrity of the internal data structures. Callers should ensure
- * that this function is *NOT* called under RCU protection or in contexts where
- * mutex cannot be locked.
- *
- * Return:
- * 0		On success OR
- *		Duplicate OPPs (both freq and volt are same) and opp->available
- * -EEXIST	Freq are same and volt are different OR
- *		Duplicate OPPs (both freq and volt are same) and !opp->available
- * -ENOMEM	Memory allocation failure
- * -ENODEV	when 'operating-points' property is not found or is invalid data
- *		in device node.
- * -ENODATA	when empty 'operating-points' property is found
- */
-int of_init_opp_table(struct device *dev)
-{
-	const struct property *prop;
-	const __be32 *val;
-	int nr;
-
-	prop = of_find_property(dev->of_node, "operating-points", NULL);
-	if (!prop)
-		return -ENODEV;
-	if (!prop->value)
-		return -ENODATA;
-
-	/*
-	 * Each OPP is a set of tuples consisting of frequency and
-	 * voltage like <freq-kHz vol-uV>.
-	 */
-	nr = prop->length / sizeof(u32);
-	if (nr % 2) {
-		dev_err(dev, "%s: Invalid OPP list\n", __func__);
-		return -EINVAL;
-	}
-
-	val = prop->value;
-	while (nr) {
-		unsigned long freq = be32_to_cpup(val++) * 1000;
-		unsigned long volt = be32_to_cpup(val++);
-
-		if (_opp_add_dynamic(dev, freq, volt, false))
-			dev_warn(dev, "%s: Failed to add OPP %ld\n",
-				 __func__, freq);
-		nr -= 2;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(of_init_opp_table);
-
-/**
- * of_free_opp_table() - Free OPP table entries created from static DT entries
- * @dev:	device pointer used to lookup device OPPs.
- *
- * Free OPPs created using static entries present in DT.
- *
- * Locking: The internal device_opp and opp structures are RCU protected.
- * Hence this function indirectly uses RCU updater strategy with mutex locks
- * to keep the integrity of the internal data structures. Callers should ensure
- * that this function is *NOT* called under RCU protection or in contexts where
- * mutex cannot be locked.
- */
-void of_free_opp_table(struct device *dev)
-{
-	struct device_opp *dev_opp;
-	struct dev_pm_opp *opp, *tmp;
-
-	/* Check for existing list for 'dev' */
-	dev_opp = _find_device_opp(dev);
-	if (IS_ERR(dev_opp)) {
-		int error = PTR_ERR(dev_opp);
-		if (error != -ENODEV)
-			WARN(1, "%s: dev_opp: %d\n",
-			     IS_ERR_OR_NULL(dev) ?
-					"Invalid device" : dev_name(dev),
-			     error);
-		return;
-	}
-
-	/* Hold our list modification lock here */
-	mutex_lock(&dev_opp_list_lock);
-
-	/* Free static OPPs */
-	list_for_each_entry_safe(opp, tmp, &dev_opp->opp_list, node) {
-		if (!opp->dynamic)
-			_opp_remove(dev_opp, opp);
-	}
-
-	mutex_unlock(&dev_opp_list_lock);
-}
-EXPORT_SYMBOL_GPL(of_free_opp_table);
-#endif
diff --git a/drivers/base/power/opp/Makefile b/drivers/base/power/opp/Makefile
new file mode 100644
index 000000000000..19837ef04d8e
--- /dev/null
+++ b/drivers/base/power/opp/Makefile
@@ -0,0 +1,3 @@
+ccflags-$(CONFIG_DEBUG_DRIVER)	:= -DDEBUG
+obj-y				+= core.o cpu.o
+obj-$(CONFIG_DEBUG_FS)		+= debugfs.o
diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c
new file mode 100644
index 000000000000..333161fd623c
--- /dev/null
+++ b/drivers/base/power/opp/core.c
@@ -0,0 +1,2053 @@
+/*
+ * Generic OPP Interface
+ *
+ * Copyright (C) 2009-2010 Texas Instruments Incorporated.
+ *	Nishanth Menon
+ *	Romit Dasgupta
+ *	Kevin Hilman
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/clk.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+#include <linux/of.h>
+#include <linux/export.h>
+#include <linux/regulator/consumer.h>
+
+#include "opp.h"
+
+/*
+ * The root of the list of all opp-tables. All opp_table structures branch off
+ * from here, with each opp_table containing the list of opps it supports in
+ * various states of availability.
+ */
+static LIST_HEAD(opp_tables);
+/* Lock to allow exclusive modification to the device and opp lists */
+DEFINE_MUTEX(opp_table_lock);
+
+#define opp_rcu_lockdep_assert()					\
+do {									\
+	rcu_lockdep_assert(rcu_read_lock_held() ||			\
+			   lockdep_is_held(&opp_table_lock),		\
+			   "Missing rcu_read_lock() or "		\
+			   "opp_table_lock protection");		\
+} while (0)
+
+static struct opp_device *_find_opp_dev(const struct device *dev,
+					struct opp_table *opp_table)
+{
+	struct opp_device *opp_dev;
+
+	list_for_each_entry(opp_dev, &opp_table->dev_list, node)
+		if (opp_dev->dev == dev)
+			return opp_dev;
+
+	return NULL;
+}
+
+static struct opp_table *_managed_opp(const struct device_node *np)
+{
+	struct opp_table *opp_table;
+
+	list_for_each_entry_rcu(opp_table, &opp_tables, node) {
+		if (opp_table->np == np) {
+			/*
+			 * Multiple devices can point to the same OPP table and
+			 * so will have same node-pointer, np.
+			 *
+			 * But the OPPs will be considered as shared only if the
+			 * OPP table contains a "opp-shared" property.
+			 */
+			return opp_table->shared_opp ? opp_table : NULL;
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * _find_opp_table() - find opp_table struct using device pointer
+ * @dev:	device pointer used to lookup OPP table
+ *
+ * Search OPP table for one containing matching device. Does a RCU reader
+ * operation to grab the pointer needed.
+ *
+ * Return: pointer to 'struct opp_table' if found, otherwise -ENODEV or
+ * -EINVAL based on type of error.
+ *
+ * Locking: For readers, this function must be called under rcu_read_lock().
+ * opp_table is a RCU protected pointer, which means that opp_table is valid
+ * as long as we are under RCU lock.
+ *
+ * For Writers, this function must be called with opp_table_lock held.
+ */
+struct opp_table *_find_opp_table(struct device *dev)
+{
+	struct opp_table *opp_table;
+
+	opp_rcu_lockdep_assert();
+
+	if (IS_ERR_OR_NULL(dev)) {
+		pr_err("%s: Invalid parameters\n", __func__);
+		return ERR_PTR(-EINVAL);
+	}
+
+	list_for_each_entry_rcu(opp_table, &opp_tables, node)
+		if (_find_opp_dev(dev, opp_table))
+			return opp_table;
+
+	return ERR_PTR(-ENODEV);
+}
+
+/**
+ * dev_pm_opp_get_voltage() - Gets the voltage corresponding to an opp
+ * @opp:	opp for which voltage has to be returned for
+ *
+ * Return: voltage in micro volt corresponding to the opp, else
+ * return 0
+ *
+ * Locking: This function must be called under rcu_read_lock(). opp is a rcu
+ * protected pointer. This means that opp which could have been fetched by
+ * opp_find_freq_{exact,ceil,floor} functions is valid as long as we are
+ * under RCU lock. The pointer returned by the opp_find_freq family must be
+ * used in the same section as the usage of this function with the pointer
+ * prior to unlocking with rcu_read_unlock() to maintain the integrity of the
+ * pointer.
+ */
+unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp)
+{
+	struct dev_pm_opp *tmp_opp;
+	unsigned long v = 0;
+
+	opp_rcu_lockdep_assert();
+
+	tmp_opp = rcu_dereference(opp);
+	if (IS_ERR_OR_NULL(tmp_opp))
+		pr_err("%s: Invalid parameters\n", __func__);
+	else
+		v = tmp_opp->u_volt;
+
+	return v;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_voltage);
+
+/**
+ * dev_pm_opp_get_freq() - Gets the frequency corresponding to an available opp
+ * @opp:	opp for which frequency has to be returned for
+ *
+ * Return: frequency in hertz corresponding to the opp, else
+ * return 0
+ *
+ * Locking: This function must be called under rcu_read_lock(). opp is a rcu
+ * protected pointer. This means that opp which could have been fetched by
+ * opp_find_freq_{exact,ceil,floor} functions is valid as long as we are
+ * under RCU lock. The pointer returned by the opp_find_freq family must be
+ * used in the same section as the usage of this function with the pointer
+ * prior to unlocking with rcu_read_unlock() to maintain the integrity of the
+ * pointer.
+ */
+unsigned long dev_pm_opp_get_freq(struct dev_pm_opp *opp)
+{
+	struct dev_pm_opp *tmp_opp;
+	unsigned long f = 0;
+
+	opp_rcu_lockdep_assert();
+
+	tmp_opp = rcu_dereference(opp);
+	if (IS_ERR_OR_NULL(tmp_opp) || !tmp_opp->available)
+		pr_err("%s: Invalid parameters\n", __func__);
+	else
+		f = tmp_opp->rate;
+
+	return f;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_freq);
+
+/**
+ * dev_pm_opp_is_turbo() - Returns if opp is turbo OPP or not
+ * @opp: opp for which turbo mode is being verified
+ *
+ * Turbo OPPs are not for normal use, and can be enabled (under certain
+ * conditions) for short duration of times to finish high throughput work
+ * quickly. Running on them for longer times may overheat the chip.
+ *
+ * Return: true if opp is turbo opp, else false.
+ *
+ * Locking: This function must be called under rcu_read_lock(). opp is a rcu
+ * protected pointer. This means that opp which could have been fetched by
+ * opp_find_freq_{exact,ceil,floor} functions is valid as long as we are
+ * under RCU lock. The pointer returned by the opp_find_freq family must be
+ * used in the same section as the usage of this function with the pointer
+ * prior to unlocking with rcu_read_unlock() to maintain the integrity of the
+ * pointer.
+ */
+bool dev_pm_opp_is_turbo(struct dev_pm_opp *opp)
+{
+	struct dev_pm_opp *tmp_opp;
+
+	opp_rcu_lockdep_assert();
+
+	tmp_opp = rcu_dereference(opp);
+	if (IS_ERR_OR_NULL(tmp_opp) || !tmp_opp->available) {
+		pr_err("%s: Invalid parameters\n", __func__);
+		return false;
+	}
+
+	return tmp_opp->turbo;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_is_turbo);
+
+/**
+ * dev_pm_opp_get_max_clock_latency() - Get max clock latency in nanoseconds
+ * @dev:	device for which we do this operation
+ *
+ * Return: This function returns the max clock latency in nanoseconds.
+ *
+ * Locking: This function takes rcu_read_lock().
+ */
+unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev)
+{
+	struct opp_table *opp_table;
+	unsigned long clock_latency_ns;
+
+	rcu_read_lock();
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table))
+		clock_latency_ns = 0;
+	else
+		clock_latency_ns = opp_table->clock_latency_ns_max;
+
+	rcu_read_unlock();
+	return clock_latency_ns;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_clock_latency);
+
+/**
+ * dev_pm_opp_get_max_volt_latency() - Get max voltage latency in nanoseconds
+ * @dev: device for which we do this operation
+ *
+ * Return: This function returns the max voltage latency in nanoseconds.
+ *
+ * Locking: This function takes rcu_read_lock().
+ */
+unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev)
+{
+	struct opp_table *opp_table;
+	struct dev_pm_opp *opp;
+	struct regulator *reg;
+	unsigned long latency_ns = 0;
+	unsigned long min_uV = ~0, max_uV = 0;
+	int ret;
+
+	rcu_read_lock();
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table)) {
+		rcu_read_unlock();
+		return 0;
+	}
+
+	reg = opp_table->regulator;
+	if (IS_ERR(reg)) {
+		/* Regulator may not be required for device */
+		if (reg)
+			dev_err(dev, "%s: Invalid regulator (%ld)\n", __func__,
+				PTR_ERR(reg));
+		rcu_read_unlock();
+		return 0;
+	}
+
+	list_for_each_entry_rcu(opp, &opp_table->opp_list, node) {
+		if (!opp->available)
+			continue;
+
+		if (opp->u_volt_min < min_uV)
+			min_uV = opp->u_volt_min;
+		if (opp->u_volt_max > max_uV)
+			max_uV = opp->u_volt_max;
+	}
+
+	rcu_read_unlock();
+
+	/*
+	 * The caller needs to ensure that opp_table (and hence the regulator)
+	 * isn't freed, while we are executing this routine.
+	 */
+	ret = regulator_set_voltage_time(reg, min_uV, max_uV);
+	if (ret > 0)
+		latency_ns = ret * 1000;
+
+	return latency_ns;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_volt_latency);
+
+/**
+ * dev_pm_opp_get_max_transition_latency() - Get max transition latency in
+ *					     nanoseconds
+ * @dev: device for which we do this operation
+ *
+ * Return: This function returns the max transition latency, in nanoseconds, to
+ * switch from one OPP to other.
+ *
+ * Locking: This function takes rcu_read_lock().
+ */
+unsigned long dev_pm_opp_get_max_transition_latency(struct device *dev)
+{
+	return dev_pm_opp_get_max_volt_latency(dev) +
+		dev_pm_opp_get_max_clock_latency(dev);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_transition_latency);
+
+/**
+ * dev_pm_opp_get_suspend_opp() - Get suspend opp
+ * @dev:	device for which we do this operation
+ *
+ * Return: This function returns pointer to the suspend opp if it is
+ * defined and available, otherwise it returns NULL.
+ *
+ * Locking: This function must be called under rcu_read_lock(). opp is a rcu
+ * protected pointer. The reason for the same is that the opp pointer which is
+ * returned will remain valid for use with opp_get_{voltage, freq} only while
+ * under the locked area. The pointer returned must be used prior to unlocking
+ * with rcu_read_unlock() to maintain the integrity of the pointer.
+ */
+struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev)
+{
+	struct opp_table *opp_table;
+
+	opp_rcu_lockdep_assert();
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table) || !opp_table->suspend_opp ||
+	    !opp_table->suspend_opp->available)
+		return NULL;
+
+	return opp_table->suspend_opp;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_suspend_opp);
+
+/**
+ * dev_pm_opp_get_opp_count() - Get number of opps available in the opp table
+ * @dev:	device for which we do this operation
+ *
+ * Return: This function returns the number of available opps if there are any,
+ * else returns 0 if none or the corresponding error value.
+ *
+ * Locking: This function takes rcu_read_lock().
+ */
+int dev_pm_opp_get_opp_count(struct device *dev)
+{
+	struct opp_table *opp_table;
+	struct dev_pm_opp *temp_opp;
+	int count = 0;
+
+	rcu_read_lock();
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table)) {
+		count = PTR_ERR(opp_table);
+		dev_err(dev, "%s: OPP table not found (%d)\n",
+			__func__, count);
+		goto out_unlock;
+	}
+
+	list_for_each_entry_rcu(temp_opp, &opp_table->opp_list, node) {
+		if (temp_opp->available)
+			count++;
+	}
+
+out_unlock:
+	rcu_read_unlock();
+	return count;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_opp_count);
+
+/**
+ * dev_pm_opp_find_freq_exact() - search for an exact frequency
+ * @dev:		device for which we do this operation
+ * @freq:		frequency to search for
+ * @available:		true/false - match for available opp
+ *
+ * Return: Searches for exact match in the opp table and returns pointer to the
+ * matching opp if found, else returns ERR_PTR in case of error and should
+ * be handled using IS_ERR. Error return values can be:
+ * EINVAL:	for bad pointer
+ * ERANGE:	no match found for search
+ * ENODEV:	if device not found in list of registered devices
+ *
+ * Note: available is a modifier for the search. if available=true, then the
+ * match is for exact matching frequency and is available in the stored OPP
+ * table. if false, the match is for exact frequency which is not available.
+ *
+ * This provides a mechanism to enable an opp which is not available currently
+ * or the opposite as well.
+ *
+ * Locking: This function must be called under rcu_read_lock(). opp is a rcu
+ * protected pointer. The reason for the same is that the opp pointer which is
+ * returned will remain valid for use with opp_get_{voltage, freq} only while
+ * under the locked area. The pointer returned must be used prior to unlocking
+ * with rcu_read_unlock() to maintain the integrity of the pointer.
+ */
+struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
+					      unsigned long freq,
+					      bool available)
+{
+	struct opp_table *opp_table;
+	struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
+
+	opp_rcu_lockdep_assert();
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table)) {
+		int r = PTR_ERR(opp_table);
+
+		dev_err(dev, "%s: OPP table not found (%d)\n", __func__, r);
+		return ERR_PTR(r);
+	}
+
+	list_for_each_entry_rcu(temp_opp, &opp_table->opp_list, node) {
+		if (temp_opp->available == available &&
+				temp_opp->rate == freq) {
+			opp = temp_opp;
+			break;
+		}
+	}
+
+	return opp;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_exact);
+
+/**
+ * dev_pm_opp_find_freq_ceil() - Search for an rounded ceil freq
+ * @dev:	device for which we do this operation
+ * @freq:	Start frequency
+ *
+ * Search for the matching ceil *available* OPP from a starting freq
+ * for a device.
+ *
+ * Return: matching *opp and refreshes *freq accordingly, else returns
+ * ERR_PTR in case of error and should be handled using IS_ERR. Error return
+ * values can be:
+ * EINVAL:	for bad pointer
+ * ERANGE:	no match found for search
+ * ENODEV:	if device not found in list of registered devices
+ *
+ * Locking: This function must be called under rcu_read_lock(). opp is a rcu
+ * protected pointer. The reason for the same is that the opp pointer which is
+ * returned will remain valid for use with opp_get_{voltage, freq} only while
+ * under the locked area. The pointer returned must be used prior to unlocking
+ * with rcu_read_unlock() to maintain the integrity of the pointer.
+ */
+struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev,
+					     unsigned long *freq)
+{
+	struct opp_table *opp_table;
+	struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
+
+	opp_rcu_lockdep_assert();
+
+	if (!dev || !freq) {
+		dev_err(dev, "%s: Invalid argument freq=%p\n", __func__, freq);
+		return ERR_PTR(-EINVAL);
+	}
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table))
+		return ERR_CAST(opp_table);
+
+	list_for_each_entry_rcu(temp_opp, &opp_table->opp_list, node) {
+		if (temp_opp->available && temp_opp->rate >= *freq) {
+			opp = temp_opp;
+			*freq = opp->rate;
+			break;
+		}
+	}
+
+	return opp;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_ceil);
+
+/**
+ * dev_pm_opp_find_freq_floor() - Search for a rounded floor freq
+ * @dev:	device for which we do this operation
+ * @freq:	Start frequency
+ *
+ * Search for the matching floor *available* OPP from a starting freq
+ * for a device.
+ *
+ * Return: matching *opp and refreshes *freq accordingly, else returns
+ * ERR_PTR in case of error and should be handled using IS_ERR. Error return
+ * values can be:
+ * EINVAL:	for bad pointer
+ * ERANGE:	no match found for search
+ * ENODEV:	if device not found in list of registered devices
+ *
+ * Locking: This function must be called under rcu_read_lock(). opp is a rcu
+ * protected pointer. The reason for the same is that the opp pointer which is
+ * returned will remain valid for use with opp_get_{voltage, freq} only while
+ * under the locked area. The pointer returned must be used prior to unlocking
+ * with rcu_read_unlock() to maintain the integrity of the pointer.
+ */
+struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev,
+					      unsigned long *freq)
+{
+	struct opp_table *opp_table;
+	struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
+
+	opp_rcu_lockdep_assert();
+
+	if (!dev || !freq) {
+		dev_err(dev, "%s: Invalid argument freq=%p\n", __func__, freq);
+		return ERR_PTR(-EINVAL);
+	}
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table))
+		return ERR_CAST(opp_table);
+
+	list_for_each_entry_rcu(temp_opp, &opp_table->opp_list, node) {
+		if (temp_opp->available) {
+			/* go to the next node, before choosing prev */
+			if (temp_opp->rate > *freq)
+				break;
+			else
+				opp = temp_opp;
+		}
+	}
+	if (!IS_ERR(opp))
+		*freq = opp->rate;
+
+	return opp;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_floor);
+
+/*
+ * The caller needs to ensure that opp_table (and hence the clk) isn't freed,
+ * while clk returned here is used.
+ */
+static struct clk *_get_opp_clk(struct device *dev)
+{
+	struct opp_table *opp_table;
+	struct clk *clk;
+
+	rcu_read_lock();
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table)) {
+		dev_err(dev, "%s: device opp doesn't exist\n", __func__);
+		clk = ERR_CAST(opp_table);
+		goto unlock;
+	}
+
+	clk = opp_table->clk;
+	if (IS_ERR(clk))
+		dev_err(dev, "%s: No clock available for the device\n",
+			__func__);
+
+unlock:
+	rcu_read_unlock();
+	return clk;
+}
+
+static int _set_opp_voltage(struct device *dev, struct regulator *reg,
+			    unsigned long u_volt, unsigned long u_volt_min,
+			    unsigned long u_volt_max)
+{
+	int ret;
+
+	/* Regulator not available for device */
+	if (IS_ERR(reg)) {
+		dev_dbg(dev, "%s: regulator not available: %ld\n", __func__,
+			PTR_ERR(reg));
+		return 0;
+	}
+
+	dev_dbg(dev, "%s: voltages (mV): %lu %lu %lu\n", __func__, u_volt_min,
+		u_volt, u_volt_max);
+
+	ret = regulator_set_voltage_triplet(reg, u_volt_min, u_volt,
+					    u_volt_max);
+	if (ret)
+		dev_err(dev, "%s: failed to set voltage (%lu %lu %lu mV): %d\n",
+			__func__, u_volt_min, u_volt, u_volt_max, ret);
+
+	return ret;
+}
+
+/**
+ * dev_pm_opp_set_rate() - Configure new OPP based on frequency
+ * @dev:	 device for which we do this operation
+ * @target_freq: frequency to achieve
+ *
+ * This configures the power-supplies and clock source to the levels specified
+ * by the OPP corresponding to the target_freq.
+ *
+ * Locking: This function takes rcu_read_lock().
+ */
+int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
+{
+	struct opp_table *opp_table;
+	struct dev_pm_opp *old_opp, *opp;
+	struct regulator *reg;
+	struct clk *clk;
+	unsigned long freq, old_freq;
+	unsigned long u_volt, u_volt_min, u_volt_max;
+	unsigned long ou_volt, ou_volt_min, ou_volt_max;
+	int ret;
+
+	if (unlikely(!target_freq)) {
+		dev_err(dev, "%s: Invalid target frequency %lu\n", __func__,
+			target_freq);
+		return -EINVAL;
+	}
+
+	clk = _get_opp_clk(dev);
+	if (IS_ERR(clk))
+		return PTR_ERR(clk);
+
+	freq = clk_round_rate(clk, target_freq);
+	if ((long)freq <= 0)
+		freq = target_freq;
+
+	old_freq = clk_get_rate(clk);
+
+	/* Return early if nothing to do */
+	if (old_freq == freq) {
+		dev_dbg(dev, "%s: old/new frequencies (%lu Hz) are same, nothing to do\n",
+			__func__, freq);
+		return 0;
+	}
+
+	rcu_read_lock();
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table)) {
+		dev_err(dev, "%s: device opp doesn't exist\n", __func__);
+		rcu_read_unlock();
+		return PTR_ERR(opp_table);
+	}
+
+	old_opp = dev_pm_opp_find_freq_ceil(dev, &old_freq);
+	if (!IS_ERR(old_opp)) {
+		ou_volt = old_opp->u_volt;
+		ou_volt_min = old_opp->u_volt_min;
+		ou_volt_max = old_opp->u_volt_max;
+	} else {
+		dev_err(dev, "%s: failed to find current OPP for freq %lu (%ld)\n",
+			__func__, old_freq, PTR_ERR(old_opp));
+	}
+
+	opp = dev_pm_opp_find_freq_ceil(dev, &freq);
+	if (IS_ERR(opp)) {
+		ret = PTR_ERR(opp);
+		dev_err(dev, "%s: failed to find OPP for freq %lu (%d)\n",
+			__func__, freq, ret);
+		rcu_read_unlock();
+		return ret;
+	}
+
+	u_volt = opp->u_volt;
+	u_volt_min = opp->u_volt_min;
+	u_volt_max = opp->u_volt_max;
+
+	reg = opp_table->regulator;
+
+	rcu_read_unlock();
+
+	/* Scaling up? Scale voltage before frequency */
+	if (freq > old_freq) {
+		ret = _set_opp_voltage(dev, reg, u_volt, u_volt_min,
+				       u_volt_max);
+		if (ret)
+			goto restore_voltage;
+	}
+
+	/* Change frequency */
+
+	dev_dbg(dev, "%s: switching OPP: %lu Hz --> %lu Hz\n",
+		__func__, old_freq, freq);
+
+	ret = clk_set_rate(clk, freq);
+	if (ret) {
+		dev_err(dev, "%s: failed to set clock rate: %d\n", __func__,
+			ret);
+		goto restore_voltage;
+	}
+
+	/* Scaling down? Scale voltage after frequency */
+	if (freq < old_freq) {
+		ret = _set_opp_voltage(dev, reg, u_volt, u_volt_min,
+				       u_volt_max);
+		if (ret)
+			goto restore_freq;
+	}
+
+	return 0;
+
+restore_freq:
+	if (clk_set_rate(clk, old_freq))
+		dev_err(dev, "%s: failed to restore old-freq (%lu Hz)\n",
+			__func__, old_freq);
+restore_voltage:
+	/* This shouldn't harm even if the voltages weren't updated earlier */
+	if (!IS_ERR(old_opp))
+		_set_opp_voltage(dev, reg, ou_volt, ou_volt_min, ou_volt_max);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_rate);
+
+/* OPP-dev Helpers */
+static void _kfree_opp_dev_rcu(struct rcu_head *head)
+{
+	struct opp_device *opp_dev;
+
+	opp_dev = container_of(head, struct opp_device, rcu_head);
+	kfree_rcu(opp_dev, rcu_head);
+}
+
+static void _remove_opp_dev(struct opp_device *opp_dev,
+			    struct opp_table *opp_table)
+{
+	opp_debug_unregister(opp_dev, opp_table);
+	list_del(&opp_dev->node);
+	call_srcu(&opp_table->srcu_head.srcu, &opp_dev->rcu_head,
+		  _kfree_opp_dev_rcu);
+}
+
+struct opp_device *_add_opp_dev(const struct device *dev,
+				struct opp_table *opp_table)
+{
+	struct opp_device *opp_dev;
+	int ret;
+
+	opp_dev = kzalloc(sizeof(*opp_dev), GFP_KERNEL);
+	if (!opp_dev)
+		return NULL;
+
+	/* Initialize opp-dev */
+	opp_dev->dev = dev;
+	list_add_rcu(&opp_dev->node, &opp_table->dev_list);
+
+	/* Create debugfs entries for the opp_table */
+	ret = opp_debug_register(opp_dev, opp_table);
+	if (ret)
+		dev_err(dev, "%s: Failed to register opp debugfs (%d)\n",
+			__func__, ret);
+
+	return opp_dev;
+}
+
+/**
+ * _add_opp_table() - Find OPP table or allocate a new one
+ * @dev:	device for which we do this operation
+ *
+ * It tries to find an existing table first, if it couldn't find one, it
+ * allocates a new OPP table and returns that.
+ *
+ * Return: valid opp_table pointer if success, else NULL.
+ */
+static struct opp_table *_add_opp_table(struct device *dev)
+{
+	struct opp_table *opp_table;
+	struct opp_device *opp_dev;
+	struct device_node *np;
+	int ret;
+
+	/* Check for existing table for 'dev' first */
+	opp_table = _find_opp_table(dev);
+	if (!IS_ERR(opp_table))
+		return opp_table;
+
+	/*
+	 * Allocate a new OPP table. In the infrequent case where a new
+	 * device is needed to be added, we pay this penalty.
+	 */
+	opp_table = kzalloc(sizeof(*opp_table), GFP_KERNEL);
+	if (!opp_table)
+		return NULL;
+
+	INIT_LIST_HEAD(&opp_table->dev_list);
+
+	opp_dev = _add_opp_dev(dev, opp_table);
+	if (!opp_dev) {
+		kfree(opp_table);
+		return NULL;
+	}
+
+	/*
+	 * Only required for backward compatibility with v1 bindings, but isn't
+	 * harmful for other cases. And so we do it unconditionally.
+	 */
+	np = of_node_get(dev->of_node);
+	if (np) {
+		u32 val;
+
+		if (!of_property_read_u32(np, "clock-latency", &val))
+			opp_table->clock_latency_ns_max = val;
+		of_property_read_u32(np, "voltage-tolerance",
+				     &opp_table->voltage_tolerance_v1);
+		of_node_put(np);
+	}
+
+	/* Set regulator to a non-NULL error value */
+	opp_table->regulator = ERR_PTR(-ENXIO);
+
+	/* Find clk for the device */
+	opp_table->clk = clk_get(dev, NULL);
+	if (IS_ERR(opp_table->clk)) {
+		ret = PTR_ERR(opp_table->clk);
+		if (ret != -EPROBE_DEFER)
+			dev_dbg(dev, "%s: Couldn't find clock: %d\n", __func__,
+				ret);
+	}
+
+	srcu_init_notifier_head(&opp_table->srcu_head);
+	INIT_LIST_HEAD(&opp_table->opp_list);
+
+	/* Secure the device table modification */
+	list_add_rcu(&opp_table->node, &opp_tables);
+	return opp_table;
+}
+
+/**
+ * _kfree_device_rcu() - Free opp_table RCU handler
+ * @head:	RCU head
+ */
+static void _kfree_device_rcu(struct rcu_head *head)
+{
+	struct opp_table *opp_table = container_of(head, struct opp_table,
+						   rcu_head);
+
+	kfree_rcu(opp_table, rcu_head);
+}
+
+/**
+ * _remove_opp_table() - Removes a OPP table
+ * @opp_table: OPP table to be removed.
+ *
+ * Removes/frees OPP table if it doesn't contain any OPPs.
+ */
+static void _remove_opp_table(struct opp_table *opp_table)
+{
+	struct opp_device *opp_dev;
+
+	if (!list_empty(&opp_table->opp_list))
+		return;
+
+	if (opp_table->supported_hw)
+		return;
+
+	if (opp_table->prop_name)
+		return;
+
+	if (!IS_ERR(opp_table->regulator))
+		return;
+
+	/* Release clk */
+	if (!IS_ERR(opp_table->clk))
+		clk_put(opp_table->clk);
+
+	opp_dev = list_first_entry(&opp_table->dev_list, struct opp_device,
+				   node);
+
+	_remove_opp_dev(opp_dev, opp_table);
+
+	/* dev_list must be empty now */
+	WARN_ON(!list_empty(&opp_table->dev_list));
+
+	list_del_rcu(&opp_table->node);
+	call_srcu(&opp_table->srcu_head.srcu, &opp_table->rcu_head,
+		  _kfree_device_rcu);
+}
+
+/**
+ * _kfree_opp_rcu() - Free OPP RCU handler
+ * @head:	RCU head
+ */
+static void _kfree_opp_rcu(struct rcu_head *head)
+{
+	struct dev_pm_opp *opp = container_of(head, struct dev_pm_opp, rcu_head);
+
+	kfree_rcu(opp, rcu_head);
+}
+
+/**
+ * _opp_remove()  - Remove an OPP from a table definition
+ * @opp_table:	points back to the opp_table struct this opp belongs to
+ * @opp:	pointer to the OPP to remove
+ * @notify:	OPP_EVENT_REMOVE notification should be sent or not
+ *
+ * This function removes an opp definition from the opp table.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * It is assumed that the caller holds required mutex for an RCU updater
+ * strategy.
+ */
+static void _opp_remove(struct opp_table *opp_table,
+			struct dev_pm_opp *opp, bool notify)
+{
+	/*
+	 * Notify the changes in the availability of the operable
+	 * frequency/voltage list.
+	 */
+	if (notify)
+		srcu_notifier_call_chain(&opp_table->srcu_head,
+					 OPP_EVENT_REMOVE, opp);
+	opp_debug_remove_one(opp);
+	list_del_rcu(&opp->node);
+	call_srcu(&opp_table->srcu_head.srcu, &opp->rcu_head, _kfree_opp_rcu);
+
+	_remove_opp_table(opp_table);
+}
+
+/**
+ * dev_pm_opp_remove()  - Remove an OPP from OPP table
+ * @dev:	device for which we do this operation
+ * @freq:	OPP to remove with matching 'freq'
+ *
+ * This function removes an opp from the opp table.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+void dev_pm_opp_remove(struct device *dev, unsigned long freq)
+{
+	struct dev_pm_opp *opp;
+	struct opp_table *opp_table;
+	bool found = false;
+
+	/* Hold our table modification lock here */
+	mutex_lock(&opp_table_lock);
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table))
+		goto unlock;
+
+	list_for_each_entry(opp, &opp_table->opp_list, node) {
+		if (opp->rate == freq) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found) {
+		dev_warn(dev, "%s: Couldn't find OPP with freq: %lu\n",
+			 __func__, freq);
+		goto unlock;
+	}
+
+	_opp_remove(opp_table, opp, true);
+unlock:
+	mutex_unlock(&opp_table_lock);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_remove);
+
+static struct dev_pm_opp *_allocate_opp(struct device *dev,
+					struct opp_table **opp_table)
+{
+	struct dev_pm_opp *opp;
+
+	/* allocate new OPP node */
+	opp = kzalloc(sizeof(*opp), GFP_KERNEL);
+	if (!opp)
+		return NULL;
+
+	INIT_LIST_HEAD(&opp->node);
+
+	*opp_table = _add_opp_table(dev);
+	if (!*opp_table) {
+		kfree(opp);
+		return NULL;
+	}
+
+	return opp;
+}
+
+static bool _opp_supported_by_regulators(struct dev_pm_opp *opp,
+					 struct opp_table *opp_table)
+{
+	struct regulator *reg = opp_table->regulator;
+
+	if (!IS_ERR(reg) &&
+	    !regulator_is_supported_voltage(reg, opp->u_volt_min,
+					    opp->u_volt_max)) {
+		pr_warn("%s: OPP minuV: %lu maxuV: %lu, not supported by regulator\n",
+			__func__, opp->u_volt_min, opp->u_volt_max);
+		return false;
+	}
+
+	return true;
+}
+
+static int _opp_add(struct device *dev, struct dev_pm_opp *new_opp,
+		    struct opp_table *opp_table)
+{
+	struct dev_pm_opp *opp;
+	struct list_head *head = &opp_table->opp_list;
+	int ret;
+
+	/*
+	 * Insert new OPP in order of increasing frequency and discard if
+	 * already present.
+	 *
+	 * Need to use &opp_table->opp_list in the condition part of the 'for'
+	 * loop, don't replace it with head otherwise it will become an infinite
+	 * loop.
+	 */
+	list_for_each_entry_rcu(opp, &opp_table->opp_list, node) {
+		if (new_opp->rate > opp->rate) {
+			head = &opp->node;
+			continue;
+		}
+
+		if (new_opp->rate < opp->rate)
+			break;
+
+		/* Duplicate OPPs */
+		dev_warn(dev, "%s: duplicate OPPs detected. Existing: freq: %lu, volt: %lu, enabled: %d. New: freq: %lu, volt: %lu, enabled: %d\n",
+			 __func__, opp->rate, opp->u_volt, opp->available,
+			 new_opp->rate, new_opp->u_volt, new_opp->available);
+
+		return opp->available && new_opp->u_volt == opp->u_volt ?
+			0 : -EEXIST;
+	}
+
+	new_opp->opp_table = opp_table;
+	list_add_rcu(&new_opp->node, head);
+
+	ret = opp_debug_create_one(new_opp, opp_table);
+	if (ret)
+		dev_err(dev, "%s: Failed to register opp to debugfs (%d)\n",
+			__func__, ret);
+
+	if (!_opp_supported_by_regulators(new_opp, opp_table)) {
+		new_opp->available = false;
+		dev_warn(dev, "%s: OPP not supported by regulators (%lu)\n",
+			 __func__, new_opp->rate);
+	}
+
+	return 0;
+}
+
+/**
+ * _opp_add_v1() - Allocate a OPP based on v1 bindings.
+ * @dev:	device for which we do this operation
+ * @freq:	Frequency in Hz for this OPP
+ * @u_volt:	Voltage in uVolts for this OPP
+ * @dynamic:	Dynamically added OPPs.
+ *
+ * This function adds an opp definition to the opp table and returns status.
+ * The opp is made available by default and it can be controlled using
+ * dev_pm_opp_enable/disable functions and may be removed by dev_pm_opp_remove.
+ *
+ * NOTE: "dynamic" parameter impacts OPPs added by the dev_pm_opp_of_add_table
+ * and freed by dev_pm_opp_of_remove_table.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ *
+ * Return:
+ * 0		On success OR
+ *		Duplicate OPPs (both freq and volt are same) and opp->available
+ * -EEXIST	Freq are same and volt are different OR
+ *		Duplicate OPPs (both freq and volt are same) and !opp->available
+ * -ENOMEM	Memory allocation failure
+ */
+static int _opp_add_v1(struct device *dev, unsigned long freq, long u_volt,
+		       bool dynamic)
+{
+	struct opp_table *opp_table;
+	struct dev_pm_opp *new_opp;
+	unsigned long tol;
+	int ret;
+
+	/* Hold our table modification lock here */
+	mutex_lock(&opp_table_lock);
+
+	new_opp = _allocate_opp(dev, &opp_table);
+	if (!new_opp) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
+	/* populate the opp table */
+	new_opp->rate = freq;
+	tol = u_volt * opp_table->voltage_tolerance_v1 / 100;
+	new_opp->u_volt = u_volt;
+	new_opp->u_volt_min = u_volt - tol;
+	new_opp->u_volt_max = u_volt + tol;
+	new_opp->available = true;
+	new_opp->dynamic = dynamic;
+
+	ret = _opp_add(dev, new_opp, opp_table);
+	if (ret)
+		goto free_opp;
+
+	mutex_unlock(&opp_table_lock);
+
+	/*
+	 * Notify the changes in the availability of the operable
+	 * frequency/voltage list.
+	 */
+	srcu_notifier_call_chain(&opp_table->srcu_head, OPP_EVENT_ADD, new_opp);
+	return 0;
+
+free_opp:
+	_opp_remove(opp_table, new_opp, false);
+unlock:
+	mutex_unlock(&opp_table_lock);
+	return ret;
+}
+
+/* TODO: Support multiple regulators */
+static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev,
+			      struct opp_table *opp_table)
+{
+	u32 microvolt[3] = {0};
+	u32 val;
+	int count, ret;
+	struct property *prop = NULL;
+	char name[NAME_MAX];
+
+	/* Search for "opp-microvolt-<name>" */
+	if (opp_table->prop_name) {
+		snprintf(name, sizeof(name), "opp-microvolt-%s",
+			 opp_table->prop_name);
+		prop = of_find_property(opp->np, name, NULL);
+	}
+
+	if (!prop) {
+		/* Search for "opp-microvolt" */
+		sprintf(name, "opp-microvolt");
+		prop = of_find_property(opp->np, name, NULL);
+
+		/* Missing property isn't a problem, but an invalid entry is */
+		if (!prop)
+			return 0;
+	}
+
+	count = of_property_count_u32_elems(opp->np, name);
+	if (count < 0) {
+		dev_err(dev, "%s: Invalid %s property (%d)\n",
+			__func__, name, count);
+		return count;
+	}
+
+	/* There can be one or three elements here */
+	if (count != 1 && count != 3) {
+		dev_err(dev, "%s: Invalid number of elements in %s property (%d)\n",
+			__func__, name, count);
+		return -EINVAL;
+	}
+
+	ret = of_property_read_u32_array(opp->np, name, microvolt, count);
+	if (ret) {
+		dev_err(dev, "%s: error parsing %s: %d\n", __func__, name, ret);
+		return -EINVAL;
+	}
+
+	opp->u_volt = microvolt[0];
+
+	if (count == 1) {
+		opp->u_volt_min = opp->u_volt;
+		opp->u_volt_max = opp->u_volt;
+	} else {
+		opp->u_volt_min = microvolt[1];
+		opp->u_volt_max = microvolt[2];
+	}
+
+	/* Search for "opp-microamp-<name>" */
+	prop = NULL;
+	if (opp_table->prop_name) {
+		snprintf(name, sizeof(name), "opp-microamp-%s",
+			 opp_table->prop_name);
+		prop = of_find_property(opp->np, name, NULL);
+	}
+
+	if (!prop) {
+		/* Search for "opp-microamp" */
+		sprintf(name, "opp-microamp");
+		prop = of_find_property(opp->np, name, NULL);
+	}
+
+	if (prop && !of_property_read_u32(opp->np, name, &val))
+		opp->u_amp = val;
+
+	return 0;
+}
+
+/**
+ * dev_pm_opp_set_supported_hw() - Set supported platforms
+ * @dev: Device for which supported-hw has to be set.
+ * @versions: Array of hierarchy of versions to match.
+ * @count: Number of elements in the array.
+ *
+ * This is required only for the V2 bindings, and it enables a platform to
+ * specify the hierarchy of versions it supports. OPP layer will then enable
+ * OPPs, which are available for those versions, based on its 'opp-supported-hw'
+ * property.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+int dev_pm_opp_set_supported_hw(struct device *dev, const u32 *versions,
+				unsigned int count)
+{
+	struct opp_table *opp_table;
+	int ret = 0;
+
+	/* Hold our table modification lock here */
+	mutex_lock(&opp_table_lock);
+
+	opp_table = _add_opp_table(dev);
+	if (!opp_table) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
+	/* Make sure there are no concurrent readers while updating opp_table */
+	WARN_ON(!list_empty(&opp_table->opp_list));
+
+	/* Do we already have a version hierarchy associated with opp_table? */
+	if (opp_table->supported_hw) {
+		dev_err(dev, "%s: Already have supported hardware list\n",
+			__func__);
+		ret = -EBUSY;
+		goto err;
+	}
+
+	opp_table->supported_hw = kmemdup(versions, count * sizeof(*versions),
+					GFP_KERNEL);
+	if (!opp_table->supported_hw) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	opp_table->supported_hw_count = count;
+	mutex_unlock(&opp_table_lock);
+	return 0;
+
+err:
+	_remove_opp_table(opp_table);
+unlock:
+	mutex_unlock(&opp_table_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_supported_hw);
+
+/**
+ * dev_pm_opp_put_supported_hw() - Releases resources blocked for supported hw
+ * @dev: Device for which supported-hw has to be put.
+ *
+ * This is required only for the V2 bindings, and is called for a matching
+ * dev_pm_opp_set_supported_hw(). Until this is called, the opp_table structure
+ * will not be freed.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+void dev_pm_opp_put_supported_hw(struct device *dev)
+{
+	struct opp_table *opp_table;
+
+	/* Hold our table modification lock here */
+	mutex_lock(&opp_table_lock);
+
+	/* Check for existing table for 'dev' first */
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table)) {
+		dev_err(dev, "Failed to find opp_table: %ld\n",
+			PTR_ERR(opp_table));
+		goto unlock;
+	}
+
+	/* Make sure there are no concurrent readers while updating opp_table */
+	WARN_ON(!list_empty(&opp_table->opp_list));
+
+	if (!opp_table->supported_hw) {
+		dev_err(dev, "%s: Doesn't have supported hardware list\n",
+			__func__);
+		goto unlock;
+	}
+
+	kfree(opp_table->supported_hw);
+	opp_table->supported_hw = NULL;
+	opp_table->supported_hw_count = 0;
+
+	/* Try freeing opp_table if this was the last blocking resource */
+	_remove_opp_table(opp_table);
+
+unlock:
+	mutex_unlock(&opp_table_lock);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_put_supported_hw);
+
+/**
+ * dev_pm_opp_set_prop_name() - Set prop-extn name
+ * @dev: Device for which the prop-name has to be set.
+ * @name: name to postfix to properties.
+ *
+ * This is required only for the V2 bindings, and it enables a platform to
+ * specify the extn to be used for certain property names. The properties to
+ * which the extension will apply are opp-microvolt and opp-microamp. OPP core
+ * should postfix the property name with -<name> while looking for them.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+int dev_pm_opp_set_prop_name(struct device *dev, const char *name)
+{
+	struct opp_table *opp_table;
+	int ret = 0;
+
+	/* Hold our table modification lock here */
+	mutex_lock(&opp_table_lock);
+
+	opp_table = _add_opp_table(dev);
+	if (!opp_table) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
+	/* Make sure there are no concurrent readers while updating opp_table */
+	WARN_ON(!list_empty(&opp_table->opp_list));
+
+	/* Do we already have a prop-name associated with opp_table? */
+	if (opp_table->prop_name) {
+		dev_err(dev, "%s: Already have prop-name %s\n", __func__,
+			opp_table->prop_name);
+		ret = -EBUSY;
+		goto err;
+	}
+
+	opp_table->prop_name = kstrdup(name, GFP_KERNEL);
+	if (!opp_table->prop_name) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	mutex_unlock(&opp_table_lock);
+	return 0;
+
+err:
+	_remove_opp_table(opp_table);
+unlock:
+	mutex_unlock(&opp_table_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_prop_name);
+
+/**
+ * dev_pm_opp_put_prop_name() - Releases resources blocked for prop-name
+ * @dev: Device for which the prop-name has to be put.
+ *
+ * This is required only for the V2 bindings, and is called for a matching
+ * dev_pm_opp_set_prop_name(). Until this is called, the opp_table structure
+ * will not be freed.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+void dev_pm_opp_put_prop_name(struct device *dev)
+{
+	struct opp_table *opp_table;
+
+	/* Hold our table modification lock here */
+	mutex_lock(&opp_table_lock);
+
+	/* Check for existing table for 'dev' first */
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table)) {
+		dev_err(dev, "Failed to find opp_table: %ld\n",
+			PTR_ERR(opp_table));
+		goto unlock;
+	}
+
+	/* Make sure there are no concurrent readers while updating opp_table */
+	WARN_ON(!list_empty(&opp_table->opp_list));
+
+	if (!opp_table->prop_name) {
+		dev_err(dev, "%s: Doesn't have a prop-name\n", __func__);
+		goto unlock;
+	}
+
+	kfree(opp_table->prop_name);
+	opp_table->prop_name = NULL;
+
+	/* Try freeing opp_table if this was the last blocking resource */
+	_remove_opp_table(opp_table);
+
+unlock:
+	mutex_unlock(&opp_table_lock);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name);
+
+/**
+ * dev_pm_opp_set_regulator() - Set regulator name for the device
+ * @dev: Device for which regulator name is being set.
+ * @name: Name of the regulator.
+ *
+ * In order to support OPP switching, OPP layer needs to know the name of the
+ * device's regulator, as the core would be required to switch voltages as well.
+ *
+ * This must be called before any OPPs are initialized for the device.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+int dev_pm_opp_set_regulator(struct device *dev, const char *name)
+{
+	struct opp_table *opp_table;
+	struct regulator *reg;
+	int ret;
+
+	mutex_lock(&opp_table_lock);
+
+	opp_table = _add_opp_table(dev);
+	if (!opp_table) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
+	/* This should be called before OPPs are initialized */
+	if (WARN_ON(!list_empty(&opp_table->opp_list))) {
+		ret = -EBUSY;
+		goto err;
+	}
+
+	/* Already have a regulator set */
+	if (WARN_ON(!IS_ERR(opp_table->regulator))) {
+		ret = -EBUSY;
+		goto err;
+	}
+	/* Allocate the regulator */
+	reg = regulator_get_optional(dev, name);
+	if (IS_ERR(reg)) {
+		ret = PTR_ERR(reg);
+		if (ret != -EPROBE_DEFER)
+			dev_err(dev, "%s: no regulator (%s) found: %d\n",
+				__func__, name, ret);
+		goto err;
+	}
+
+	opp_table->regulator = reg;
+
+	mutex_unlock(&opp_table_lock);
+	return 0;
+
+err:
+	_remove_opp_table(opp_table);
+unlock:
+	mutex_unlock(&opp_table_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulator);
+
+/**
+ * dev_pm_opp_put_regulator() - Releases resources blocked for regulator
+ * @dev: Device for which regulator was set.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+void dev_pm_opp_put_regulator(struct device *dev)
+{
+	struct opp_table *opp_table;
+
+	mutex_lock(&opp_table_lock);
+
+	/* Check for existing table for 'dev' first */
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table)) {
+		dev_err(dev, "Failed to find opp_table: %ld\n",
+			PTR_ERR(opp_table));
+		goto unlock;
+	}
+
+	if (IS_ERR(opp_table->regulator)) {
+		dev_err(dev, "%s: Doesn't have regulator set\n", __func__);
+		goto unlock;
+	}
+
+	/* Make sure there are no concurrent readers while updating opp_table */
+	WARN_ON(!list_empty(&opp_table->opp_list));
+
+	regulator_put(opp_table->regulator);
+	opp_table->regulator = ERR_PTR(-ENXIO);
+
+	/* Try freeing opp_table if this was the last blocking resource */
+	_remove_opp_table(opp_table);
+
+unlock:
+	mutex_unlock(&opp_table_lock);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_put_regulator);
+
+static bool _opp_is_supported(struct device *dev, struct opp_table *opp_table,
+			      struct device_node *np)
+{
+	unsigned int count = opp_table->supported_hw_count;
+	u32 version;
+	int ret;
+
+	if (!opp_table->supported_hw)
+		return true;
+
+	while (count--) {
+		ret = of_property_read_u32_index(np, "opp-supported-hw", count,
+						 &version);
+		if (ret) {
+			dev_warn(dev, "%s: failed to read opp-supported-hw property at index %d: %d\n",
+				 __func__, count, ret);
+			return false;
+		}
+
+		/* Both of these are bitwise masks of the versions */
+		if (!(version & opp_table->supported_hw[count]))
+			return false;
+	}
+
+	return true;
+}
+
+/**
+ * _opp_add_static_v2() - Allocate static OPPs (As per 'v2' DT bindings)
+ * @dev:	device for which we do this operation
+ * @np:		device node
+ *
+ * This function adds an opp definition to the opp table and returns status. The
+ * opp can be controlled using dev_pm_opp_enable/disable functions and may be
+ * removed by dev_pm_opp_remove.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ *
+ * Return:
+ * 0		On success OR
+ *		Duplicate OPPs (both freq and volt are same) and opp->available
+ * -EEXIST	Freq are same and volt are different OR
+ *		Duplicate OPPs (both freq and volt are same) and !opp->available
+ * -ENOMEM	Memory allocation failure
+ * -EINVAL	Failed parsing the OPP node
+ */
+static int _opp_add_static_v2(struct device *dev, struct device_node *np)
+{
+	struct opp_table *opp_table;
+	struct dev_pm_opp *new_opp;
+	u64 rate;
+	u32 val;
+	int ret;
+
+	/* Hold our table modification lock here */
+	mutex_lock(&opp_table_lock);
+
+	new_opp = _allocate_opp(dev, &opp_table);
+	if (!new_opp) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
+	ret = of_property_read_u64(np, "opp-hz", &rate);
+	if (ret < 0) {
+		dev_err(dev, "%s: opp-hz not found\n", __func__);
+		goto free_opp;
+	}
+
+	/* Check if the OPP supports hardware's hierarchy of versions or not */
+	if (!_opp_is_supported(dev, opp_table, np)) {
+		dev_dbg(dev, "OPP not supported by hardware: %llu\n", rate);
+		goto free_opp;
+	}
+
+	/*
+	 * Rate is defined as an unsigned long in clk API, and so casting
+	 * explicitly to its type. Must be fixed once rate is 64 bit
+	 * guaranteed in clk API.
+	 */
+	new_opp->rate = (unsigned long)rate;
+	new_opp->turbo = of_property_read_bool(np, "turbo-mode");
+
+	new_opp->np = np;
+	new_opp->dynamic = false;
+	new_opp->available = true;
+
+	if (!of_property_read_u32(np, "clock-latency-ns", &val))
+		new_opp->clock_latency_ns = val;
+
+	ret = opp_parse_supplies(new_opp, dev, opp_table);
+	if (ret)
+		goto free_opp;
+
+	ret = _opp_add(dev, new_opp, opp_table);
+	if (ret)
+		goto free_opp;
+
+	/* OPP to select on device suspend */
+	if (of_property_read_bool(np, "opp-suspend")) {
+		if (opp_table->suspend_opp) {
+			dev_warn(dev, "%s: Multiple suspend OPPs found (%lu %lu)\n",
+				 __func__, opp_table->suspend_opp->rate,
+				 new_opp->rate);
+		} else {
+			new_opp->suspend = true;
+			opp_table->suspend_opp = new_opp;
+		}
+	}
+
+	if (new_opp->clock_latency_ns > opp_table->clock_latency_ns_max)
+		opp_table->clock_latency_ns_max = new_opp->clock_latency_ns;
+
+	mutex_unlock(&opp_table_lock);
+
+	pr_debug("%s: turbo:%d rate:%lu uv:%lu uvmin:%lu uvmax:%lu latency:%lu\n",
+		 __func__, new_opp->turbo, new_opp->rate, new_opp->u_volt,
+		 new_opp->u_volt_min, new_opp->u_volt_max,
+		 new_opp->clock_latency_ns);
+
+	/*
+	 * Notify the changes in the availability of the operable
+	 * frequency/voltage list.
+	 */
+	srcu_notifier_call_chain(&opp_table->srcu_head, OPP_EVENT_ADD, new_opp);
+	return 0;
+
+free_opp:
+	_opp_remove(opp_table, new_opp, false);
+unlock:
+	mutex_unlock(&opp_table_lock);
+	return ret;
+}
+
+/**
+ * dev_pm_opp_add()  - Add an OPP table from a table definitions
+ * @dev:	device for which we do this operation
+ * @freq:	Frequency in Hz for this OPP
+ * @u_volt:	Voltage in uVolts for this OPP
+ *
+ * This function adds an opp definition to the opp table and returns status.
+ * The opp is made available by default and it can be controlled using
+ * dev_pm_opp_enable/disable functions.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ *
+ * Return:
+ * 0		On success OR
+ *		Duplicate OPPs (both freq and volt are same) and opp->available
+ * -EEXIST	Freq are same and volt are different OR
+ *		Duplicate OPPs (both freq and volt are same) and !opp->available
+ * -ENOMEM	Memory allocation failure
+ */
+int dev_pm_opp_add(struct device *dev, unsigned long freq, unsigned long u_volt)
+{
+	return _opp_add_v1(dev, freq, u_volt, true);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_add);
+
+/**
+ * _opp_set_availability() - helper to set the availability of an opp
+ * @dev:		device for which we do this operation
+ * @freq:		OPP frequency to modify availability
+ * @availability_req:	availability status requested for this opp
+ *
+ * Set the availability of an OPP with an RCU operation, opp_{enable,disable}
+ * share a common logic which is isolated here.
+ *
+ * Return: -EINVAL for bad pointers, -ENOMEM if no memory available for the
+ * copy operation, returns 0 if no modification was done OR modification was
+ * successful.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks to
+ * keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex locking or synchronize_rcu() blocking calls cannot be used.
+ */
+static int _opp_set_availability(struct device *dev, unsigned long freq,
+				 bool availability_req)
+{
+	struct opp_table *opp_table;
+	struct dev_pm_opp *new_opp, *tmp_opp, *opp = ERR_PTR(-ENODEV);
+	int r = 0;
+
+	/* keep the node allocated */
+	new_opp = kmalloc(sizeof(*new_opp), GFP_KERNEL);
+	if (!new_opp)
+		return -ENOMEM;
+
+	mutex_lock(&opp_table_lock);
+
+	/* Find the opp_table */
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table)) {
+		r = PTR_ERR(opp_table);
+		dev_warn(dev, "%s: Device OPP not found (%d)\n", __func__, r);
+		goto unlock;
+	}
+
+	/* Do we have the frequency? */
+	list_for_each_entry(tmp_opp, &opp_table->opp_list, node) {
+		if (tmp_opp->rate == freq) {
+			opp = tmp_opp;
+			break;
+		}
+	}
+	if (IS_ERR(opp)) {
+		r = PTR_ERR(opp);
+		goto unlock;
+	}
+
+	/* Is update really needed? */
+	if (opp->available == availability_req)
+		goto unlock;
+	/* copy the old data over */
+	*new_opp = *opp;
+
+	/* plug in new node */
+	new_opp->available = availability_req;
+
+	list_replace_rcu(&opp->node, &new_opp->node);
+	mutex_unlock(&opp_table_lock);
+	call_srcu(&opp_table->srcu_head.srcu, &opp->rcu_head, _kfree_opp_rcu);
+
+	/* Notify the change of the OPP availability */
+	if (availability_req)
+		srcu_notifier_call_chain(&opp_table->srcu_head,
+					 OPP_EVENT_ENABLE, new_opp);
+	else
+		srcu_notifier_call_chain(&opp_table->srcu_head,
+					 OPP_EVENT_DISABLE, new_opp);
+
+	return 0;
+
+unlock:
+	mutex_unlock(&opp_table_lock);
+	kfree(new_opp);
+	return r;
+}
+
+/**
+ * dev_pm_opp_enable() - Enable a specific OPP
+ * @dev:	device for which we do this operation
+ * @freq:	OPP frequency to enable
+ *
+ * Enables a provided opp. If the operation is valid, this returns 0, else the
+ * corresponding error value. It is meant to be used for users an OPP available
+ * after being temporarily made unavailable with dev_pm_opp_disable.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function indirectly uses RCU and mutex locks to keep the
+ * integrity of the internal data structures. Callers should ensure that
+ * this function is *NOT* called under RCU protection or in contexts where
+ * mutex locking or synchronize_rcu() blocking calls cannot be used.
+ *
+ * Return: -EINVAL for bad pointers, -ENOMEM if no memory available for the
+ * copy operation, returns 0 if no modification was done OR modification was
+ * successful.
+ */
+int dev_pm_opp_enable(struct device *dev, unsigned long freq)
+{
+	return _opp_set_availability(dev, freq, true);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_enable);
+
+/**
+ * dev_pm_opp_disable() - Disable a specific OPP
+ * @dev:	device for which we do this operation
+ * @freq:	OPP frequency to disable
+ *
+ * Disables a provided opp. If the operation is valid, this returns
+ * 0, else the corresponding error value. It is meant to be a temporary
+ * control by users to make this OPP not available until the circumstances are
+ * right to make it available again (with a call to dev_pm_opp_enable).
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function indirectly uses RCU and mutex locks to keep the
+ * integrity of the internal data structures. Callers should ensure that
+ * this function is *NOT* called under RCU protection or in contexts where
+ * mutex locking or synchronize_rcu() blocking calls cannot be used.
+ *
+ * Return: -EINVAL for bad pointers, -ENOMEM if no memory available for the
+ * copy operation, returns 0 if no modification was done OR modification was
+ * successful.
+ */
+int dev_pm_opp_disable(struct device *dev, unsigned long freq)
+{
+	return _opp_set_availability(dev, freq, false);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_disable);
+
+/**
+ * dev_pm_opp_get_notifier() - find notifier_head of the device with opp
+ * @dev:	device pointer used to lookup OPP table.
+ *
+ * Return: pointer to  notifier head if found, otherwise -ENODEV or
+ * -EINVAL based on type of error casted as pointer. value must be checked
+ *  with IS_ERR to determine valid pointer or error result.
+ *
+ * Locking: This function must be called under rcu_read_lock(). opp_table is a
+ * RCU protected pointer. The reason for the same is that the opp pointer which
+ * is returned will remain valid for use with opp_get_{voltage, freq} only while
+ * under the locked area. The pointer returned must be used prior to unlocking
+ * with rcu_read_unlock() to maintain the integrity of the pointer.
+ */
+struct srcu_notifier_head *dev_pm_opp_get_notifier(struct device *dev)
+{
+	struct opp_table *opp_table = _find_opp_table(dev);
+
+	if (IS_ERR(opp_table))
+		return ERR_CAST(opp_table); /* matching type */
+
+	return &opp_table->srcu_head;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_notifier);
+
+#ifdef CONFIG_OF
+/**
+ * dev_pm_opp_of_remove_table() - Free OPP table entries created from static DT
+ *				  entries
+ * @dev:	device pointer used to lookup OPP table.
+ *
+ * Free OPPs created using static entries present in DT.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function indirectly uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+void dev_pm_opp_of_remove_table(struct device *dev)
+{
+	struct opp_table *opp_table;
+	struct dev_pm_opp *opp, *tmp;
+
+	/* Hold our table modification lock here */
+	mutex_lock(&opp_table_lock);
+
+	/* Check for existing table for 'dev' */
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table)) {
+		int error = PTR_ERR(opp_table);
+
+		if (error != -ENODEV)
+			WARN(1, "%s: opp_table: %d\n",
+			     IS_ERR_OR_NULL(dev) ?
+					"Invalid device" : dev_name(dev),
+			     error);
+		goto unlock;
+	}
+
+	/* Find if opp_table manages a single device */
+	if (list_is_singular(&opp_table->dev_list)) {
+		/* Free static OPPs */
+		list_for_each_entry_safe(opp, tmp, &opp_table->opp_list, node) {
+			if (!opp->dynamic)
+				_opp_remove(opp_table, opp, true);
+		}
+	} else {
+		_remove_opp_dev(_find_opp_dev(dev, opp_table), opp_table);
+	}
+
+unlock:
+	mutex_unlock(&opp_table_lock);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_of_remove_table);
+
+/* Returns opp descriptor node for a device, caller must do of_node_put() */
+struct device_node *_of_get_opp_desc_node(struct device *dev)
+{
+	/*
+	 * TODO: Support for multiple OPP tables.
+	 *
+	 * There should be only ONE phandle present in "operating-points-v2"
+	 * property.
+	 */
+
+	return of_parse_phandle(dev->of_node, "operating-points-v2", 0);
+}
+
+/* Initializes OPP tables based on new bindings */
+static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np)
+{
+	struct device_node *np;
+	struct opp_table *opp_table;
+	int ret = 0, count = 0;
+
+	mutex_lock(&opp_table_lock);
+
+	opp_table = _managed_opp(opp_np);
+	if (opp_table) {
+		/* OPPs are already managed */
+		if (!_add_opp_dev(dev, opp_table))
+			ret = -ENOMEM;
+		mutex_unlock(&opp_table_lock);
+		return ret;
+	}
+	mutex_unlock(&opp_table_lock);
+
+	/* We have opp-table node now, iterate over it and add OPPs */
+	for_each_available_child_of_node(opp_np, np) {
+		count++;
+
+		ret = _opp_add_static_v2(dev, np);
+		if (ret) {
+			dev_err(dev, "%s: Failed to add OPP, %d\n", __func__,
+				ret);
+			goto free_table;
+		}
+	}
+
+	/* There should be one of more OPP defined */
+	if (WARN_ON(!count))
+		return -ENOENT;
+
+	mutex_lock(&opp_table_lock);
+
+	opp_table = _find_opp_table(dev);
+	if (WARN_ON(IS_ERR(opp_table))) {
+		ret = PTR_ERR(opp_table);
+		mutex_unlock(&opp_table_lock);
+		goto free_table;
+	}
+
+	opp_table->np = opp_np;
+	opp_table->shared_opp = of_property_read_bool(opp_np, "opp-shared");
+
+	mutex_unlock(&opp_table_lock);
+
+	return 0;
+
+free_table:
+	dev_pm_opp_of_remove_table(dev);
+
+	return ret;
+}
+
+/* Initializes OPP tables based on old-deprecated bindings */
+static int _of_add_opp_table_v1(struct device *dev)
+{
+	const struct property *prop;
+	const __be32 *val;
+	int nr;
+
+	prop = of_find_property(dev->of_node, "operating-points", NULL);
+	if (!prop)
+		return -ENODEV;
+	if (!prop->value)
+		return -ENODATA;
+
+	/*
+	 * Each OPP is a set of tuples consisting of frequency and
+	 * voltage like <freq-kHz vol-uV>.
+	 */
+	nr = prop->length / sizeof(u32);
+	if (nr % 2) {
+		dev_err(dev, "%s: Invalid OPP table\n", __func__);
+		return -EINVAL;
+	}
+
+	val = prop->value;
+	while (nr) {
+		unsigned long freq = be32_to_cpup(val++) * 1000;
+		unsigned long volt = be32_to_cpup(val++);
+
+		if (_opp_add_v1(dev, freq, volt, false))
+			dev_warn(dev, "%s: Failed to add OPP %ld\n",
+				 __func__, freq);
+		nr -= 2;
+	}
+
+	return 0;
+}
+
+/**
+ * dev_pm_opp_of_add_table() - Initialize opp table from device tree
+ * @dev:	device pointer used to lookup OPP table.
+ *
+ * Register the initial OPP table with the OPP library for given device.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function indirectly uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ *
+ * Return:
+ * 0		On success OR
+ *		Duplicate OPPs (both freq and volt are same) and opp->available
+ * -EEXIST	Freq are same and volt are different OR
+ *		Duplicate OPPs (both freq and volt are same) and !opp->available
+ * -ENOMEM	Memory allocation failure
+ * -ENODEV	when 'operating-points' property is not found or is invalid data
+ *		in device node.
+ * -ENODATA	when empty 'operating-points' property is found
+ * -EINVAL	when invalid entries are found in opp-v2 table
+ */
+int dev_pm_opp_of_add_table(struct device *dev)
+{
+	struct device_node *opp_np;
+	int ret;
+
+	/*
+	 * OPPs have two version of bindings now. The older one is deprecated,
+	 * try for the new binding first.
+	 */
+	opp_np = _of_get_opp_desc_node(dev);
+	if (!opp_np) {
+		/*
+		 * Try old-deprecated bindings for backward compatibility with
+		 * older dtbs.
+		 */
+		return _of_add_opp_table_v1(dev);
+	}
+
+	ret = _of_add_opp_table_v2(dev, opp_np);
+	of_node_put(opp_np);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table);
+#endif
diff --git a/drivers/base/power/opp/cpu.c b/drivers/base/power/opp/cpu.c
new file mode 100644
index 000000000000..ba2bdbd932ef
--- /dev/null
+++ b/drivers/base/power/opp/cpu.c
@@ -0,0 +1,271 @@
+/*
+ * Generic OPP helper interface for CPU device
+ *
+ * Copyright (C) 2009-2014 Texas Instruments Incorporated.
+ *	Nishanth Menon
+ *	Romit Dasgupta
+ *	Kevin Hilman
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/cpufreq.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+
+#include "opp.h"
+
+#ifdef CONFIG_CPU_FREQ
+
+/**
+ * dev_pm_opp_init_cpufreq_table() - create a cpufreq table for a device
+ * @dev:	device for which we do this operation
+ * @table:	Cpufreq table returned back to caller
+ *
+ * Generate a cpufreq table for a provided device- this assumes that the
+ * opp table is already initialized and ready for usage.
+ *
+ * This function allocates required memory for the cpufreq table. It is
+ * expected that the caller does the required maintenance such as freeing
+ * the table as required.
+ *
+ * Returns -EINVAL for bad pointers, -ENODEV if the device is not found, -ENOMEM
+ * if no memory available for the operation (table is not populated), returns 0
+ * if successful and table is populated.
+ *
+ * WARNING: It is  important for the callers to ensure refreshing their copy of
+ * the table if any of the mentioned functions have been invoked in the interim.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Since we just use the regular accessor functions to access the internal data
+ * structures, we use RCU read lock inside this function. As a result, users of
+ * this function DONOT need to use explicit locks for invoking.
+ */
+int dev_pm_opp_init_cpufreq_table(struct device *dev,
+				  struct cpufreq_frequency_table **table)
+{
+	struct dev_pm_opp *opp;
+	struct cpufreq_frequency_table *freq_table = NULL;
+	int i, max_opps, ret = 0;
+	unsigned long rate;
+
+	rcu_read_lock();
+
+	max_opps = dev_pm_opp_get_opp_count(dev);
+	if (max_opps <= 0) {
+		ret = max_opps ? max_opps : -ENODATA;
+		goto out;
+	}
+
+	freq_table = kcalloc((max_opps + 1), sizeof(*freq_table), GFP_ATOMIC);
+	if (!freq_table) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0, rate = 0; i < max_opps; i++, rate++) {
+		/* find next rate */
+		opp = dev_pm_opp_find_freq_ceil(dev, &rate);
+		if (IS_ERR(opp)) {
+			ret = PTR_ERR(opp);
+			goto out;
+		}
+		freq_table[i].driver_data = i;
+		freq_table[i].frequency = rate / 1000;
+
+		/* Is Boost/turbo opp ? */
+		if (dev_pm_opp_is_turbo(opp))
+			freq_table[i].flags = CPUFREQ_BOOST_FREQ;
+	}
+
+	freq_table[i].driver_data = i;
+	freq_table[i].frequency = CPUFREQ_TABLE_END;
+
+	*table = &freq_table[0];
+
+out:
+	rcu_read_unlock();
+	if (ret)
+		kfree(freq_table);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_init_cpufreq_table);
+
+/**
+ * dev_pm_opp_free_cpufreq_table() - free the cpufreq table
+ * @dev:	device for which we do this operation
+ * @table:	table to free
+ *
+ * Free up the table allocated by dev_pm_opp_init_cpufreq_table
+ */
+void dev_pm_opp_free_cpufreq_table(struct device *dev,
+				   struct cpufreq_frequency_table **table)
+{
+	if (!table)
+		return;
+
+	kfree(*table);
+	*table = NULL;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_free_cpufreq_table);
+#endif	/* CONFIG_CPU_FREQ */
+
+/* Required only for V1 bindings, as v2 can manage it from DT itself */
+int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, cpumask_var_t cpumask)
+{
+	struct opp_device *opp_dev;
+	struct opp_table *opp_table;
+	struct device *dev;
+	int cpu, ret = 0;
+
+	mutex_lock(&opp_table_lock);
+
+	opp_table = _find_opp_table(cpu_dev);
+	if (IS_ERR(opp_table)) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	for_each_cpu(cpu, cpumask) {
+		if (cpu == cpu_dev->id)
+			continue;
+
+		dev = get_cpu_device(cpu);
+		if (!dev) {
+			dev_err(cpu_dev, "%s: failed to get cpu%d device\n",
+				__func__, cpu);
+			continue;
+		}
+
+		opp_dev = _add_opp_dev(dev, opp_table);
+		if (!opp_dev) {
+			dev_err(dev, "%s: failed to add opp-dev for cpu%d device\n",
+				__func__, cpu);
+			continue;
+		}
+	}
+unlock:
+	mutex_unlock(&opp_table_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_sharing_cpus);
+
+#ifdef CONFIG_OF
+void dev_pm_opp_of_cpumask_remove_table(cpumask_var_t cpumask)
+{
+	struct device *cpu_dev;
+	int cpu;
+
+	WARN_ON(cpumask_empty(cpumask));
+
+	for_each_cpu(cpu, cpumask) {
+		cpu_dev = get_cpu_device(cpu);
+		if (!cpu_dev) {
+			pr_err("%s: failed to get cpu%d device\n", __func__,
+			       cpu);
+			continue;
+		}
+
+		dev_pm_opp_of_remove_table(cpu_dev);
+	}
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_of_cpumask_remove_table);
+
+int dev_pm_opp_of_cpumask_add_table(cpumask_var_t cpumask)
+{
+	struct device *cpu_dev;
+	int cpu, ret = 0;
+
+	WARN_ON(cpumask_empty(cpumask));
+
+	for_each_cpu(cpu, cpumask) {
+		cpu_dev = get_cpu_device(cpu);
+		if (!cpu_dev) {
+			pr_err("%s: failed to get cpu%d device\n", __func__,
+			       cpu);
+			continue;
+		}
+
+		ret = dev_pm_opp_of_add_table(cpu_dev);
+		if (ret) {
+			pr_err("%s: couldn't find opp table for cpu:%d, %d\n",
+			       __func__, cpu, ret);
+
+			/* Free all other OPPs */
+			dev_pm_opp_of_cpumask_remove_table(cpumask);
+			break;
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_of_cpumask_add_table);
+
+/*
+ * Works only for OPP v2 bindings.
+ *
+ * Returns -ENOENT if operating-points-v2 bindings aren't supported.
+ */
+int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev, cpumask_var_t cpumask)
+{
+	struct device_node *np, *tmp_np;
+	struct device *tcpu_dev;
+	int cpu, ret = 0;
+
+	/* Get OPP descriptor node */
+	np = _of_get_opp_desc_node(cpu_dev);
+	if (!np) {
+		dev_dbg(cpu_dev, "%s: Couldn't find cpu_dev node.\n", __func__);
+		return -ENOENT;
+	}
+
+	cpumask_set_cpu(cpu_dev->id, cpumask);
+
+	/* OPPs are shared ? */
+	if (!of_property_read_bool(np, "opp-shared"))
+		goto put_cpu_node;
+
+	for_each_possible_cpu(cpu) {
+		if (cpu == cpu_dev->id)
+			continue;
+
+		tcpu_dev = get_cpu_device(cpu);
+		if (!tcpu_dev) {
+			dev_err(cpu_dev, "%s: failed to get cpu%d device\n",
+				__func__, cpu);
+			ret = -ENODEV;
+			goto put_cpu_node;
+		}
+
+		/* Get OPP descriptor node */
+		tmp_np = _of_get_opp_desc_node(tcpu_dev);
+		if (!tmp_np) {
+			dev_err(tcpu_dev, "%s: Couldn't find tcpu_dev node.\n",
+				__func__);
+			ret = -ENOENT;
+			goto put_cpu_node;
+		}
+
+		/* CPUs are sharing opp node */
+		if (np == tmp_np)
+			cpumask_set_cpu(cpu, cpumask);
+
+		of_node_put(tmp_np);
+	}
+
+put_cpu_node:
+	of_node_put(np);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_of_get_sharing_cpus);
+#endif
diff --git a/drivers/base/power/opp/debugfs.c b/drivers/base/power/opp/debugfs.c
new file mode 100644
index 000000000000..ef1ae6b52042
--- /dev/null
+++ b/drivers/base/power/opp/debugfs.c
@@ -0,0 +1,218 @@
+/*
+ * Generic OPP debugfs interface
+ *
+ * Copyright (C) 2015-2016 Viresh Kumar <viresh.kumar@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/limits.h>
+
+#include "opp.h"
+
+static struct dentry *rootdir;
+
+static void opp_set_dev_name(const struct device *dev, char *name)
+{
+	if (dev->parent)
+		snprintf(name, NAME_MAX, "%s-%s", dev_name(dev->parent),
+			 dev_name(dev));
+	else
+		snprintf(name, NAME_MAX, "%s", dev_name(dev));
+}
+
+void opp_debug_remove_one(struct dev_pm_opp *opp)
+{
+	debugfs_remove_recursive(opp->dentry);
+}
+
+int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table)
+{
+	struct dentry *pdentry = opp_table->dentry;
+	struct dentry *d;
+	char name[25];	/* 20 chars for 64 bit value + 5 (opp:\0) */
+
+	/* Rate is unique to each OPP, use it to give opp-name */
+	snprintf(name, sizeof(name), "opp:%lu", opp->rate);
+
+	/* Create per-opp directory */
+	d = debugfs_create_dir(name, pdentry);
+	if (!d)
+		return -ENOMEM;
+
+	if (!debugfs_create_bool("available", S_IRUGO, d, &opp->available))
+		return -ENOMEM;
+
+	if (!debugfs_create_bool("dynamic", S_IRUGO, d, &opp->dynamic))
+		return -ENOMEM;
+
+	if (!debugfs_create_bool("turbo", S_IRUGO, d, &opp->turbo))
+		return -ENOMEM;
+
+	if (!debugfs_create_bool("suspend", S_IRUGO, d, &opp->suspend))
+		return -ENOMEM;
+
+	if (!debugfs_create_ulong("rate_hz", S_IRUGO, d, &opp->rate))
+		return -ENOMEM;
+
+	if (!debugfs_create_ulong("u_volt_target", S_IRUGO, d, &opp->u_volt))
+		return -ENOMEM;
+
+	if (!debugfs_create_ulong("u_volt_min", S_IRUGO, d, &opp->u_volt_min))
+		return -ENOMEM;
+
+	if (!debugfs_create_ulong("u_volt_max", S_IRUGO, d, &opp->u_volt_max))
+		return -ENOMEM;
+
+	if (!debugfs_create_ulong("u_amp", S_IRUGO, d, &opp->u_amp))
+		return -ENOMEM;
+
+	if (!debugfs_create_ulong("clock_latency_ns", S_IRUGO, d,
+				  &opp->clock_latency_ns))
+		return -ENOMEM;
+
+	opp->dentry = d;
+	return 0;
+}
+
+static int opp_list_debug_create_dir(struct opp_device *opp_dev,
+				     struct opp_table *opp_table)
+{
+	const struct device *dev = opp_dev->dev;
+	struct dentry *d;
+
+	opp_set_dev_name(dev, opp_table->dentry_name);
+
+	/* Create device specific directory */
+	d = debugfs_create_dir(opp_table->dentry_name, rootdir);
+	if (!d) {
+		dev_err(dev, "%s: Failed to create debugfs dir\n", __func__);
+		return -ENOMEM;
+	}
+
+	opp_dev->dentry = d;
+	opp_table->dentry = d;
+
+	return 0;
+}
+
+static int opp_list_debug_create_link(struct opp_device *opp_dev,
+				      struct opp_table *opp_table)
+{
+	const struct device *dev = opp_dev->dev;
+	char name[NAME_MAX];
+	struct dentry *d;
+
+	opp_set_dev_name(opp_dev->dev, name);
+
+	/* Create device specific directory link */
+	d = debugfs_create_symlink(name, rootdir, opp_table->dentry_name);
+	if (!d) {
+		dev_err(dev, "%s: Failed to create link\n", __func__);
+		return -ENOMEM;
+	}
+
+	opp_dev->dentry = d;
+
+	return 0;
+}
+
+/**
+ * opp_debug_register - add a device opp node to the debugfs 'opp' directory
+ * @opp_dev: opp-dev pointer for device
+ * @opp_table: the device-opp being added
+ *
+ * Dynamically adds device specific directory in debugfs 'opp' directory. If the
+ * device-opp is shared with other devices, then links will be created for all
+ * devices except the first.
+ *
+ * Return: 0 on success, otherwise negative error.
+ */
+int opp_debug_register(struct opp_device *opp_dev, struct opp_table *opp_table)
+{
+	if (!rootdir) {
+		pr_debug("%s: Uninitialized rootdir\n", __func__);
+		return -EINVAL;
+	}
+
+	if (opp_table->dentry)
+		return opp_list_debug_create_link(opp_dev, opp_table);
+
+	return opp_list_debug_create_dir(opp_dev, opp_table);
+}
+
+static void opp_migrate_dentry(struct opp_device *opp_dev,
+			       struct opp_table *opp_table)
+{
+	struct opp_device *new_dev;
+	const struct device *dev;
+	struct dentry *dentry;
+
+	/* Look for next opp-dev */
+	list_for_each_entry(new_dev, &opp_table->dev_list, node)
+		if (new_dev != opp_dev)
+			break;
+
+	/* new_dev is guaranteed to be valid here */
+	dev = new_dev->dev;
+	debugfs_remove_recursive(new_dev->dentry);
+
+	opp_set_dev_name(dev, opp_table->dentry_name);
+
+	dentry = debugfs_rename(rootdir, opp_dev->dentry, rootdir,
+				opp_table->dentry_name);
+	if (!dentry) {
+		dev_err(dev, "%s: Failed to rename link from: %s to %s\n",
+			__func__, dev_name(opp_dev->dev), dev_name(dev));
+		return;
+	}
+
+	new_dev->dentry = dentry;
+	opp_table->dentry = dentry;
+}
+
+/**
+ * opp_debug_unregister - remove a device opp node from debugfs opp directory
+ * @opp_dev: opp-dev pointer for device
+ * @opp_table: the device-opp being removed
+ *
+ * Dynamically removes device specific directory from debugfs 'opp' directory.
+ */
+void opp_debug_unregister(struct opp_device *opp_dev,
+			  struct opp_table *opp_table)
+{
+	if (opp_dev->dentry == opp_table->dentry) {
+		/* Move the real dentry object under another device */
+		if (!list_is_singular(&opp_table->dev_list)) {
+			opp_migrate_dentry(opp_dev, opp_table);
+			goto out;
+		}
+		opp_table->dentry = NULL;
+	}
+
+	debugfs_remove_recursive(opp_dev->dentry);
+
+out:
+	opp_dev->dentry = NULL;
+}
+
+static int __init opp_debug_init(void)
+{
+	/* Create /sys/kernel/debug/opp directory */
+	rootdir = debugfs_create_dir("opp", NULL);
+	if (!rootdir) {
+		pr_err("%s: Failed to create root directory\n", __func__);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+core_initcall(opp_debug_init);
diff --git a/drivers/base/power/opp/opp.h b/drivers/base/power/opp/opp.h
new file mode 100644
index 000000000000..f67f806fcf3a
--- /dev/null
+++ b/drivers/base/power/opp/opp.h
@@ -0,0 +1,207 @@
+/*
+ * Generic OPP Interface
+ *
+ * Copyright (C) 2009-2010 Texas Instruments Incorporated.
+ *	Nishanth Menon
+ *	Romit Dasgupta
+ *	Kevin Hilman
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __DRIVER_OPP_H__
+#define __DRIVER_OPP_H__
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/limits.h>
+#include <linux/pm_opp.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+
+struct clk;
+struct regulator;
+
+/* Lock to allow exclusive modification to the device and opp lists */
+extern struct mutex opp_table_lock;
+
+/*
+ * Internal data structure organization with the OPP layer library is as
+ * follows:
+ * opp_tables (root)
+ *	|- device 1 (represents voltage domain 1)
+ *	|	|- opp 1 (availability, freq, voltage)
+ *	|	|- opp 2 ..
+ *	...	...
+ *	|	`- opp n ..
+ *	|- device 2 (represents the next voltage domain)
+ *	...
+ *	`- device m (represents mth voltage domain)
+ * device 1, 2.. are represented by opp_table structure while each opp
+ * is represented by the opp structure.
+ */
+
+/**
+ * struct dev_pm_opp - Generic OPP description structure
+ * @node:	opp table node. The nodes are maintained throughout the lifetime
+ *		of boot. It is expected only an optimal set of OPPs are
+ *		added to the library by the SoC framework.
+ *		RCU usage: opp table is traversed with RCU locks. node
+ *		modification is possible realtime, hence the modifications
+ *		are protected by the opp_table_lock for integrity.
+ *		IMPORTANT: the opp nodes should be maintained in increasing
+ *		order.
+ * @available:	true/false - marks if this OPP as available or not
+ * @dynamic:	not-created from static DT entries.
+ * @turbo:	true if turbo (boost) OPP
+ * @suspend:	true if suspend OPP
+ * @rate:	Frequency in hertz
+ * @u_volt:	Target voltage in microvolts corresponding to this OPP
+ * @u_volt_min:	Minimum voltage in microvolts corresponding to this OPP
+ * @u_volt_max:	Maximum voltage in microvolts corresponding to this OPP
+ * @u_amp:	Maximum current drawn by the device in microamperes
+ * @clock_latency_ns: Latency (in nanoseconds) of switching to this OPP's
+ *		frequency from any other OPP's frequency.
+ * @opp_table:	points back to the opp_table struct this opp belongs to
+ * @rcu_head:	RCU callback head used for deferred freeing
+ * @np:		OPP's device node.
+ * @dentry:	debugfs dentry pointer (per opp)
+ *
+ * This structure stores the OPP information for a given device.
+ */
+struct dev_pm_opp {
+	struct list_head node;
+
+	bool available;
+	bool dynamic;
+	bool turbo;
+	bool suspend;
+	unsigned long rate;
+
+	unsigned long u_volt;
+	unsigned long u_volt_min;
+	unsigned long u_volt_max;
+	unsigned long u_amp;
+	unsigned long clock_latency_ns;
+
+	struct opp_table *opp_table;
+	struct rcu_head rcu_head;
+
+	struct device_node *np;
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *dentry;
+#endif
+};
+
+/**
+ * struct opp_device - devices managed by 'struct opp_table'
+ * @node:	list node
+ * @dev:	device to which the struct object belongs
+ * @rcu_head:	RCU callback head used for deferred freeing
+ * @dentry:	debugfs dentry pointer (per device)
+ *
+ * This is an internal data structure maintaining the devices that are managed
+ * by 'struct opp_table'.
+ */
+struct opp_device {
+	struct list_head node;
+	const struct device *dev;
+	struct rcu_head rcu_head;
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *dentry;
+#endif
+};
+
+/**
+ * struct opp_table - Device opp structure
+ * @node:	table node - contains the devices with OPPs that
+ *		have been registered. Nodes once added are not modified in this
+ *		table.
+ *		RCU usage: nodes are not modified in the table of opp_table,
+ *		however addition is possible and is secured by opp_table_lock
+ * @srcu_head:	notifier head to notify the OPP availability changes.
+ * @rcu_head:	RCU callback head used for deferred freeing
+ * @dev_list:	list of devices that share these OPPs
+ * @opp_list:	table of opps
+ * @np:		struct device_node pointer for opp's DT node.
+ * @clock_latency_ns_max: Max clock latency in nanoseconds.
+ * @shared_opp: OPP is shared between multiple devices.
+ * @suspend_opp: Pointer to OPP to be used during device suspend.
+ * @supported_hw: Array of version number to support.
+ * @supported_hw_count: Number of elements in supported_hw array.
+ * @prop_name: A name to postfix to many DT properties, while parsing them.
+ * @clk: Device's clock handle
+ * @regulator: Supply regulator
+ * @dentry:	debugfs dentry pointer of the real device directory (not links).
+ * @dentry_name: Name of the real dentry.
+ *
+ * @voltage_tolerance_v1: In percentage, for v1 bindings only.
+ *
+ * This is an internal data structure maintaining the link to opps attached to
+ * a device. This structure is not meant to be shared to users as it is
+ * meant for book keeping and private to OPP library.
+ *
+ * Because the opp structures can be used from both rcu and srcu readers, we
+ * need to wait for the grace period of both of them before freeing any
+ * resources. And so we have used kfree_rcu() from within call_srcu() handlers.
+ */
+struct opp_table {
+	struct list_head node;
+
+	struct srcu_notifier_head srcu_head;
+	struct rcu_head rcu_head;
+	struct list_head dev_list;
+	struct list_head opp_list;
+
+	struct device_node *np;
+	unsigned long clock_latency_ns_max;
+
+	/* For backward compatibility with v1 bindings */
+	unsigned int voltage_tolerance_v1;
+
+	bool shared_opp;
+	struct dev_pm_opp *suspend_opp;
+
+	unsigned int *supported_hw;
+	unsigned int supported_hw_count;
+	const char *prop_name;
+	struct clk *clk;
+	struct regulator *regulator;
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *dentry;
+	char dentry_name[NAME_MAX];
+#endif
+};
+
+/* Routines internal to opp core */
+struct opp_table *_find_opp_table(struct device *dev);
+struct opp_device *_add_opp_dev(const struct device *dev, struct opp_table *opp_table);
+struct device_node *_of_get_opp_desc_node(struct device *dev);
+
+#ifdef CONFIG_DEBUG_FS
+void opp_debug_remove_one(struct dev_pm_opp *opp);
+int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table);
+int opp_debug_register(struct opp_device *opp_dev, struct opp_table *opp_table);
+void opp_debug_unregister(struct opp_device *opp_dev, struct opp_table *opp_table);
+#else
+static inline void opp_debug_remove_one(struct dev_pm_opp *opp) {}
+
+static inline int opp_debug_create_one(struct dev_pm_opp *opp,
+				       struct opp_table *opp_table)
+{ return 0; }
+static inline int opp_debug_register(struct opp_device *opp_dev,
+				     struct opp_table *opp_table)
+{ return 0; }
+
+static inline void opp_debug_unregister(struct opp_device *opp_dev,
+					struct opp_table *opp_table)
+{ }
+#endif		/* DEBUG_FS */
+
+#endif		/* __DRIVER_OPP_H__ */
diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h
index a13587b5c2be..03558c1d9914 100644
--- a/drivers/base/regmap/internal.h
+++ b/drivers/base/regmap/internal.h
@@ -122,16 +122,16 @@ struct regmap {
 	unsigned int num_reg_defaults_raw;
 
 	/* if set, only the cache is modified not the HW */
-	u32 cache_only;
+	bool cache_only;
 	/* if set, only the HW is modified not the cache */
-	u32 cache_bypass;
+	bool cache_bypass;
 	/* if set, remember to free reg_defaults_raw */
 	bool cache_free;
 
 	struct reg_default *reg_defaults;
 	const void *reg_defaults_raw;
 	void *cache;
-	u32 cache_dirty;
+	bool cache_dirty;
 
 	struct reg_default *patch;
 	int patch_regs;
diff --git a/drivers/base/regmap/regcache-lzo.c b/drivers/base/regmap/regcache-lzo.c
index 2d53f6f138e1..736e0d378567 100644
--- a/drivers/base/regmap/regcache-lzo.c
+++ b/drivers/base/regmap/regcache-lzo.c
@@ -355,9 +355,9 @@ static int regcache_lzo_sync(struct regmap *map, unsigned int min,
 		if (ret > 0 && val == map->reg_defaults[ret].def)
 			continue;
 
-		map->cache_bypass = 1;
+		map->cache_bypass = true;
 		ret = _regmap_write(map, i, val);
-		map->cache_bypass = 0;
+		map->cache_bypass = false;
 		if (ret)
 			return ret;
 		dev_dbg(map->dev, "Synced register %#x, value %#x\n",
diff --git a/drivers/base/regmap/regcache.c b/drivers/base/regmap/regcache.c
index 7eb7b3b98794..d58d74dc372c 100644
--- a/drivers/base/regmap/regcache.c
+++ b/drivers/base/regmap/regcache.c
@@ -54,11 +54,11 @@ static int regcache_hw_init(struct regmap *map)
 		return -ENOMEM;
 
 	if (!map->reg_defaults_raw) {
-		u32 cache_bypass = map->cache_bypass;
+		bool cache_bypass = map->cache_bypass;
 		dev_warn(map->dev, "No cache defaults, reading back from HW\n");
 
 		/* Bypass the cache access till data read from HW*/
-		map->cache_bypass = 1;
+		map->cache_bypass = true;
 		tmp_buf = kmalloc(map->cache_size_raw, GFP_KERNEL);
 		if (!tmp_buf) {
 			ret = -ENOMEM;
@@ -271,9 +271,9 @@ static int regcache_default_sync(struct regmap *map, unsigned int min,
 		if (ret >= 0 && val == map->reg_defaults[ret].def)
 			continue;
 
-		map->cache_bypass = 1;
+		map->cache_bypass = true;
 		ret = _regmap_write(map, reg, val);
-		map->cache_bypass = 0;
+		map->cache_bypass = false;
 		if (ret) {
 			dev_err(map->dev, "Unable to sync register %#x. %d\n",
 				reg, ret);
@@ -301,7 +301,7 @@ int regcache_sync(struct regmap *map)
 	int ret = 0;
 	unsigned int i;
 	const char *name;
-	unsigned int bypass;
+	bool bypass;
 
 	BUG_ON(!map->cache_ops);
 
@@ -319,7 +319,7 @@ int regcache_sync(struct regmap *map)
 	map->async = true;
 
 	/* Apply any patch first */
-	map->cache_bypass = 1;
+	map->cache_bypass = true;
 	for (i = 0; i < map->patch_regs; i++) {
 		ret = _regmap_write(map, map->patch[i].reg, map->patch[i].def);
 		if (ret != 0) {
@@ -328,7 +328,7 @@ int regcache_sync(struct regmap *map)
 			goto out;
 		}
 	}
-	map->cache_bypass = 0;
+	map->cache_bypass = false;
 
 	if (map->cache_ops->sync)
 		ret = map->cache_ops->sync(map, 0, map->max_register);
@@ -369,7 +369,7 @@ int regcache_sync_region(struct regmap *map, unsigned int min,
 {
 	int ret = 0;
 	const char *name;
-	unsigned int bypass;
+	bool bypass;
 
 	BUG_ON(!map->cache_ops);
 
@@ -619,11 +619,11 @@ static int regcache_sync_block_single(struct regmap *map, void *block,
 		if (ret >= 0 && val == map->reg_defaults[ret].def)
 			continue;
 
-		map->cache_bypass = 1;
+		map->cache_bypass = true;
 
 		ret = _regmap_write(map, regtmp, val);
 
-		map->cache_bypass = 0;
+		map->cache_bypass = false;
 		if (ret != 0) {
 			dev_err(map->dev, "Unable to sync register %#x. %d\n",
 				regtmp, ret);
@@ -650,14 +650,14 @@ static int regcache_sync_block_raw_flush(struct regmap *map, const void **data,
 	dev_dbg(map->dev, "Writing %zu bytes for %d registers from 0x%x-0x%x\n",
 		count * val_bytes, count, base, cur - map->reg_stride);
 
-	map->cache_bypass = 1;
+	map->cache_bypass = true;
 
 	ret = _regmap_raw_write(map, base, *data, count * val_bytes);
 	if (ret)
 		dev_err(map->dev, "Unable to sync registers %#x-%#x. %d\n",
 			base, cur - map->reg_stride, ret);
 
-	map->cache_bypass = 0;
+	map->cache_bypass = false;
 
 	*data = NULL;
 
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index b905e9888b88..efd19c2da9c2 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -38,6 +38,7 @@
 #include <linux/mutex.h>
 #include <linux/major.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/genhd.h>
 #include <linux/idr.h>
 #include <net/tcp.h>
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 81fde9ef7f8e..a1518539b858 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2359,7 +2359,7 @@ static void drbd_cleanup(void)
  * @congested_data:	User data
  * @bdi_bits:		Bits the BDI flusher thread is currently interested in
  *
- * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
+ * Returns 1<<WB_async_congested and/or 1<<WB_sync_congested if we are congested.
  */
 static int drbd_congested(void *congested_data, int bdi_bits)
 {
@@ -2376,14 +2376,14 @@ static int drbd_congested(void *congested_data, int bdi_bits)
 	}
 
 	if (test_bit(CALLBACK_PENDING, &first_peer_device(device)->connection->flags)) {
-		r |= (1 << BDI_async_congested);
+		r |= (1 << WB_async_congested);
 		/* Without good local data, we would need to read from remote,
 		 * and that would need the worker thread as well, which is
 		 * currently blocked waiting for that usermode helper to
 		 * finish.
 		 */
 		if (!get_ldev_if_state(device, D_UP_TO_DATE))
-			r |= (1 << BDI_sync_congested);
+			r |= (1 << WB_sync_congested);
 		else
 			put_ldev(device);
 		r &= bdi_bits;
@@ -2399,9 +2399,9 @@ static int drbd_congested(void *congested_data, int bdi_bits)
 			reason = 'b';
 	}
 
-	if (bdi_bits & (1 << BDI_async_congested) &&
+	if (bdi_bits & (1 << WB_async_congested) &&
 	    test_bit(NET_CONGESTED, &first_peer_device(device)->connection->flags)) {
-		r |= (1 << BDI_async_congested);
+		r |= (1 << WB_async_congested);
 		reason = reason == 'b' ? 'a' : 'n';
 	}
 
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 09e628dafd9d..4c20c228184c 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -61,6 +61,7 @@
 #include <linux/freezer.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
+#include <linux/backing-dev.h>
 #include <scsi/scsi_cmnd.h>
 #include <scsi/scsi_ioctl.h>
 #include <scsi/scsi.h>
diff --git a/drivers/char/raw.c b/drivers/char/raw.c
index 5fc291c6157e..60316fbaf295 100644
--- a/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@ -12,6 +12,7 @@
 #include <linux/fs.h>
 #include <linux/major.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/module.h>
 #include <linux/raw.h>
 #include <linux/capability.h>
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index cdce92ae2e8b..476079fccf33 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -1,6 +1,5 @@
 # CPUfreq core
 obj-$(CONFIG_CPU_FREQ)			+= cpufreq.o freq_table.o
-obj-$(CONFIG_PM_OPP)			+= cpufreq_opp.o
 
 # CPUfreq stats
 obj-$(CONFIG_CPU_FREQ_STAT)             += cpufreq_stats.o
diff --git a/drivers/cpufreq/arm_big_little.h b/drivers/cpufreq/arm_big_little.h
index a211f7db9d32..b88889d9387e 100644
--- a/drivers/cpufreq/arm_big_little.h
+++ b/drivers/cpufreq/arm_big_little.h
@@ -28,7 +28,7 @@ struct cpufreq_arm_bL_ops {
 
 	/*
 	 * This must set opp table for cpu_dev in a similar way as done by
-	 * of_init_opp_table().
+	 * dev_pm_opp_of_add_table().
 	 */
 	int (*init_opp_table)(struct device *cpu_dev);
 
diff --git a/drivers/cpufreq/arm_big_little_dt.c b/drivers/cpufreq/arm_big_little_dt.c
index 36d91dba2965..16ddeefe9443 100644
--- a/drivers/cpufreq/arm_big_little_dt.c
+++ b/drivers/cpufreq/arm_big_little_dt.c
@@ -54,7 +54,7 @@ static int dt_init_opp_table(struct device *cpu_dev)
 		return -ENOENT;
 	}
 
-	ret = of_init_opp_table(cpu_dev);
+	ret = dev_pm_opp_of_add_table(cpu_dev);
 	of_node_put(np);
 
 	return ret;
@@ -82,7 +82,7 @@ static struct cpufreq_arm_bL_ops dt_bL_ops = {
 	.name	= "dt-bl",
 	.get_transition_latency = dt_get_transition_latency,
 	.init_opp_table = dt_init_opp_table,
-	.free_opp_table = of_free_opp_table,
+	.free_opp_table = dev_pm_opp_of_remove_table,
 };
 
 static int generic_bL_probe(struct platform_device *pdev)
diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index 663045ce6fac..64ccbc4216be 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -31,185 +31,180 @@
 
 struct private_data {
 	struct device *cpu_dev;
-	struct regulator *cpu_reg;
 	struct thermal_cooling_device *cdev;
-	unsigned int voltage_tolerance; /* in percentage */
+	const char *reg_name;
+};
+
+static struct freq_attr *cpufreq_dt_attr[] = {
+	&cpufreq_freq_attr_scaling_available_freqs,
+	NULL,   /* Extra space for boost-attr if required */
+	NULL,
 };
 
 static int set_target(struct cpufreq_policy *policy, unsigned int index)
 {
-	struct dev_pm_opp *opp;
-	struct cpufreq_frequency_table *freq_table = policy->freq_table;
-	struct clk *cpu_clk = policy->clk;
 	struct private_data *priv = policy->driver_data;
-	struct device *cpu_dev = priv->cpu_dev;
-	struct regulator *cpu_reg = priv->cpu_reg;
-	unsigned long volt = 0, volt_old = 0, tol = 0;
-	unsigned int old_freq, new_freq;
-	long freq_Hz, freq_exact;
-	int ret;
 
-	freq_Hz = clk_round_rate(cpu_clk, freq_table[index].frequency * 1000);
-	if (freq_Hz <= 0)
-		freq_Hz = freq_table[index].frequency * 1000;
-
-	freq_exact = freq_Hz;
-	new_freq = freq_Hz / 1000;
-	old_freq = clk_get_rate(cpu_clk) / 1000;
+	return dev_pm_opp_set_rate(priv->cpu_dev,
+				   policy->freq_table[index].frequency * 1000);
+}
 
-	if (!IS_ERR(cpu_reg)) {
-		unsigned long opp_freq;
+/*
+ * An earlier version of opp-v1 bindings used to name the regulator
+ * "cpu0-supply", we still need to handle that for backwards compatibility.
+ */
+static const char *find_supply_name(struct device *dev)
+{
+	struct device_node *np;
+	struct property *pp;
+	int cpu = dev->id;
+	const char *name = NULL;
 
-		rcu_read_lock();
-		opp = dev_pm_opp_find_freq_ceil(cpu_dev, &freq_Hz);
-		if (IS_ERR(opp)) {
-			rcu_read_unlock();
-			dev_err(cpu_dev, "failed to find OPP for %ld\n",
-				freq_Hz);
-			return PTR_ERR(opp);
-		}
-		volt = dev_pm_opp_get_voltage(opp);
-		opp_freq = dev_pm_opp_get_freq(opp);
-		rcu_read_unlock();
-		tol = volt * priv->voltage_tolerance / 100;
-		volt_old = regulator_get_voltage(cpu_reg);
-		dev_dbg(cpu_dev, "Found OPP: %ld kHz, %ld uV\n",
-			opp_freq / 1000, volt);
-	}
+	np = of_node_get(dev->of_node);
 
-	dev_dbg(cpu_dev, "%u MHz, %ld mV --> %u MHz, %ld mV\n",
-		old_freq / 1000, (volt_old > 0) ? volt_old / 1000 : -1,
-		new_freq / 1000, volt ? volt / 1000 : -1);
+	/* This must be valid for sure */
+	if (WARN_ON(!np))
+		return NULL;
 
-	/* scaling up?  scale voltage before frequency */
-	if (!IS_ERR(cpu_reg) && new_freq > old_freq) {
-		ret = regulator_set_voltage_tol(cpu_reg, volt, tol);
-		if (ret) {
-			dev_err(cpu_dev, "failed to scale voltage up: %d\n",
-				ret);
-			return ret;
+	/* Try "cpu0" for older DTs */
+	if (!cpu) {
+		pp = of_find_property(np, "cpu0-supply", NULL);
+		if (pp) {
+			name = "cpu0";
+			goto node_put;
 		}
 	}
 
-	ret = clk_set_rate(cpu_clk, freq_exact);
-	if (ret) {
-		dev_err(cpu_dev, "failed to set clock rate: %d\n", ret);
-		if (!IS_ERR(cpu_reg) && volt_old > 0)
-			regulator_set_voltage_tol(cpu_reg, volt_old, tol);
-		return ret;
+	pp = of_find_property(np, "cpu-supply", NULL);
+	if (pp) {
+		name = "cpu";
+		goto node_put;
 	}
 
-	/* scaling down?  scale voltage after frequency */
-	if (!IS_ERR(cpu_reg) && new_freq < old_freq) {
-		ret = regulator_set_voltage_tol(cpu_reg, volt, tol);
-		if (ret) {
-			dev_err(cpu_dev, "failed to scale voltage down: %d\n",
-				ret);
-			clk_set_rate(cpu_clk, old_freq * 1000);
-		}
-	}
-
-	return ret;
+	dev_dbg(dev, "no regulator for cpu%d\n", cpu);
+node_put:
+	of_node_put(np);
+	return name;
 }
 
-static int allocate_resources(int cpu, struct device **cdev,
-			      struct regulator **creg, struct clk **cclk)
+static int resources_available(void)
 {
 	struct device *cpu_dev;
 	struct regulator *cpu_reg;
 	struct clk *cpu_clk;
 	int ret = 0;
-	char *reg_cpu0 = "cpu0", *reg_cpu = "cpu", *reg;
+	const char *name;
 
-	cpu_dev = get_cpu_device(cpu);
+	cpu_dev = get_cpu_device(0);
 	if (!cpu_dev) {
-		pr_err("failed to get cpu%d device\n", cpu);
+		pr_err("failed to get cpu0 device\n");
 		return -ENODEV;
 	}
 
-	/* Try "cpu0" for older DTs */
-	if (!cpu)
-		reg = reg_cpu0;
-	else
-		reg = reg_cpu;
-
-try_again:
-	cpu_reg = regulator_get_optional(cpu_dev, reg);
-	if (IS_ERR(cpu_reg)) {
+	cpu_clk = clk_get(cpu_dev, NULL);
+	ret = PTR_ERR_OR_ZERO(cpu_clk);
+	if (ret) {
 		/*
-		 * If cpu's regulator supply node is present, but regulator is
-		 * not yet registered, we should try defering probe.
+		 * If cpu's clk node is present, but clock is not yet
+		 * registered, we should try defering probe.
 		 */
-		if (PTR_ERR(cpu_reg) == -EPROBE_DEFER) {
-			dev_dbg(cpu_dev, "cpu%d regulator not ready, retry\n",
-				cpu);
-			return -EPROBE_DEFER;
-		}
-
-		/* Try with "cpu-supply" */
-		if (reg == reg_cpu0) {
-			reg = reg_cpu;
-			goto try_again;
-		}
+		if (ret == -EPROBE_DEFER)
+			dev_dbg(cpu_dev, "clock not ready, retry\n");
+		else
+			dev_err(cpu_dev, "failed to get clock: %d\n", ret);
 
-		dev_dbg(cpu_dev, "no regulator for cpu%d: %ld\n",
-			cpu, PTR_ERR(cpu_reg));
+		return ret;
 	}
 
-	cpu_clk = clk_get(cpu_dev, NULL);
-	if (IS_ERR(cpu_clk)) {
-		/* put regulator */
-		if (!IS_ERR(cpu_reg))
-			regulator_put(cpu_reg);
+	clk_put(cpu_clk);
 
-		ret = PTR_ERR(cpu_clk);
+	name = find_supply_name(cpu_dev);
+	/* Platform doesn't require regulator */
+	if (!name)
+		return 0;
 
+	cpu_reg = regulator_get_optional(cpu_dev, name);
+	ret = PTR_ERR_OR_ZERO(cpu_reg);
+	if (ret) {
 		/*
-		 * If cpu's clk node is present, but clock is not yet
-		 * registered, we should try defering probe.
+		 * If cpu's regulator supply node is present, but regulator is
+		 * not yet registered, we should try defering probe.
 		 */
 		if (ret == -EPROBE_DEFER)
-			dev_dbg(cpu_dev, "cpu%d clock not ready, retry\n", cpu);
+			dev_dbg(cpu_dev, "cpu0 regulator not ready, retry\n");
 		else
-			dev_err(cpu_dev, "failed to get cpu%d clock: %d\n", cpu,
-				ret);
-	} else {
-		*cdev = cpu_dev;
-		*creg = cpu_reg;
-		*cclk = cpu_clk;
+			dev_dbg(cpu_dev, "no regulator for cpu0: %d\n", ret);
+
+		return ret;
 	}
 
-	return ret;
+	regulator_put(cpu_reg);
+	return 0;
 }
 
 static int cpufreq_init(struct cpufreq_policy *policy)
 {
-	struct cpufreq_dt_platform_data *pd;
 	struct cpufreq_frequency_table *freq_table;
-	struct device_node *np;
 	struct private_data *priv;
 	struct device *cpu_dev;
-	struct regulator *cpu_reg;
 	struct clk *cpu_clk;
-	unsigned long min_uV = ~0, max_uV = 0;
+	struct dev_pm_opp *suspend_opp;
 	unsigned int transition_latency;
+	bool opp_v1 = false;
+	const char *name;
 	int ret;
 
-	ret = allocate_resources(policy->cpu, &cpu_dev, &cpu_reg, &cpu_clk);
-	if (ret) {
-		pr_err("%s: Failed to allocate resources: %d\n", __func__, ret);
+	cpu_dev = get_cpu_device(policy->cpu);
+	if (!cpu_dev) {
+		pr_err("failed to get cpu%d device\n", policy->cpu);
+		return -ENODEV;
+	}
+
+	cpu_clk = clk_get(cpu_dev, NULL);
+	if (IS_ERR(cpu_clk)) {
+		ret = PTR_ERR(cpu_clk);
+		dev_err(cpu_dev, "%s: failed to get clk: %d\n", __func__, ret);
 		return ret;
 	}
 
-	np = of_node_get(cpu_dev->of_node);
-	if (!np) {
-		dev_err(cpu_dev, "failed to find cpu%d node\n", policy->cpu);
-		ret = -ENOENT;
-		goto out_put_reg_clk;
+	/* Get OPP-sharing information from "operating-points-v2" bindings */
+	ret = dev_pm_opp_of_get_sharing_cpus(cpu_dev, policy->cpus);
+	if (ret) {
+		/*
+		 * operating-points-v2 not supported, fallback to old method of
+		 * finding shared-OPPs for backward compatibility.
+		 */
+		if (ret == -ENOENT)
+			opp_v1 = true;
+		else
+			goto out_put_clk;
+	}
+
+	/*
+	 * OPP layer will be taking care of regulators now, but it needs to know
+	 * the name of the regulator first.
+	 */
+	name = find_supply_name(cpu_dev);
+	if (name) {
+		ret = dev_pm_opp_set_regulator(cpu_dev, name);
+		if (ret) {
+			dev_err(cpu_dev, "Failed to set regulator for cpu%d: %d\n",
+				policy->cpu, ret);
+			goto out_put_clk;
+		}
 	}
 
-	/* OPPs might be populated at runtime, don't check for error here */
-	of_init_opp_table(cpu_dev);
+	/*
+	 * Initialize OPP tables for all policy->cpus. They will be shared by
+	 * all CPUs which have marked their CPUs shared with OPP bindings.
+	 *
+	 * For platforms not using operating-points-v2 bindings, we do this
+	 * before updating policy->cpus. Otherwise, we will end up creating
+	 * duplicate OPPs for policy->cpus.
+	 *
+	 * OPPs might be populated at runtime, don't check for error here
+	 */
+	dev_pm_opp_of_cpumask_add_table(policy->cpus);
 
 	/*
 	 * But we need OPP table to function so if it is not there let's
@@ -217,74 +212,51 @@ static int cpufreq_init(struct cpufreq_policy *policy)
 	 */
 	ret = dev_pm_opp_get_opp_count(cpu_dev);
 	if (ret <= 0) {
-		pr_debug("OPP table is not ready, deferring probe\n");
+		dev_dbg(cpu_dev, "OPP table is not ready, deferring probe\n");
 		ret = -EPROBE_DEFER;
 		goto out_free_opp;
 	}
 
-	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
-	if (!priv) {
-		ret = -ENOMEM;
-		goto out_free_opp;
-	}
+	if (opp_v1) {
+		struct cpufreq_dt_platform_data *pd = cpufreq_get_driver_data();
 
-	of_property_read_u32(np, "voltage-tolerance", &priv->voltage_tolerance);
-
-	if (of_property_read_u32(np, "clock-latency", &transition_latency))
-		transition_latency = CPUFREQ_ETERNAL;
-
-	if (!IS_ERR(cpu_reg)) {
-		unsigned long opp_freq = 0;
+		if (!pd || !pd->independent_clocks)
+			cpumask_setall(policy->cpus);
 
 		/*
-		 * Disable any OPPs where the connected regulator isn't able to
-		 * provide the specified voltage and record minimum and maximum
-		 * voltage levels.
+		 * OPP tables are initialized only for policy->cpu, do it for
+		 * others as well.
 		 */
-		while (1) {
-			struct dev_pm_opp *opp;
-			unsigned long opp_uV, tol_uV;
-
-			rcu_read_lock();
-			opp = dev_pm_opp_find_freq_ceil(cpu_dev, &opp_freq);
-			if (IS_ERR(opp)) {
-				rcu_read_unlock();
-				break;
-			}
-			opp_uV = dev_pm_opp_get_voltage(opp);
-			rcu_read_unlock();
-
-			tol_uV = opp_uV * priv->voltage_tolerance / 100;
-			if (regulator_is_supported_voltage(cpu_reg,
-							   opp_uV - tol_uV,
-							   opp_uV + tol_uV)) {
-				if (opp_uV < min_uV)
-					min_uV = opp_uV;
-				if (opp_uV > max_uV)
-					max_uV = opp_uV;
-			} else {
-				dev_pm_opp_disable(cpu_dev, opp_freq);
-			}
-
-			opp_freq++;
-		}
+		ret = dev_pm_opp_set_sharing_cpus(cpu_dev, policy->cpus);
+		if (ret)
+			dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n",
+				__func__, ret);
+	}
 
-		ret = regulator_set_voltage_time(cpu_reg, min_uV, max_uV);
-		if (ret > 0)
-			transition_latency += ret * 1000;
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv) {
+		ret = -ENOMEM;
+		goto out_free_opp;
 	}
 
+	priv->reg_name = name;
+
 	ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &freq_table);
 	if (ret) {
-		pr_err("failed to init cpufreq table: %d\n", ret);
+		dev_err(cpu_dev, "failed to init cpufreq table: %d\n", ret);
 		goto out_free_priv;
 	}
 
 	priv->cpu_dev = cpu_dev;
-	priv->cpu_reg = cpu_reg;
 	policy->driver_data = priv;
-
 	policy->clk = cpu_clk;
+
+	rcu_read_lock();
+	suspend_opp = dev_pm_opp_get_suspend_opp(cpu_dev);
+	if (suspend_opp)
+		policy->suspend_freq = dev_pm_opp_get_freq(suspend_opp) / 1000;
+	rcu_read_unlock();
+
 	ret = cpufreq_table_validate_and_show(policy, freq_table);
 	if (ret) {
 		dev_err(cpu_dev, "%s: invalid frequency table: %d\n", __func__,
@@ -292,13 +264,20 @@ static int cpufreq_init(struct cpufreq_policy *policy)
 		goto out_free_cpufreq_table;
 	}
 
-	policy->cpuinfo.transition_latency = transition_latency;
+	/* Support turbo/boost mode */
+	if (policy_has_boost_freq(policy)) {
+		/* This gets disabled by core on driver unregister */
+		ret = cpufreq_enable_boost_support();
+		if (ret)
+			goto out_free_cpufreq_table;
+		cpufreq_dt_attr[1] = &cpufreq_freq_attr_scaling_boost_freqs;
+	}
 
-	pd = cpufreq_get_driver_data();
-	if (!pd || !pd->independent_clocks)
-		cpumask_setall(policy->cpus);
+	transition_latency = dev_pm_opp_get_max_transition_latency(cpu_dev);
+	if (!transition_latency)
+		transition_latency = CPUFREQ_ETERNAL;
 
-	of_node_put(np);
+	policy->cpuinfo.transition_latency = transition_latency;
 
 	return 0;
 
@@ -307,12 +286,11 @@ out_free_cpufreq_table:
 out_free_priv:
 	kfree(priv);
 out_free_opp:
-	of_free_opp_table(cpu_dev);
-	of_node_put(np);
-out_put_reg_clk:
+	dev_pm_opp_of_cpumask_remove_table(policy->cpus);
+	if (name)
+		dev_pm_opp_put_regulator(cpu_dev);
+out_put_clk:
 	clk_put(cpu_clk);
-	if (!IS_ERR(cpu_reg))
-		regulator_put(cpu_reg);
 
 	return ret;
 }
@@ -323,10 +301,11 @@ static int cpufreq_exit(struct cpufreq_policy *policy)
 
 	cpufreq_cooling_unregister(priv->cdev);
 	dev_pm_opp_free_cpufreq_table(priv->cpu_dev, &policy->freq_table);
-	of_free_opp_table(priv->cpu_dev);
+	dev_pm_opp_of_cpumask_remove_table(policy->related_cpus);
+	if (priv->reg_name)
+		dev_pm_opp_put_regulator(priv->cpu_dev);
+
 	clk_put(policy->clk);
-	if (!IS_ERR(priv->cpu_reg))
-		regulator_put(priv->cpu_reg);
 	kfree(priv);
 
 	return 0;
@@ -368,14 +347,12 @@ static struct cpufreq_driver dt_cpufreq_driver = {
 	.exit = cpufreq_exit,
 	.ready = cpufreq_ready,
 	.name = "cpufreq-dt",
-	.attr = cpufreq_generic_attr,
+	.attr = cpufreq_dt_attr,
+	.suspend = cpufreq_generic_suspend,
 };
 
 static int dt_cpufreq_probe(struct platform_device *pdev)
 {
-	struct device *cpu_dev;
-	struct regulator *cpu_reg;
-	struct clk *cpu_clk;
 	int ret;
 
 	/*
@@ -385,19 +362,15 @@ static int dt_cpufreq_probe(struct platform_device *pdev)
 	 *
 	 * FIXME: Is checking this only for CPU0 sufficient ?
 	 */
-	ret = allocate_resources(0, &cpu_dev, &cpu_reg, &cpu_clk);
+	ret = resources_available();
 	if (ret)
 		return ret;
 
-	clk_put(cpu_clk);
-	if (!IS_ERR(cpu_reg))
-		regulator_put(cpu_reg);
-
 	dt_cpufreq_driver.driver_data = dev_get_platdata(&pdev->dev);
 
 	ret = cpufreq_register_driver(&dt_cpufreq_driver);
 	if (ret)
-		dev_err(cpu_dev, "failed register driver: %d\n", ret);
+		dev_err(&pdev->dev, "failed register driver: %d\n", ret);
 
 	return ret;
 }
@@ -417,6 +390,7 @@ static struct platform_driver dt_cpufreq_platdrv = {
 };
 module_platform_driver(dt_cpufreq_platdrv);
 
+MODULE_ALIAS("platform:cpufreq-dt");
 MODULE_AUTHOR("Viresh Kumar <viresh.kumar@linaro.org>");
 MODULE_AUTHOR("Shawn Guo <shawn.guo@linaro.org>");
 MODULE_DESCRIPTION("Generic cpufreq driver");
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 8ae655c364f4..aeab818f24b3 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2388,6 +2388,49 @@ int cpufreq_boost_supported(void)
 }
 EXPORT_SYMBOL_GPL(cpufreq_boost_supported);
 
+static int create_boost_sysfs_file(void)
+{
+	int ret;
+
+	if (!cpufreq_boost_supported())
+		return 0;
+
+	/*
+	 * Check if driver provides function to enable boost -
+	 * if not, use cpufreq_boost_set_sw as default
+	 */
+	if (!cpufreq_driver->set_boost)
+		cpufreq_driver->set_boost = cpufreq_boost_set_sw;
+
+	ret = cpufreq_sysfs_create_file(&boost.attr);
+	if (ret)
+		pr_err("%s: cannot register global BOOST sysfs file\n",
+		       __func__);
+
+	return ret;
+}
+
+static void remove_boost_sysfs_file(void)
+{
+	if (cpufreq_boost_supported())
+		cpufreq_sysfs_remove_file(&boost.attr);
+}
+
+int cpufreq_enable_boost_support(void)
+{
+	if (!cpufreq_driver)
+		return -EINVAL;
+
+	if (cpufreq_boost_supported())
+		return 0;
+
+	cpufreq_driver->boost_supported = true;
+
+	/* This will get removed on driver unregister */
+	return create_boost_sysfs_file();
+}
+EXPORT_SYMBOL_GPL(cpufreq_enable_boost_support);
+
 int cpufreq_boost_enabled(void)
 {
 	return cpufreq_driver->boost_enabled;
@@ -2437,21 +2480,9 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
 	if (driver_data->setpolicy)
 		driver_data->flags |= CPUFREQ_CONST_LOOPS;
 
-	if (cpufreq_boost_supported()) {
-		/*
-		 * Check if driver provides function to enable boost -
-		 * if not, use cpufreq_boost_set_sw as default
-		 */
-		if (!cpufreq_driver->set_boost)
-			cpufreq_driver->set_boost = cpufreq_boost_set_sw;
-
-		ret = cpufreq_sysfs_create_file(&boost.attr);
-		if (ret) {
-			pr_err("%s: cannot register global BOOST sysfs file\n",
-			       __func__);
-			goto err_null_driver;
-		}
-	}
+	ret = create_boost_sysfs_file();
+	if (ret)
+		goto err_null_driver;
 
 	ret = subsys_interface_register(&cpufreq_interface);
 	if (ret)
@@ -2472,8 +2503,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
 err_if_unreg:
 	subsys_interface_unregister(&cpufreq_interface);
 err_boost_unreg:
-	if (cpufreq_boost_supported())
-		cpufreq_sysfs_remove_file(&boost.attr);
+	remove_boost_sysfs_file();
 err_null_driver:
 	write_lock_irqsave(&cpufreq_driver_lock, flags);
 	cpufreq_driver = NULL;
@@ -2500,9 +2530,7 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver)
 	pr_debug("unregistering driver %s\n", driver->name);
 
 	subsys_interface_unregister(&cpufreq_interface);
-	if (cpufreq_boost_supported())
-		cpufreq_sysfs_remove_file(&boost.attr);
-
+	remove_boost_sysfs_file();
 	unregister_hotcpu_notifier(&cpufreq_cpu_notifier);
 
 	down_write(&cpufreq_rwsem);
diff --git a/drivers/cpufreq/cpufreq_opp.c b/drivers/cpufreq/cpufreq_opp.c
deleted file mode 100644
index 773bcde893c0..000000000000
--- a/drivers/cpufreq/cpufreq_opp.c
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Generic OPP helper interface for CPUFreq drivers
- *
- * Copyright (C) 2009-2014 Texas Instruments Incorporated.
- *	Nishanth Menon
- *	Romit Dasgupta
- *	Kevin Hilman
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/cpufreq.h>
-#include <linux/device.h>
-#include <linux/err.h>
-#include <linux/errno.h>
-#include <linux/export.h>
-#include <linux/kernel.h>
-#include <linux/pm_opp.h>
-#include <linux/rcupdate.h>
-#include <linux/slab.h>
-
-/**
- * dev_pm_opp_init_cpufreq_table() - create a cpufreq table for a device
- * @dev:	device for which we do this operation
- * @table:	Cpufreq table returned back to caller
- *
- * Generate a cpufreq table for a provided device- this assumes that the
- * opp list is already initialized and ready for usage.
- *
- * This function allocates required memory for the cpufreq table. It is
- * expected that the caller does the required maintenance such as freeing
- * the table as required.
- *
- * Returns -EINVAL for bad pointers, -ENODEV if the device is not found, -ENOMEM
- * if no memory available for the operation (table is not populated), returns 0
- * if successful and table is populated.
- *
- * WARNING: It is  important for the callers to ensure refreshing their copy of
- * the table if any of the mentioned functions have been invoked in the interim.
- *
- * Locking: The internal device_opp and opp structures are RCU protected.
- * Since we just use the regular accessor functions to access the internal data
- * structures, we use RCU read lock inside this function. As a result, users of
- * this function DONOT need to use explicit locks for invoking.
- */
-int dev_pm_opp_init_cpufreq_table(struct device *dev,
-				  struct cpufreq_frequency_table **table)
-{
-	struct dev_pm_opp *opp;
-	struct cpufreq_frequency_table *freq_table = NULL;
-	int i, max_opps, ret = 0;
-	unsigned long rate;
-
-	rcu_read_lock();
-
-	max_opps = dev_pm_opp_get_opp_count(dev);
-	if (max_opps <= 0) {
-		ret = max_opps ? max_opps : -ENODATA;
-		goto out;
-	}
-
-	freq_table = kcalloc((max_opps + 1), sizeof(*freq_table), GFP_ATOMIC);
-	if (!freq_table) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	for (i = 0, rate = 0; i < max_opps; i++, rate++) {
-		/* find next rate */
-		opp = dev_pm_opp_find_freq_ceil(dev, &rate);
-		if (IS_ERR(opp)) {
-			ret = PTR_ERR(opp);
-			goto out;
-		}
-		freq_table[i].driver_data = i;
-		freq_table[i].frequency = rate / 1000;
-	}
-
-	freq_table[i].driver_data = i;
-	freq_table[i].frequency = CPUFREQ_TABLE_END;
-
-	*table = &freq_table[0];
-
-out:
-	rcu_read_unlock();
-	if (ret)
-		kfree(freq_table);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_init_cpufreq_table);
-
-/**
- * dev_pm_opp_free_cpufreq_table() - free the cpufreq table
- * @dev:	device for which we do this operation
- * @table:	table to free
- *
- * Free up the table allocated by dev_pm_opp_init_cpufreq_table
- */
-void dev_pm_opp_free_cpufreq_table(struct device *dev,
-				   struct cpufreq_frequency_table **table)
-{
-	if (!table)
-		return;
-
-	kfree(*table);
-	*table = NULL;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_free_cpufreq_table);
diff --git a/drivers/cpufreq/exynos5440-cpufreq.c b/drivers/cpufreq/exynos5440-cpufreq.c
index 21a90ed7f3d8..c0f3373706f4 100644
--- a/drivers/cpufreq/exynos5440-cpufreq.c
+++ b/drivers/cpufreq/exynos5440-cpufreq.c
@@ -360,7 +360,7 @@ static int exynos_cpufreq_probe(struct platform_device *pdev)
 		goto err_put_node;
 	}
 
-	ret = of_init_opp_table(dvfs_info->dev);
+	ret = dev_pm_opp_of_add_table(dvfs_info->dev);
 	if (ret) {
 		dev_err(dvfs_info->dev, "failed to init OPP table: %d\n", ret);
 		goto err_put_node;
@@ -424,7 +424,7 @@ static int exynos_cpufreq_probe(struct platform_device *pdev)
 err_free_table:
 	dev_pm_opp_free_cpufreq_table(dvfs_info->dev, &dvfs_info->freq_table);
 err_free_opp:
-	of_free_opp_table(dvfs_info->dev);
+	dev_pm_opp_of_remove_table(dvfs_info->dev);
 err_put_node:
 	of_node_put(np);
 	dev_err(&pdev->dev, "%s: failed initialization\n", __func__);
@@ -435,7 +435,7 @@ static int exynos_cpufreq_remove(struct platform_device *pdev)
 {
 	cpufreq_unregister_driver(&exynos_driver);
 	dev_pm_opp_free_cpufreq_table(dvfs_info->dev, &dvfs_info->freq_table);
-	of_free_opp_table(dvfs_info->dev);
+	dev_pm_opp_of_remove_table(dvfs_info->dev);
 	return 0;
 }
 
diff --git a/drivers/cpufreq/freq_table.c b/drivers/cpufreq/freq_table.c
index df14766a8e06..d1a0cbfdba63 100644
--- a/drivers/cpufreq/freq_table.c
+++ b/drivers/cpufreq/freq_table.c
@@ -18,6 +18,21 @@
  *                     FREQUENCY TABLE HELPERS                       *
  *********************************************************************/
 
+bool policy_has_boost_freq(struct cpufreq_policy *policy)
+{
+	struct cpufreq_frequency_table *pos, *table = policy->freq_table;
+
+	if (!table)
+		return false;
+
+	cpufreq_for_each_valid_entry(pos, table)
+		if (pos->flags & CPUFREQ_BOOST_FREQ)
+			return true;
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(policy_has_boost_freq);
+
 int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy,
 				    struct cpufreq_frequency_table *table)
 {
diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c
index 380a90d3c57e..84fbc8e8fc56 100644
--- a/drivers/cpufreq/imx6q-cpufreq.c
+++ b/drivers/cpufreq/imx6q-cpufreq.c
@@ -202,7 +202,7 @@ static int imx6q_cpufreq_probe(struct platform_device *pdev)
 	 */
 	num = dev_pm_opp_get_opp_count(cpu_dev);
 	if (num < 0) {
-		ret = of_init_opp_table(cpu_dev);
+		ret = dev_pm_opp_of_add_table(cpu_dev);
 		if (ret < 0) {
 			dev_err(cpu_dev, "failed to init OPP table: %d\n", ret);
 			goto put_reg;
@@ -312,7 +312,7 @@ free_freq_table:
 	dev_pm_opp_free_cpufreq_table(cpu_dev, &freq_table);
 out_free_opp:
 	if (free_opp)
-		of_free_opp_table(cpu_dev);
+		dev_pm_opp_of_remove_table(cpu_dev);
 put_reg:
 	if (!IS_ERR(arm_reg))
 		regulator_put(arm_reg);
@@ -340,7 +340,7 @@ static int imx6q_cpufreq_remove(struct platform_device *pdev)
 	cpufreq_unregister_driver(&imx6q_cpufreq_driver);
 	dev_pm_opp_free_cpufreq_table(cpu_dev, &freq_table);
 	if (free_opp)
-		of_free_opp_table(cpu_dev);
+		dev_pm_opp_of_remove_table(cpu_dev);
 	regulator_put(arm_reg);
 	if (!IS_ERR(pu_reg))
 		regulator_put(pu_reg);
diff --git a/drivers/cpuidle/cpuidle-calxeda.c b/drivers/cpuidle/cpuidle-calxeda.c
index 9445e6cc02be..1c3242c3e3a7 100644
--- a/drivers/cpuidle/cpuidle-calxeda.c
+++ b/drivers/cpuidle/cpuidle-calxeda.c
@@ -25,16 +25,21 @@
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/platform_device.h>
+#include <linux/psci.h>
+
 #include <asm/cpuidle.h>
 #include <asm/suspend.h>
-#include <asm/psci.h>
+
+#include <uapi/linux/psci.h>
+
+#define CALXEDA_IDLE_PARAM \
+	((0 << PSCI_0_2_POWER_STATE_ID_SHIFT) | \
+	 (0 << PSCI_0_2_POWER_STATE_AFFL_SHIFT) | \
+	 (PSCI_POWER_STATE_TYPE_POWER_DOWN << PSCI_0_2_POWER_STATE_TYPE_SHIFT))
 
 static int calxeda_idle_finish(unsigned long val)
 {
-	const struct psci_power_state ps = {
-		.type = PSCI_POWER_STATE_TYPE_POWER_DOWN,
-	};
-	return psci_ops.cpu_suspend(ps, __pa(cpu_resume));
+	return psci_ops.cpu_suspend(CALXEDA_IDLE_PARAM, __pa(cpu_resume));
 }
 
 static int calxeda_pwrdown_idle(struct cpuidle_device *dev,
diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c
index ca1b362d77e2..33c753b21d14 100644
--- a/drivers/devfreq/devfreq.c
+++ b/drivers/devfreq/devfreq.c
@@ -177,10 +177,10 @@ int update_devfreq(struct devfreq *devfreq)
 		return err;
 
 	/*
-	 * Adjust the freuqency with user freq and QoS.
+	 * Adjust the frequency with user freq and QoS.
 	 *
-	 * List from the highest proiority
-	 * max_freq (probably called by thermal when it's too hot)
+	 * List from the highest priority
+	 * max_freq
 	 * min_freq
 	 */
 
@@ -482,7 +482,7 @@ struct devfreq *devfreq_add_device(struct device *dev,
 						devfreq->profile->max_state *
 						devfreq->profile->max_state,
 						GFP_KERNEL);
-	devfreq->time_in_state = devm_kzalloc(dev, sizeof(unsigned int) *
+	devfreq->time_in_state = devm_kzalloc(dev, sizeof(unsigned long) *
 						devfreq->profile->max_state,
 						GFP_KERNEL);
 	devfreq->last_stat_updated = jiffies;
diff --git a/drivers/devfreq/governor_simpleondemand.c b/drivers/devfreq/governor_simpleondemand.c
index 0720ba84ca92..ae72ba5e78df 100644
--- a/drivers/devfreq/governor_simpleondemand.c
+++ b/drivers/devfreq/governor_simpleondemand.c
@@ -21,17 +21,20 @@
 static int devfreq_simple_ondemand_func(struct devfreq *df,
 					unsigned long *freq)
 {
-	struct devfreq_dev_status stat;
-	int err = df->profile->get_dev_status(df->dev.parent, &stat);
+	int err;
+	struct devfreq_dev_status *stat;
 	unsigned long long a, b;
 	unsigned int dfso_upthreshold = DFSO_UPTHRESHOLD;
 	unsigned int dfso_downdifferential = DFSO_DOWNDIFFERENCTIAL;
 	struct devfreq_simple_ondemand_data *data = df->data;
 	unsigned long max = (df->max_freq) ? df->max_freq : UINT_MAX;
 
+	err = devfreq_update_stats(df);
 	if (err)
 		return err;
 
+	stat = &df->last_status;
+
 	if (data) {
 		if (data->upthreshold)
 			dfso_upthreshold = data->upthreshold;
@@ -43,41 +46,41 @@ static int devfreq_simple_ondemand_func(struct devfreq *df,
 		return -EINVAL;
 
 	/* Assume MAX if it is going to be divided by zero */
-	if (stat.total_time == 0) {
+	if (stat->total_time == 0) {
 		*freq = max;
 		return 0;
 	}
 
 	/* Prevent overflow */
-	if (stat.busy_time >= (1 << 24) || stat.total_time >= (1 << 24)) {
-		stat.busy_time >>= 7;
-		stat.total_time >>= 7;
+	if (stat->busy_time >= (1 << 24) || stat->total_time >= (1 << 24)) {
+		stat->busy_time >>= 7;
+		stat->total_time >>= 7;
 	}
 
 	/* Set MAX if it's busy enough */
-	if (stat.busy_time * 100 >
-	    stat.total_time * dfso_upthreshold) {
+	if (stat->busy_time * 100 >
+	    stat->total_time * dfso_upthreshold) {
 		*freq = max;
 		return 0;
 	}
 
 	/* Set MAX if we do not know the initial frequency */
-	if (stat.current_frequency == 0) {
+	if (stat->current_frequency == 0) {
 		*freq = max;
 		return 0;
 	}
 
 	/* Keep the current frequency */
-	if (stat.busy_time * 100 >
-	    stat.total_time * (dfso_upthreshold - dfso_downdifferential)) {
-		*freq = stat.current_frequency;
+	if (stat->busy_time * 100 >
+	    stat->total_time * (dfso_upthreshold - dfso_downdifferential)) {
+		*freq = stat->current_frequency;
 		return 0;
 	}
 
 	/* Set the desired frequency based on the load */
-	a = stat.busy_time;
-	a *= stat.current_frequency;
-	b = div_u64(a, stat.total_time);
+	a = stat->busy_time;
+	a *= stat->current_frequency;
+	b = div_u64(a, stat->total_time);
 	b *= 100;
 	b = div_u64(b, (dfso_upthreshold - dfso_downdifferential / 2));
 	*freq = (unsigned long) b;
diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
index 6517132e5d8b..a0d24f22b050 100644
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -5,6 +5,9 @@
 
 menu "Firmware Drivers"
 
+config ARM_PSCI_FW
+	bool
+
 config EDD
 	tristate "BIOS Enhanced Disk Drive calls determine boot disk"
 	depends on X86
@@ -136,6 +139,9 @@ config QCOM_SCM
 	bool
 	depends on ARM || ARM64
 
+config HAVE_ARM_SMCCC
+	bool
+
 source "drivers/firmware/google/Kconfig"
 source "drivers/firmware/efi/Kconfig"
 
diff --git a/drivers/firmware/Makefile b/drivers/firmware/Makefile
index 3fdd3912709a..6b051293e689 100644
--- a/drivers/firmware/Makefile
+++ b/drivers/firmware/Makefile
@@ -1,6 +1,7 @@
 #
 # Makefile for the linux kernel.
 #
+obj-$(CONFIG_ARM_PSCI_FW)	+= psci.o
 obj-$(CONFIG_DMI)		+= dmi_scan.o
 obj-$(CONFIG_DMI_SYSFS)		+= dmi-sysfs.o
 obj-$(CONFIG_EDD)		+= edd.o
diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile
index d8be608a9f3b..513ae7ce1449 100644
--- a/drivers/firmware/efi/Makefile
+++ b/drivers/firmware/efi/Makefile
@@ -1,6 +1,14 @@
 #
 # Makefile for linux kernel
 #
+
+#
+# ARM64 maps efi runtime services in userspace addresses
+# which don't have KASAN shadow. So dereference of these addresses
+# in efi_call_virt() will cause crash if this code instrumented.
+#
+KASAN_SANITIZE_runtime-wrappers.o	:= n
+
 obj-$(CONFIG_EFI)			+= efi.o vars.o reboot.o
 obj-$(CONFIG_EFI_VARS)			+= efivars.o
 obj-$(CONFIG_EFI_VARS_PSTORE)		+= efi-pstore.o
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index 280bc0a63365..c0ac48c5d510 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -14,6 +14,8 @@ cflags-$(CONFIG_ARM64)		:= $(subst -pg,,$(KBUILD_CFLAGS))
 cflags-$(CONFIG_ARM)		:= $(subst -pg,,$(KBUILD_CFLAGS)) \
 				   -fno-builtin -fpic -mno-single-pic-base
 
+cflags-$(CONFIG_EFI_ARMSTUB)	+= -I$(srctree)/scripts/dtc/libfdt
+
 KBUILD_CFLAGS			:= $(cflags-y) \
 				   $(call cc-option,-ffreestanding) \
 				   $(call cc-option,-fno-stack-protector)
@@ -22,7 +24,15 @@ GCOV_PROFILE			:= n
 KASAN_SANITIZE			:= n
 
 lib-y				:= efi-stub-helper.o
-lib-$(CONFIG_EFI_ARMSTUB)	+= arm-stub.o fdt.o
+
+# include the stub's generic dependencies from lib/ when building for ARM/arm64
+arm-deps := fdt_rw.c fdt_ro.c fdt_wip.c fdt.c fdt_empty_tree.c fdt_sw.c sort.c
+
+$(obj)/lib-%.o: $(srctree)/lib/%.c FORCE
+	$(call if_changed_rule,cc_o_c)
+
+lib-$(CONFIG_EFI_ARMSTUB)	+= arm-stub.o fdt.o string.o \
+				   $(patsubst %.c,lib-%.o,$(arm-deps))
 
 CFLAGS_fdt.o			+= -I$(srctree)/scripts/dtc/libfdt/
 
@@ -32,10 +42,27 @@ CFLAGS_fdt.o			+= -I$(srctree)/scripts/dtc/libfdt/
 # So let's apply the __init annotations at the section level, by prefixing
 # the section names directly. This will ensure that even all the inline string
 # literals are covered.
+# The fact that the stub and the kernel proper are essentially the same binary
+# also means that we need to be extra careful to make sure that the stub does
+# not rely on any absolute symbol references, considering that the virtual
+# kernel mapping that the linker uses is not active yet when the stub is
+# executing. So build all C dependencies of the EFI stub into libstub, and do
+# a verification pass to see if any absolute relocations exist in any of the
+# object files.
 #
-extra-$(CONFIG_ARM64)		:= $(lib-y)
-lib-$(CONFIG_ARM64)		:= $(patsubst %.o,%.init.o,$(lib-y))
+extra-$(CONFIG_EFI_ARMSTUB)	:= $(lib-y)
+lib-$(CONFIG_EFI_ARMSTUB)	:= $(patsubst %.o,%.stub.o,$(lib-y))
+
+STUBCOPY_FLAGS-y		:= -R .debug* -R *ksymtab* -R *kcrctab*
+STUBCOPY_FLAGS-$(CONFIG_ARM64)	+= --prefix-alloc-sections=.init \
+				   --prefix-symbols=__efistub_
+STUBCOPY_RELOC-$(CONFIG_ARM64)	:= R_AARCH64_ABS
+
+$(obj)/%.stub.o: $(obj)/%.o FORCE
+	$(call if_changed,stubcopy)
 
-OBJCOPYFLAGS := --prefix-alloc-sections=.init
-$(obj)/%.init.o: $(obj)/%.o FORCE
-	$(call if_changed,objcopy)
+quiet_cmd_stubcopy = STUBCPY $@
+      cmd_stubcopy = if $(OBJCOPY) $(STUBCOPY_FLAGS-y) $< $@; then	\
+		     $(OBJDUMP) -r $@ | grep $(STUBCOPY_RELOC-y)	\
+		     && (echo >&2 "$@: absolute symbol references not allowed in the EFI stub"; \
+			 rm -f $@; /bin/false); else /bin/false; fi
diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h
index e334a01cf92f..6b6548fda089 100644
--- a/drivers/firmware/efi/libstub/efistub.h
+++ b/drivers/firmware/efi/libstub/efistub.h
@@ -5,10 +5,6 @@
 /* error code which can't be mistaken for valid address */
 #define EFI_ERROR	(~0UL)
 
-#undef memcpy
-#undef memset
-#undef memmove
-
 void efi_char16_printk(efi_system_table_t *, efi_char16_t *);
 
 efi_status_t efi_open_volume(efi_system_table_t *sys_table_arg, void *__image,
diff --git a/drivers/firmware/efi/libstub/fdt.c b/drivers/firmware/efi/libstub/fdt.c
index ef5d764e2a27..b62e2f5dcab3 100644
--- a/drivers/firmware/efi/libstub/fdt.c
+++ b/drivers/firmware/efi/libstub/fdt.c
@@ -147,15 +147,6 @@ efi_status_t update_fdt(efi_system_table_t *sys_table, void *orig_fdt,
 	if (status)
 		goto fdt_set_fail;
 
-	/*
-	 * Add kernel version banner so stub/kernel match can be
-	 * verified.
-	 */
-	status = fdt_setprop_string(fdt, node, "linux,uefi-stub-kern-ver",
-			     linux_banner);
-	if (status)
-		goto fdt_set_fail;
-
 	return EFI_SUCCESS;
 
 fdt_set_fail:
diff --git a/drivers/firmware/efi/libstub/string.c b/drivers/firmware/efi/libstub/string.c
new file mode 100644
index 000000000000..09d5a0894343
--- /dev/null
+++ b/drivers/firmware/efi/libstub/string.c
@@ -0,0 +1,57 @@
+/*
+ * Taken from:
+ *  linux/lib/string.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+
+#ifndef __HAVE_ARCH_STRSTR
+/**
+ * strstr - Find the first substring in a %NUL terminated string
+ * @s1: The string to be searched
+ * @s2: The string to search for
+ */
+char *strstr(const char *s1, const char *s2)
+{
+	size_t l1, l2;
+
+	l2 = strlen(s2);
+	if (!l2)
+		return (char *)s1;
+	l1 = strlen(s1);
+	while (l1 >= l2) {
+		l1--;
+		if (!memcmp(s1, s2, l2))
+			return (char *)s1;
+		s1++;
+	}
+	return NULL;
+}
+#endif
+
+#ifndef __HAVE_ARCH_STRNCMP
+/**
+ * strncmp - Compare two length-limited strings
+ * @cs: One string
+ * @ct: Another string
+ * @count: The maximum number of bytes to compare
+ */
+int strncmp(const char *cs, const char *ct, size_t count)
+{
+	unsigned char c1, c2;
+
+	while (count) {
+		c1 = *cs++;
+		c2 = *ct++;
+		if (c1 != c2)
+			return c1 < c2 ? -1 : 1;
+		if (!c1)
+			break;
+		count--;
+	}
+	return 0;
+}
+#endif
diff --git a/drivers/firmware/psci.c b/drivers/firmware/psci.c
new file mode 100644
index 000000000000..0ab477ba6564
--- /dev/null
+++ b/drivers/firmware/psci.c
@@ -0,0 +1,609 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Copyright (C) 2015 ARM Limited
+ */
+
+#define pr_fmt(fmt) "psci: " fmt
+
+#include <linux/arm-smccc.h>
+#include <linux/cpuidle.h>
+#include <linux/errno.h>
+#include <linux/linkage.h>
+#include <linux/of.h>
+#include <linux/pm.h>
+#include <linux/printk.h>
+#include <linux/psci.h>
+#include <linux/reboot.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+
+#include <uapi/linux/psci.h>
+
+#include <asm/cpuidle.h>
+#include <asm/cputype.h>
+#include <asm/system_misc.h>
+#include <asm/smp_plat.h>
+#include <asm/suspend.h>
+
+/*
+ * While a 64-bit OS can make calls with SMC32 calling conventions, for some
+ * calls it is necessary to use SMC64 to pass or return 64-bit values.
+ * For such calls PSCI_FN_NATIVE(version, name) will choose the appropriate
+ * (native-width) function ID.
+ */
+#ifdef CONFIG_64BIT
+#define PSCI_FN_NATIVE(version, name)	PSCI_##version##_FN64_##name
+#else
+#define PSCI_FN_NATIVE(version, name)	PSCI_##version##_FN_##name
+#endif
+
+/*
+ * The CPU any Trusted OS is resident on. The trusted OS may reject CPU_OFF
+ * calls to its resident CPU, so we must avoid issuing those. We never migrate
+ * a Trusted OS even if it claims to be capable of migration -- doing so will
+ * require cooperation with a Trusted OS driver.
+ */
+static int resident_cpu = -1;
+
+bool psci_tos_resident_on(int cpu)
+{
+	return cpu == resident_cpu;
+}
+
+struct psci_operations psci_ops;
+
+typedef unsigned long (psci_fn)(unsigned long, unsigned long,
+				unsigned long, unsigned long);
+static psci_fn *invoke_psci_fn;
+
+enum psci_function {
+	PSCI_FN_CPU_SUSPEND,
+	PSCI_FN_CPU_ON,
+	PSCI_FN_CPU_OFF,
+	PSCI_FN_MIGRATE,
+	PSCI_FN_MAX,
+};
+
+static u32 psci_function_id[PSCI_FN_MAX];
+
+#define PSCI_0_2_POWER_STATE_MASK		\
+				(PSCI_0_2_POWER_STATE_ID_MASK | \
+				PSCI_0_2_POWER_STATE_TYPE_MASK | \
+				PSCI_0_2_POWER_STATE_AFFL_MASK)
+
+#define PSCI_1_0_EXT_POWER_STATE_MASK		\
+				(PSCI_1_0_EXT_POWER_STATE_ID_MASK | \
+				PSCI_1_0_EXT_POWER_STATE_TYPE_MASK)
+
+static u32 psci_cpu_suspend_feature;
+
+static inline bool psci_has_ext_power_state(void)
+{
+	return psci_cpu_suspend_feature &
+				PSCI_1_0_FEATURES_CPU_SUSPEND_PF_MASK;
+}
+
+bool psci_power_state_loses_context(u32 state)
+{
+	const u32 mask = psci_has_ext_power_state() ?
+					PSCI_1_0_EXT_POWER_STATE_TYPE_MASK :
+					PSCI_0_2_POWER_STATE_TYPE_MASK;
+
+	return state & mask;
+}
+
+bool psci_power_state_is_valid(u32 state)
+{
+	const u32 valid_mask = psci_has_ext_power_state() ?
+			       PSCI_1_0_EXT_POWER_STATE_MASK :
+			       PSCI_0_2_POWER_STATE_MASK;
+
+	return !(state & ~valid_mask);
+}
+
+static unsigned long __invoke_psci_fn_hvc(unsigned long function_id,
+			unsigned long arg0, unsigned long arg1,
+			unsigned long arg2)
+{
+	struct arm_smccc_res res;
+
+	arm_smccc_hvc(function_id, arg0, arg1, arg2, 0, 0, 0, 0, &res);
+	return res.a0;
+}
+
+static unsigned long __invoke_psci_fn_smc(unsigned long function_id,
+			unsigned long arg0, unsigned long arg1,
+			unsigned long arg2)
+{
+	struct arm_smccc_res res;
+
+	arm_smccc_smc(function_id, arg0, arg1, arg2, 0, 0, 0, 0, &res);
+	return res.a0;
+}
+
+static int psci_to_linux_errno(int errno)
+{
+	switch (errno) {
+	case PSCI_RET_SUCCESS:
+		return 0;
+	case PSCI_RET_NOT_SUPPORTED:
+		return -EOPNOTSUPP;
+	case PSCI_RET_INVALID_PARAMS:
+	case PSCI_RET_INVALID_ADDRESS:
+		return -EINVAL;
+	case PSCI_RET_DENIED:
+		return -EPERM;
+	};
+
+	return -EINVAL;
+}
+
+static u32 psci_get_version(void)
+{
+	return invoke_psci_fn(PSCI_0_2_FN_PSCI_VERSION, 0, 0, 0);
+}
+
+static int psci_cpu_suspend(u32 state, unsigned long entry_point)
+{
+	int err;
+	u32 fn;
+
+	fn = psci_function_id[PSCI_FN_CPU_SUSPEND];
+	err = invoke_psci_fn(fn, state, entry_point, 0);
+	return psci_to_linux_errno(err);
+}
+
+static int psci_cpu_off(u32 state)
+{
+	int err;
+	u32 fn;
+
+	fn = psci_function_id[PSCI_FN_CPU_OFF];
+	err = invoke_psci_fn(fn, state, 0, 0);
+	return psci_to_linux_errno(err);
+}
+
+static int psci_cpu_on(unsigned long cpuid, unsigned long entry_point)
+{
+	int err;
+	u32 fn;
+
+	fn = psci_function_id[PSCI_FN_CPU_ON];
+	err = invoke_psci_fn(fn, cpuid, entry_point, 0);
+	return psci_to_linux_errno(err);
+}
+
+static int psci_migrate(unsigned long cpuid)
+{
+	int err;
+	u32 fn;
+
+	fn = psci_function_id[PSCI_FN_MIGRATE];
+	err = invoke_psci_fn(fn, cpuid, 0, 0);
+	return psci_to_linux_errno(err);
+}
+
+static int psci_affinity_info(unsigned long target_affinity,
+		unsigned long lowest_affinity_level)
+{
+	return invoke_psci_fn(PSCI_FN_NATIVE(0_2, AFFINITY_INFO),
+			      target_affinity, lowest_affinity_level, 0);
+}
+
+static int psci_migrate_info_type(void)
+{
+	return invoke_psci_fn(PSCI_0_2_FN_MIGRATE_INFO_TYPE, 0, 0, 0);
+}
+
+static unsigned long psci_migrate_info_up_cpu(void)
+{
+	return invoke_psci_fn(PSCI_FN_NATIVE(0_2, MIGRATE_INFO_UP_CPU),
+			      0, 0, 0);
+}
+
+static int get_set_conduit_method(struct device_node *np)
+{
+	const char *method;
+
+	pr_info("probing for conduit method from DT.\n");
+
+	if (of_property_read_string(np, "method", &method)) {
+		pr_warn("missing \"method\" property\n");
+		return -ENXIO;
+	}
+
+	if (!strcmp("hvc", method)) {
+		invoke_psci_fn = __invoke_psci_fn_hvc;
+	} else if (!strcmp("smc", method)) {
+		invoke_psci_fn = __invoke_psci_fn_smc;
+	} else {
+		pr_warn("invalid \"method\" property: %s\n", method);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static void psci_sys_reset(enum reboot_mode reboot_mode, const char *cmd)
+{
+	invoke_psci_fn(PSCI_0_2_FN_SYSTEM_RESET, 0, 0, 0);
+}
+
+static void psci_sys_poweroff(void)
+{
+	invoke_psci_fn(PSCI_0_2_FN_SYSTEM_OFF, 0, 0, 0);
+}
+
+static int __init psci_features(u32 psci_func_id)
+{
+	return invoke_psci_fn(PSCI_1_0_FN_PSCI_FEATURES,
+			      psci_func_id, 0, 0);
+}
+
+#ifdef CONFIG_CPU_IDLE
+static DEFINE_PER_CPU_READ_MOSTLY(u32 *, psci_power_state);
+
+static int psci_dt_cpu_init_idle(struct device_node *cpu_node, int cpu)
+{
+	int i, ret, count = 0;
+	u32 *psci_states;
+	struct device_node *state_node;
+
+	/*
+	 * If the PSCI cpu_suspend function hook has not been initialized
+	 * idle states must not be enabled, so bail out
+	 */
+	if (!psci_ops.cpu_suspend)
+		return -EOPNOTSUPP;
+
+	/* Count idle states */
+	while ((state_node = of_parse_phandle(cpu_node, "cpu-idle-states",
+					      count))) {
+		count++;
+		of_node_put(state_node);
+	}
+
+	if (!count)
+		return -ENODEV;
+
+	psci_states = kcalloc(count, sizeof(*psci_states), GFP_KERNEL);
+	if (!psci_states)
+		return -ENOMEM;
+
+	for (i = 0; i < count; i++) {
+		u32 state;
+
+		state_node = of_parse_phandle(cpu_node, "cpu-idle-states", i);
+
+		ret = of_property_read_u32(state_node,
+					   "arm,psci-suspend-param",
+					   &state);
+		if (ret) {
+			pr_warn(" * %s missing arm,psci-suspend-param property\n",
+				state_node->full_name);
+			of_node_put(state_node);
+			goto free_mem;
+		}
+
+		of_node_put(state_node);
+		pr_debug("psci-power-state %#x index %d\n", state, i);
+		if (!psci_power_state_is_valid(state)) {
+			pr_warn("Invalid PSCI power state %#x\n", state);
+			ret = -EINVAL;
+			goto free_mem;
+		}
+		psci_states[i] = state;
+	}
+	/* Idle states parsed correctly, initialize per-cpu pointer */
+	per_cpu(psci_power_state, cpu) = psci_states;
+	return 0;
+
+free_mem:
+	kfree(psci_states);
+	return ret;
+}
+
+int psci_cpu_init_idle(unsigned int cpu)
+{
+	struct device_node *cpu_node;
+	int ret;
+
+	cpu_node = of_get_cpu_node(cpu, NULL);
+	if (!cpu_node)
+		return -ENODEV;
+
+	ret = psci_dt_cpu_init_idle(cpu_node, cpu);
+
+	of_node_put(cpu_node);
+
+	return ret;
+}
+
+static int psci_suspend_finisher(unsigned long index)
+{
+	u32 *state = __this_cpu_read(psci_power_state);
+
+	return psci_ops.cpu_suspend(state[index - 1],
+				    virt_to_phys(cpu_resume));
+}
+
+int psci_cpu_suspend_enter(unsigned long index)
+{
+	int ret;
+	u32 *state = __this_cpu_read(psci_power_state);
+	/*
+	 * idle state index 0 corresponds to wfi, should never be called
+	 * from the cpu_suspend operations
+	 */
+	if (WARN_ON_ONCE(!index))
+		return -EINVAL;
+
+	if (!psci_power_state_loses_context(state[index - 1]))
+		ret = psci_ops.cpu_suspend(state[index - 1], 0);
+	else
+		ret = cpu_suspend(index, psci_suspend_finisher);
+
+	return ret;
+}
+
+/* ARM specific CPU idle operations */
+#ifdef CONFIG_ARM
+static struct cpuidle_ops psci_cpuidle_ops __initdata = {
+	.suspend = psci_cpu_suspend_enter,
+	.init = psci_dt_cpu_init_idle,
+};
+
+CPUIDLE_METHOD_OF_DECLARE(psci, "arm,psci", &psci_cpuidle_ops);
+#endif
+#endif
+
+static int psci_system_suspend(unsigned long unused)
+{
+	return invoke_psci_fn(PSCI_FN_NATIVE(1_0, SYSTEM_SUSPEND),
+			      virt_to_phys(cpu_resume), 0, 0);
+}
+
+static int psci_system_suspend_enter(suspend_state_t state)
+{
+	return cpu_suspend(0, psci_system_suspend);
+}
+
+static const struct platform_suspend_ops psci_suspend_ops = {
+	.valid          = suspend_valid_only_mem,
+	.enter          = psci_system_suspend_enter,
+};
+
+static void __init psci_init_system_suspend(void)
+{
+	int ret;
+
+	if (!IS_ENABLED(CONFIG_SUSPEND))
+		return;
+
+	ret = psci_features(PSCI_FN_NATIVE(1_0, SYSTEM_SUSPEND));
+
+	if (ret != PSCI_RET_NOT_SUPPORTED)
+		suspend_set_ops(&psci_suspend_ops);
+}
+
+static void __init psci_init_cpu_suspend(void)
+{
+	int feature = psci_features(psci_function_id[PSCI_FN_CPU_SUSPEND]);
+
+	if (feature != PSCI_RET_NOT_SUPPORTED)
+		psci_cpu_suspend_feature = feature;
+}
+
+/*
+ * Detect the presence of a resident Trusted OS which may cause CPU_OFF to
+ * return DENIED (which would be fatal).
+ */
+static void __init psci_init_migrate(void)
+{
+	unsigned long cpuid;
+	int type, cpu = -1;
+
+	type = psci_ops.migrate_info_type();
+
+	if (type == PSCI_0_2_TOS_MP) {
+		pr_info("Trusted OS migration not required\n");
+		return;
+	}
+
+	if (type == PSCI_RET_NOT_SUPPORTED) {
+		pr_info("MIGRATE_INFO_TYPE not supported.\n");
+		return;
+	}
+
+	if (type != PSCI_0_2_TOS_UP_MIGRATE &&
+	    type != PSCI_0_2_TOS_UP_NO_MIGRATE) {
+		pr_err("MIGRATE_INFO_TYPE returned unknown type (%d)\n", type);
+		return;
+	}
+
+	cpuid = psci_migrate_info_up_cpu();
+	if (cpuid & ~MPIDR_HWID_BITMASK) {
+		pr_warn("MIGRATE_INFO_UP_CPU reported invalid physical ID (0x%lx)\n",
+			cpuid);
+		return;
+	}
+
+	cpu = get_logical_index(cpuid);
+	resident_cpu = cpu >= 0 ? cpu : -1;
+
+	pr_info("Trusted OS resident on physical CPU 0x%lx\n", cpuid);
+}
+
+static void __init psci_0_2_set_functions(void)
+{
+	pr_info("Using standard PSCI v0.2 function IDs\n");
+	psci_function_id[PSCI_FN_CPU_SUSPEND] =
+					PSCI_FN_NATIVE(0_2, CPU_SUSPEND);
+	psci_ops.cpu_suspend = psci_cpu_suspend;
+
+	psci_function_id[PSCI_FN_CPU_OFF] = PSCI_0_2_FN_CPU_OFF;
+	psci_ops.cpu_off = psci_cpu_off;
+
+	psci_function_id[PSCI_FN_CPU_ON] = PSCI_FN_NATIVE(0_2, CPU_ON);
+	psci_ops.cpu_on = psci_cpu_on;
+
+	psci_function_id[PSCI_FN_MIGRATE] = PSCI_FN_NATIVE(0_2, MIGRATE);
+	psci_ops.migrate = psci_migrate;
+
+	psci_ops.affinity_info = psci_affinity_info;
+
+	psci_ops.migrate_info_type = psci_migrate_info_type;
+
+	arm_pm_restart = psci_sys_reset;
+
+	pm_power_off = psci_sys_poweroff;
+}
+
+/*
+ * Probe function for PSCI firmware versions >= 0.2
+ */
+static int __init psci_probe(void)
+{
+	u32 ver = psci_get_version();
+
+	pr_info("PSCIv%d.%d detected in firmware.\n",
+			PSCI_VERSION_MAJOR(ver),
+			PSCI_VERSION_MINOR(ver));
+
+	if (PSCI_VERSION_MAJOR(ver) == 0 && PSCI_VERSION_MINOR(ver) < 2) {
+		pr_err("Conflicting PSCI version detected.\n");
+		return -EINVAL;
+	}
+
+	psci_0_2_set_functions();
+
+	psci_init_migrate();
+
+	if (PSCI_VERSION_MAJOR(ver) >= 1) {
+		psci_init_cpu_suspend();
+		psci_init_system_suspend();
+	}
+
+	return 0;
+}
+
+typedef int (*psci_initcall_t)(const struct device_node *);
+
+/*
+ * PSCI init function for PSCI versions >=0.2
+ *
+ * Probe based on PSCI PSCI_VERSION function
+ */
+static int __init psci_0_2_init(struct device_node *np)
+{
+	int err;
+
+	err = get_set_conduit_method(np);
+
+	if (err)
+		goto out_put_node;
+	/*
+	 * Starting with v0.2, the PSCI specification introduced a call
+	 * (PSCI_VERSION) that allows probing the firmware version, so
+	 * that PSCI function IDs and version specific initialization
+	 * can be carried out according to the specific version reported
+	 * by firmware
+	 */
+	err = psci_probe();
+
+out_put_node:
+	of_node_put(np);
+	return err;
+}
+
+/*
+ * PSCI < v0.2 get PSCI Function IDs via DT.
+ */
+static int __init psci_0_1_init(struct device_node *np)
+{
+	u32 id;
+	int err;
+
+	err = get_set_conduit_method(np);
+
+	if (err)
+		goto out_put_node;
+
+	pr_info("Using PSCI v0.1 Function IDs from DT\n");
+
+	if (!of_property_read_u32(np, "cpu_suspend", &id)) {
+		psci_function_id[PSCI_FN_CPU_SUSPEND] = id;
+		psci_ops.cpu_suspend = psci_cpu_suspend;
+	}
+
+	if (!of_property_read_u32(np, "cpu_off", &id)) {
+		psci_function_id[PSCI_FN_CPU_OFF] = id;
+		psci_ops.cpu_off = psci_cpu_off;
+	}
+
+	if (!of_property_read_u32(np, "cpu_on", &id)) {
+		psci_function_id[PSCI_FN_CPU_ON] = id;
+		psci_ops.cpu_on = psci_cpu_on;
+	}
+
+	if (!of_property_read_u32(np, "migrate", &id)) {
+		psci_function_id[PSCI_FN_MIGRATE] = id;
+		psci_ops.migrate = psci_migrate;
+	}
+
+out_put_node:
+	of_node_put(np);
+	return err;
+}
+
+static const struct of_device_id psci_of_match[] __initconst = {
+	{ .compatible = "arm,psci",	.data = psci_0_1_init},
+	{ .compatible = "arm,psci-0.2",	.data = psci_0_2_init},
+	{ .compatible = "arm,psci-1.0",	.data = psci_0_2_init},
+	{},
+};
+
+int __init psci_dt_init(void)
+{
+	struct device_node *np;
+	const struct of_device_id *matched_np;
+	psci_initcall_t init_fn;
+
+	np = of_find_matching_node_and_match(NULL, psci_of_match, &matched_np);
+
+	if (!np)
+		return -ENODEV;
+
+	init_fn = (psci_initcall_t)matched_np->data;
+	return init_fn(np);
+}
+
+#ifdef CONFIG_ACPI
+/*
+ * We use PSCI 0.2+ when ACPI is deployed on ARM64 and it's
+ * explicitly clarified in SBBR
+ */
+int __init psci_acpi_init(void)
+{
+	if (!acpi_psci_present()) {
+		pr_info("is not implemented in ACPI.\n");
+		return -EOPNOTSUPP;
+	}
+
+	pr_info("probing for conduit method from ACPI.\n");
+
+	if (acpi_psci_use_hvc())
+		invoke_psci_fn = __invoke_psci_fn_hvc;
+	else
+		invoke_psci_fn = __invoke_psci_fn_smc;
+
+	return psci_probe();
+}
+#endif
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 1ae4e547b419..388b48697239 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -23,7 +23,7 @@ config IOMMU_IO_PGTABLE
 config IOMMU_IO_PGTABLE_LPAE
 	bool "ARMv7/v8 Long Descriptor Format"
 	select IOMMU_IO_PGTABLE
-	depends on ARM || ARM64 || COMPILE_TEST
+	depends on HAS_DMA && (ARM || ARM64 || COMPILE_TEST)
 	help
 	  Enable support for the ARM long descriptor pagetable format.
 	  This allocator supports 4K/2M/1G, 16K/32M and 64K/512M page
@@ -48,6 +48,13 @@ config OF_IOMMU
        def_bool y
        depends on OF && IOMMU_API
 
+# IOMMU-agnostic DMA-mapping layer
+config IOMMU_DMA
+	bool
+	depends on NEED_SG_DMA_LENGTH
+	select IOMMU_API
+	select IOMMU_IOVA
+
 config FSL_PAMU
 	bool "Freescale IOMMU support"
 	depends on PPC32
@@ -339,6 +346,7 @@ config SPAPR_TCE_IOMMU
 	  Enables bits of IOMMU API required by VFIO. The iommu_ops
 	  is not implemented as it is not necessary for VFIO.
 
+# ARM IOMMU support
 config ARM_SMMU
 	bool "ARM Ltd. System MMU (SMMU) Support"
 	depends on (ARM64 || ARM) && MMU
@@ -352,4 +360,16 @@ config ARM_SMMU
 	  Say Y here if your SoC includes an IOMMU device implementing
 	  the ARM SMMU architecture.
 
+config ARM_SMMU_V3
+	bool "ARM Ltd. System MMU Version 3 (SMMUv3) Support"
+	depends on ARM64 && PCI
+	select IOMMU_API
+	select IOMMU_IO_PGTABLE_LPAE
+	help
+	  Support for implementations of the ARM System MMU architecture
+	  version 3 providing translation support to a PCIe root complex.
+
+	  Say Y here if your system includes an IOMMU device implementing
+	  the ARM SMMUv3 architecture.
+
 endif # IOMMU_SUPPORT
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 080ffab4ed1c..f465cfbdb183 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -1,6 +1,7 @@
 obj-$(CONFIG_IOMMU_API) += iommu.o
 obj-$(CONFIG_IOMMU_API) += iommu-traces.o
 obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o
+obj-$(CONFIG_IOMMU_DMA) += dma-iommu.o
 obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o
 obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o
 obj-$(CONFIG_IOMMU_IOVA) += iova.o
@@ -9,6 +10,7 @@ obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o msm_iommu_dev.o
 obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o amd_iommu_init.o
 obj-$(CONFIG_AMD_IOMMU_V2) += amd_iommu_v2.o
 obj-$(CONFIG_ARM_SMMU) += arm-smmu.o
+obj-$(CONFIG_ARM_SMMU_V3) += arm-smmu-v3.o
 obj-$(CONFIG_DMAR_TABLE) += dmar.o
 obj-$(CONFIG_INTEL_IOMMU) += intel-iommu.o
 obj-$(CONFIG_IPMMU_VMSA) += ipmmu-vmsa.o
diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 1750db0ef61c..2669b21bdfee 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -138,7 +138,7 @@ u16 amd_iommu_last_bdf;			/* largest PCI device id we have
 					   to handle */
 LIST_HEAD(amd_iommu_unity_map);		/* a list of required unity mappings
 					   we find in ACPI */
-u32 amd_iommu_unmap_flush;		/* if true, flush on every unmap */
+bool amd_iommu_unmap_flush;		/* if true, flush on every unmap */
 
 LIST_HEAD(amd_iommu_list);		/* list of all AMD IOMMUs in the
 					   system */
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index cbfd0f4c4608..d86bc243a57e 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -681,7 +681,7 @@ extern unsigned long *amd_iommu_pd_alloc_bitmap;
  * If true, the addresses will be flushed on unmap time, not when
  * they are reused
  */
-extern u32 amd_iommu_unmap_flush;
+extern bool amd_iommu_unmap_flush;
 
 /* Smallest max PASID supported by any IOMMU in the system */
 extern u32 amd_iommu_max_pasid;
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
new file mode 100644
index 000000000000..eda8eaf381c9
--- /dev/null
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -0,0 +1,2675 @@
+/*
+ * IOMMU API for ARM architected SMMUv3 implementations.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Copyright (C) 2015 ARM Limited
+ *
+ * Author: Will Deacon <will.deacon@arm.com>
+ *
+ * This driver is powered by bad coffee and bombay mix.
+ */
+
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/iommu.h>
+#include <linux/iopoll.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_platform.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+
+#include "io-pgtable.h"
+
+/* MMIO registers */
+#define ARM_SMMU_IDR0			0x0
+#define IDR0_ST_LVL_SHIFT		27
+#define IDR0_ST_LVL_MASK		0x3
+#define IDR0_ST_LVL_2LVL		(1 << IDR0_ST_LVL_SHIFT)
+#define IDR0_STALL_MODEL		(3 << 24)
+#define IDR0_TTENDIAN_SHIFT		21
+#define IDR0_TTENDIAN_MASK		0x3
+#define IDR0_TTENDIAN_LE		(2 << IDR0_TTENDIAN_SHIFT)
+#define IDR0_TTENDIAN_BE		(3 << IDR0_TTENDIAN_SHIFT)
+#define IDR0_TTENDIAN_MIXED		(0 << IDR0_TTENDIAN_SHIFT)
+#define IDR0_CD2L			(1 << 19)
+#define IDR0_VMID16			(1 << 18)
+#define IDR0_PRI			(1 << 16)
+#define IDR0_SEV			(1 << 14)
+#define IDR0_MSI			(1 << 13)
+#define IDR0_ASID16			(1 << 12)
+#define IDR0_ATS			(1 << 10)
+#define IDR0_HYP			(1 << 9)
+#define IDR0_COHACC			(1 << 4)
+#define IDR0_TTF_SHIFT			2
+#define IDR0_TTF_MASK			0x3
+#define IDR0_TTF_AARCH64		(2 << IDR0_TTF_SHIFT)
+#define IDR0_TTF_AARCH32_64		(3 << IDR0_TTF_SHIFT)
+#define IDR0_S1P			(1 << 1)
+#define IDR0_S2P			(1 << 0)
+
+#define ARM_SMMU_IDR1			0x4
+#define IDR1_TABLES_PRESET		(1 << 30)
+#define IDR1_QUEUES_PRESET		(1 << 29)
+#define IDR1_REL			(1 << 28)
+#define IDR1_CMDQ_SHIFT			21
+#define IDR1_CMDQ_MASK			0x1f
+#define IDR1_EVTQ_SHIFT			16
+#define IDR1_EVTQ_MASK			0x1f
+#define IDR1_PRIQ_SHIFT			11
+#define IDR1_PRIQ_MASK			0x1f
+#define IDR1_SSID_SHIFT			6
+#define IDR1_SSID_MASK			0x1f
+#define IDR1_SID_SHIFT			0
+#define IDR1_SID_MASK			0x3f
+
+#define ARM_SMMU_IDR5			0x14
+#define IDR5_STALL_MAX_SHIFT		16
+#define IDR5_STALL_MAX_MASK		0xffff
+#define IDR5_GRAN64K			(1 << 6)
+#define IDR5_GRAN16K			(1 << 5)
+#define IDR5_GRAN4K			(1 << 4)
+#define IDR5_OAS_SHIFT			0
+#define IDR5_OAS_MASK			0x7
+#define IDR5_OAS_32_BIT			(0 << IDR5_OAS_SHIFT)
+#define IDR5_OAS_36_BIT			(1 << IDR5_OAS_SHIFT)
+#define IDR5_OAS_40_BIT			(2 << IDR5_OAS_SHIFT)
+#define IDR5_OAS_42_BIT			(3 << IDR5_OAS_SHIFT)
+#define IDR5_OAS_44_BIT			(4 << IDR5_OAS_SHIFT)
+#define IDR5_OAS_48_BIT			(5 << IDR5_OAS_SHIFT)
+
+#define ARM_SMMU_CR0			0x20
+#define CR0_CMDQEN			(1 << 3)
+#define CR0_EVTQEN			(1 << 2)
+#define CR0_PRIQEN			(1 << 1)
+#define CR0_SMMUEN			(1 << 0)
+
+#define ARM_SMMU_CR0ACK			0x24
+
+#define ARM_SMMU_CR1			0x28
+#define CR1_SH_NSH			0
+#define CR1_SH_OSH			2
+#define CR1_SH_ISH			3
+#define CR1_CACHE_NC			0
+#define CR1_CACHE_WB			1
+#define CR1_CACHE_WT			2
+#define CR1_TABLE_SH_SHIFT		10
+#define CR1_TABLE_OC_SHIFT		8
+#define CR1_TABLE_IC_SHIFT		6
+#define CR1_QUEUE_SH_SHIFT		4
+#define CR1_QUEUE_OC_SHIFT		2
+#define CR1_QUEUE_IC_SHIFT		0
+
+#define ARM_SMMU_CR2			0x2c
+#define CR2_PTM				(1 << 2)
+#define CR2_RECINVSID			(1 << 1)
+#define CR2_E2H				(1 << 0)
+
+#define ARM_SMMU_IRQ_CTRL		0x50
+#define IRQ_CTRL_EVTQ_IRQEN		(1 << 2)
+#define IRQ_CTRL_PRIQ_IRQEN		(1 << 1)
+#define IRQ_CTRL_GERROR_IRQEN		(1 << 0)
+
+#define ARM_SMMU_IRQ_CTRLACK		0x54
+
+#define ARM_SMMU_GERROR			0x60
+#define GERROR_SFM_ERR			(1 << 8)
+#define GERROR_MSI_GERROR_ABT_ERR	(1 << 7)
+#define GERROR_MSI_PRIQ_ABT_ERR		(1 << 6)
+#define GERROR_MSI_EVTQ_ABT_ERR		(1 << 5)
+#define GERROR_MSI_CMDQ_ABT_ERR		(1 << 4)
+#define GERROR_PRIQ_ABT_ERR		(1 << 3)
+#define GERROR_EVTQ_ABT_ERR		(1 << 2)
+#define GERROR_CMDQ_ERR			(1 << 0)
+#define GERROR_ERR_MASK			0xfd
+
+#define ARM_SMMU_GERRORN		0x64
+
+#define ARM_SMMU_GERROR_IRQ_CFG0	0x68
+#define ARM_SMMU_GERROR_IRQ_CFG1	0x70
+#define ARM_SMMU_GERROR_IRQ_CFG2	0x74
+
+#define ARM_SMMU_STRTAB_BASE		0x80
+#define STRTAB_BASE_RA			(1UL << 62)
+#define STRTAB_BASE_ADDR_SHIFT		6
+#define STRTAB_BASE_ADDR_MASK		0x3ffffffffffUL
+
+#define ARM_SMMU_STRTAB_BASE_CFG	0x88
+#define STRTAB_BASE_CFG_LOG2SIZE_SHIFT	0
+#define STRTAB_BASE_CFG_LOG2SIZE_MASK	0x3f
+#define STRTAB_BASE_CFG_SPLIT_SHIFT	6
+#define STRTAB_BASE_CFG_SPLIT_MASK	0x1f
+#define STRTAB_BASE_CFG_FMT_SHIFT	16
+#define STRTAB_BASE_CFG_FMT_MASK	0x3
+#define STRTAB_BASE_CFG_FMT_LINEAR	(0 << STRTAB_BASE_CFG_FMT_SHIFT)
+#define STRTAB_BASE_CFG_FMT_2LVL	(1 << STRTAB_BASE_CFG_FMT_SHIFT)
+
+#define ARM_SMMU_CMDQ_BASE		0x90
+#define ARM_SMMU_CMDQ_PROD		0x98
+#define ARM_SMMU_CMDQ_CONS		0x9c
+
+#define ARM_SMMU_EVTQ_BASE		0xa0
+#define ARM_SMMU_EVTQ_PROD		0x100a8
+#define ARM_SMMU_EVTQ_CONS		0x100ac
+#define ARM_SMMU_EVTQ_IRQ_CFG0		0xb0
+#define ARM_SMMU_EVTQ_IRQ_CFG1		0xb8
+#define ARM_SMMU_EVTQ_IRQ_CFG2		0xbc
+
+#define ARM_SMMU_PRIQ_BASE		0xc0
+#define ARM_SMMU_PRIQ_PROD		0x100c8
+#define ARM_SMMU_PRIQ_CONS		0x100cc
+#define ARM_SMMU_PRIQ_IRQ_CFG0		0xd0
+#define ARM_SMMU_PRIQ_IRQ_CFG1		0xd8
+#define ARM_SMMU_PRIQ_IRQ_CFG2		0xdc
+
+/* Common MSI config fields */
+#define MSI_CFG0_ADDR_SHIFT		2
+#define MSI_CFG0_ADDR_MASK		0x3fffffffffffUL
+#define MSI_CFG2_SH_SHIFT		4
+#define MSI_CFG2_SH_NSH			(0UL << MSI_CFG2_SH_SHIFT)
+#define MSI_CFG2_SH_OSH			(2UL << MSI_CFG2_SH_SHIFT)
+#define MSI_CFG2_SH_ISH			(3UL << MSI_CFG2_SH_SHIFT)
+#define MSI_CFG2_MEMATTR_SHIFT		0
+#define MSI_CFG2_MEMATTR_DEVICE_nGnRE	(0x1 << MSI_CFG2_MEMATTR_SHIFT)
+
+#define Q_IDX(q, p)			((p) & ((1 << (q)->max_n_shift) - 1))
+#define Q_WRP(q, p)			((p) & (1 << (q)->max_n_shift))
+#define Q_OVERFLOW_FLAG			(1 << 31)
+#define Q_OVF(q, p)			((p) & Q_OVERFLOW_FLAG)
+#define Q_ENT(q, p)			((q)->base +			\
+					 Q_IDX(q, p) * (q)->ent_dwords)
+
+#define Q_BASE_RWA			(1UL << 62)
+#define Q_BASE_ADDR_SHIFT		5
+#define Q_BASE_ADDR_MASK		0xfffffffffffUL
+#define Q_BASE_LOG2SIZE_SHIFT		0
+#define Q_BASE_LOG2SIZE_MASK		0x1fUL
+
+/*
+ * Stream table.
+ *
+ * Linear: Enough to cover 1 << IDR1.SIDSIZE entries
+ * 2lvl: 128k L1 entries,
+ *       256 lazy entries per table (each table covers a PCI bus)
+ */
+#define STRTAB_L1_SZ_SHIFT		20
+#define STRTAB_SPLIT			8
+
+#define STRTAB_L1_DESC_DWORDS		1
+#define STRTAB_L1_DESC_SPAN_SHIFT	0
+#define STRTAB_L1_DESC_SPAN_MASK	0x1fUL
+#define STRTAB_L1_DESC_L2PTR_SHIFT	6
+#define STRTAB_L1_DESC_L2PTR_MASK	0x3ffffffffffUL
+
+#define STRTAB_STE_DWORDS		8
+#define STRTAB_STE_0_V			(1UL << 0)
+#define STRTAB_STE_0_CFG_SHIFT		1
+#define STRTAB_STE_0_CFG_MASK		0x7UL
+#define STRTAB_STE_0_CFG_ABORT		(0UL << STRTAB_STE_0_CFG_SHIFT)
+#define STRTAB_STE_0_CFG_BYPASS		(4UL << STRTAB_STE_0_CFG_SHIFT)
+#define STRTAB_STE_0_CFG_S1_TRANS	(5UL << STRTAB_STE_0_CFG_SHIFT)
+#define STRTAB_STE_0_CFG_S2_TRANS	(6UL << STRTAB_STE_0_CFG_SHIFT)
+
+#define STRTAB_STE_0_S1FMT_SHIFT	4
+#define STRTAB_STE_0_S1FMT_LINEAR	(0UL << STRTAB_STE_0_S1FMT_SHIFT)
+#define STRTAB_STE_0_S1CTXPTR_SHIFT	6
+#define STRTAB_STE_0_S1CTXPTR_MASK	0x3ffffffffffUL
+#define STRTAB_STE_0_S1CDMAX_SHIFT	59
+#define STRTAB_STE_0_S1CDMAX_MASK	0x1fUL
+
+#define STRTAB_STE_1_S1C_CACHE_NC	0UL
+#define STRTAB_STE_1_S1C_CACHE_WBRA	1UL
+#define STRTAB_STE_1_S1C_CACHE_WT	2UL
+#define STRTAB_STE_1_S1C_CACHE_WB	3UL
+#define STRTAB_STE_1_S1C_SH_NSH		0UL
+#define STRTAB_STE_1_S1C_SH_OSH		2UL
+#define STRTAB_STE_1_S1C_SH_ISH		3UL
+#define STRTAB_STE_1_S1CIR_SHIFT	2
+#define STRTAB_STE_1_S1COR_SHIFT	4
+#define STRTAB_STE_1_S1CSH_SHIFT	6
+
+#define STRTAB_STE_1_S1STALLD		(1UL << 27)
+
+#define STRTAB_STE_1_EATS_ABT		0UL
+#define STRTAB_STE_1_EATS_TRANS		1UL
+#define STRTAB_STE_1_EATS_S1CHK		2UL
+#define STRTAB_STE_1_EATS_SHIFT		28
+
+#define STRTAB_STE_1_STRW_NSEL1		0UL
+#define STRTAB_STE_1_STRW_EL2		2UL
+#define STRTAB_STE_1_STRW_SHIFT		30
+
+#define STRTAB_STE_2_S2VMID_SHIFT	0
+#define STRTAB_STE_2_S2VMID_MASK	0xffffUL
+#define STRTAB_STE_2_VTCR_SHIFT		32
+#define STRTAB_STE_2_VTCR_MASK		0x7ffffUL
+#define STRTAB_STE_2_S2AA64		(1UL << 51)
+#define STRTAB_STE_2_S2ENDI		(1UL << 52)
+#define STRTAB_STE_2_S2PTW		(1UL << 54)
+#define STRTAB_STE_2_S2R		(1UL << 58)
+
+#define STRTAB_STE_3_S2TTB_SHIFT	4
+#define STRTAB_STE_3_S2TTB_MASK		0xfffffffffffUL
+
+/* Context descriptor (stage-1 only) */
+#define CTXDESC_CD_DWORDS		8
+#define CTXDESC_CD_0_TCR_T0SZ_SHIFT	0
+#define ARM64_TCR_T0SZ_SHIFT		0
+#define ARM64_TCR_T0SZ_MASK		0x1fUL
+#define CTXDESC_CD_0_TCR_TG0_SHIFT	6
+#define ARM64_TCR_TG0_SHIFT		14
+#define ARM64_TCR_TG0_MASK		0x3UL
+#define CTXDESC_CD_0_TCR_IRGN0_SHIFT	8
+#define ARM64_TCR_IRGN0_SHIFT		8
+#define ARM64_TCR_IRGN0_MASK		0x3UL
+#define CTXDESC_CD_0_TCR_ORGN0_SHIFT	10
+#define ARM64_TCR_ORGN0_SHIFT		10
+#define ARM64_TCR_ORGN0_MASK		0x3UL
+#define CTXDESC_CD_0_TCR_SH0_SHIFT	12
+#define ARM64_TCR_SH0_SHIFT		12
+#define ARM64_TCR_SH0_MASK		0x3UL
+#define CTXDESC_CD_0_TCR_EPD0_SHIFT	14
+#define ARM64_TCR_EPD0_SHIFT		7
+#define ARM64_TCR_EPD0_MASK		0x1UL
+#define CTXDESC_CD_0_TCR_EPD1_SHIFT	30
+#define ARM64_TCR_EPD1_SHIFT		23
+#define ARM64_TCR_EPD1_MASK		0x1UL
+
+#define CTXDESC_CD_0_ENDI		(1UL << 15)
+#define CTXDESC_CD_0_V			(1UL << 31)
+
+#define CTXDESC_CD_0_TCR_IPS_SHIFT	32
+#define ARM64_TCR_IPS_SHIFT		32
+#define ARM64_TCR_IPS_MASK		0x7UL
+#define CTXDESC_CD_0_TCR_TBI0_SHIFT	38
+#define ARM64_TCR_TBI0_SHIFT		37
+#define ARM64_TCR_TBI0_MASK		0x1UL
+
+#define CTXDESC_CD_0_AA64		(1UL << 41)
+#define CTXDESC_CD_0_R			(1UL << 45)
+#define CTXDESC_CD_0_A			(1UL << 46)
+#define CTXDESC_CD_0_ASET_SHIFT		47
+#define CTXDESC_CD_0_ASET_SHARED	(0UL << CTXDESC_CD_0_ASET_SHIFT)
+#define CTXDESC_CD_0_ASET_PRIVATE	(1UL << CTXDESC_CD_0_ASET_SHIFT)
+#define CTXDESC_CD_0_ASID_SHIFT		48
+#define CTXDESC_CD_0_ASID_MASK		0xffffUL
+
+#define CTXDESC_CD_1_TTB0_SHIFT		4
+#define CTXDESC_CD_1_TTB0_MASK		0xfffffffffffUL
+
+#define CTXDESC_CD_3_MAIR_SHIFT		0
+
+/* Convert between AArch64 (CPU) TCR format and SMMU CD format */
+#define ARM_SMMU_TCR2CD(tcr, fld)					\
+	(((tcr) >> ARM64_TCR_##fld##_SHIFT & ARM64_TCR_##fld##_MASK)	\
+	 << CTXDESC_CD_0_TCR_##fld##_SHIFT)
+
+/* Command queue */
+#define CMDQ_ENT_DWORDS			2
+#define CMDQ_MAX_SZ_SHIFT		8
+
+#define CMDQ_ERR_SHIFT			24
+#define CMDQ_ERR_MASK			0x7f
+#define CMDQ_ERR_CERROR_NONE_IDX	0
+#define CMDQ_ERR_CERROR_ILL_IDX		1
+#define CMDQ_ERR_CERROR_ABT_IDX		2
+
+#define CMDQ_0_OP_SHIFT			0
+#define CMDQ_0_OP_MASK			0xffUL
+#define CMDQ_0_SSV			(1UL << 11)
+
+#define CMDQ_PREFETCH_0_SID_SHIFT	32
+#define CMDQ_PREFETCH_1_SIZE_SHIFT	0
+#define CMDQ_PREFETCH_1_ADDR_MASK	~0xfffUL
+
+#define CMDQ_CFGI_0_SID_SHIFT		32
+#define CMDQ_CFGI_0_SID_MASK		0xffffffffUL
+#define CMDQ_CFGI_1_LEAF		(1UL << 0)
+#define CMDQ_CFGI_1_RANGE_SHIFT		0
+#define CMDQ_CFGI_1_RANGE_MASK		0x1fUL
+
+#define CMDQ_TLBI_0_VMID_SHIFT		32
+#define CMDQ_TLBI_0_ASID_SHIFT		48
+#define CMDQ_TLBI_1_LEAF		(1UL << 0)
+#define CMDQ_TLBI_1_VA_MASK		~0xfffUL
+#define CMDQ_TLBI_1_IPA_MASK		0xfffffffff000UL
+
+#define CMDQ_PRI_0_SSID_SHIFT		12
+#define CMDQ_PRI_0_SSID_MASK		0xfffffUL
+#define CMDQ_PRI_0_SID_SHIFT		32
+#define CMDQ_PRI_0_SID_MASK		0xffffffffUL
+#define CMDQ_PRI_1_GRPID_SHIFT		0
+#define CMDQ_PRI_1_GRPID_MASK		0x1ffUL
+#define CMDQ_PRI_1_RESP_SHIFT		12
+#define CMDQ_PRI_1_RESP_DENY		(0UL << CMDQ_PRI_1_RESP_SHIFT)
+#define CMDQ_PRI_1_RESP_FAIL		(1UL << CMDQ_PRI_1_RESP_SHIFT)
+#define CMDQ_PRI_1_RESP_SUCC		(2UL << CMDQ_PRI_1_RESP_SHIFT)
+
+#define CMDQ_SYNC_0_CS_SHIFT		12
+#define CMDQ_SYNC_0_CS_NONE		(0UL << CMDQ_SYNC_0_CS_SHIFT)
+#define CMDQ_SYNC_0_CS_SEV		(2UL << CMDQ_SYNC_0_CS_SHIFT)
+
+/* Event queue */
+#define EVTQ_ENT_DWORDS			4
+#define EVTQ_MAX_SZ_SHIFT		7
+
+#define EVTQ_0_ID_SHIFT			0
+#define EVTQ_0_ID_MASK			0xffUL
+
+/* PRI queue */
+#define PRIQ_ENT_DWORDS			2
+#define PRIQ_MAX_SZ_SHIFT		8
+
+#define PRIQ_0_SID_SHIFT		0
+#define PRIQ_0_SID_MASK			0xffffffffUL
+#define PRIQ_0_SSID_SHIFT		32
+#define PRIQ_0_SSID_MASK		0xfffffUL
+#define PRIQ_0_OF			(1UL << 57)
+#define PRIQ_0_PERM_PRIV		(1UL << 58)
+#define PRIQ_0_PERM_EXEC		(1UL << 59)
+#define PRIQ_0_PERM_READ		(1UL << 60)
+#define PRIQ_0_PERM_WRITE		(1UL << 61)
+#define PRIQ_0_PRG_LAST			(1UL << 62)
+#define PRIQ_0_SSID_V			(1UL << 63)
+
+#define PRIQ_1_PRG_IDX_SHIFT		0
+#define PRIQ_1_PRG_IDX_MASK		0x1ffUL
+#define PRIQ_1_ADDR_SHIFT		12
+#define PRIQ_1_ADDR_MASK		0xfffffffffffffUL
+
+/* High-level queue structures */
+#define ARM_SMMU_POLL_TIMEOUT_US	100
+
+static bool disable_bypass;
+module_param_named(disable_bypass, disable_bypass, bool, S_IRUGO);
+MODULE_PARM_DESC(disable_bypass,
+	"Disable bypass streams such that incoming transactions from devices that are not attached to an iommu domain will report an abort back to the device and will not be allowed to pass through the SMMU.");
+
+enum pri_resp {
+	PRI_RESP_DENY,
+	PRI_RESP_FAIL,
+	PRI_RESP_SUCC,
+};
+
+struct arm_smmu_cmdq_ent {
+	/* Common fields */
+	u8				opcode;
+	bool				substream_valid;
+
+	/* Command-specific fields */
+	union {
+		#define CMDQ_OP_PREFETCH_CFG	0x1
+		struct {
+			u32			sid;
+			u8			size;
+			u64			addr;
+		} prefetch;
+
+		#define CMDQ_OP_CFGI_STE	0x3
+		#define CMDQ_OP_CFGI_ALL	0x4
+		struct {
+			u32			sid;
+			union {
+				bool		leaf;
+				u8		span;
+			};
+		} cfgi;
+
+		#define CMDQ_OP_TLBI_NH_ASID	0x11
+		#define CMDQ_OP_TLBI_NH_VA	0x12
+		#define CMDQ_OP_TLBI_EL2_ALL	0x20
+		#define CMDQ_OP_TLBI_S12_VMALL	0x28
+		#define CMDQ_OP_TLBI_S2_IPA	0x2a
+		#define CMDQ_OP_TLBI_NSNH_ALL	0x30
+		struct {
+			u16			asid;
+			u16			vmid;
+			bool			leaf;
+			u64			addr;
+		} tlbi;
+
+		#define CMDQ_OP_PRI_RESP	0x41
+		struct {
+			u32			sid;
+			u32			ssid;
+			u16			grpid;
+			enum pri_resp		resp;
+		} pri;
+
+		#define CMDQ_OP_CMD_SYNC	0x46
+	};
+};
+
+struct arm_smmu_queue {
+	int				irq; /* Wired interrupt */
+
+	__le64				*base;
+	dma_addr_t			base_dma;
+	u64				q_base;
+
+	size_t				ent_dwords;
+	u32				max_n_shift;
+	u32				prod;
+	u32				cons;
+
+	u32 __iomem			*prod_reg;
+	u32 __iomem			*cons_reg;
+};
+
+struct arm_smmu_cmdq {
+	struct arm_smmu_queue		q;
+	spinlock_t			lock;
+};
+
+struct arm_smmu_evtq {
+	struct arm_smmu_queue		q;
+	u32				max_stalls;
+};
+
+struct arm_smmu_priq {
+	struct arm_smmu_queue		q;
+};
+
+/* High-level stream table and context descriptor structures */
+struct arm_smmu_strtab_l1_desc {
+	u8				span;
+
+	__le64				*l2ptr;
+	dma_addr_t			l2ptr_dma;
+};
+
+struct arm_smmu_s1_cfg {
+	__le64				*cdptr;
+	dma_addr_t			cdptr_dma;
+
+	struct arm_smmu_ctx_desc {
+		u16	asid;
+		u64	ttbr;
+		u64	tcr;
+		u64	mair;
+	}				cd;
+};
+
+struct arm_smmu_s2_cfg {
+	u16				vmid;
+	u64				vttbr;
+	u64				vtcr;
+};
+
+struct arm_smmu_strtab_ent {
+	bool				valid;
+
+	bool				bypass;	/* Overrides s1/s2 config */
+	struct arm_smmu_s1_cfg		*s1_cfg;
+	struct arm_smmu_s2_cfg		*s2_cfg;
+};
+
+struct arm_smmu_strtab_cfg {
+	__le64				*strtab;
+	dma_addr_t			strtab_dma;
+	struct arm_smmu_strtab_l1_desc	*l1_desc;
+	unsigned int			num_l1_ents;
+
+	u64				strtab_base;
+	u32				strtab_base_cfg;
+};
+
+/* An SMMUv3 instance */
+struct arm_smmu_device {
+	struct device			*dev;
+	void __iomem			*base;
+
+#define ARM_SMMU_FEAT_2_LVL_STRTAB	(1 << 0)
+#define ARM_SMMU_FEAT_2_LVL_CDTAB	(1 << 1)
+#define ARM_SMMU_FEAT_TT_LE		(1 << 2)
+#define ARM_SMMU_FEAT_TT_BE		(1 << 3)
+#define ARM_SMMU_FEAT_PRI		(1 << 4)
+#define ARM_SMMU_FEAT_ATS		(1 << 5)
+#define ARM_SMMU_FEAT_SEV		(1 << 6)
+#define ARM_SMMU_FEAT_MSI		(1 << 7)
+#define ARM_SMMU_FEAT_COHERENCY		(1 << 8)
+#define ARM_SMMU_FEAT_TRANS_S1		(1 << 9)
+#define ARM_SMMU_FEAT_TRANS_S2		(1 << 10)
+#define ARM_SMMU_FEAT_STALLS		(1 << 11)
+#define ARM_SMMU_FEAT_HYP		(1 << 12)
+	u32				features;
+
+#define ARM_SMMU_OPT_SKIP_PREFETCH	(1 << 0)
+	u32				options;
+
+	struct arm_smmu_cmdq		cmdq;
+	struct arm_smmu_evtq		evtq;
+	struct arm_smmu_priq		priq;
+
+	int				gerr_irq;
+
+	unsigned long			ias; /* IPA */
+	unsigned long			oas; /* PA */
+
+#define ARM_SMMU_MAX_ASIDS		(1 << 16)
+	unsigned int			asid_bits;
+	DECLARE_BITMAP(asid_map, ARM_SMMU_MAX_ASIDS);
+
+#define ARM_SMMU_MAX_VMIDS		(1 << 16)
+	unsigned int			vmid_bits;
+	DECLARE_BITMAP(vmid_map, ARM_SMMU_MAX_VMIDS);
+
+	unsigned int			ssid_bits;
+	unsigned int			sid_bits;
+
+	struct arm_smmu_strtab_cfg	strtab_cfg;
+};
+
+/* SMMU private data for an IOMMU group */
+struct arm_smmu_group {
+	struct arm_smmu_device		*smmu;
+	struct arm_smmu_domain		*domain;
+	int				num_sids;
+	u32				*sids;
+	struct arm_smmu_strtab_ent	ste;
+};
+
+/* SMMU private data for an IOMMU domain */
+enum arm_smmu_domain_stage {
+	ARM_SMMU_DOMAIN_S1 = 0,
+	ARM_SMMU_DOMAIN_S2,
+	ARM_SMMU_DOMAIN_NESTED,
+};
+
+struct arm_smmu_domain {
+	struct arm_smmu_device		*smmu;
+	struct mutex			init_mutex; /* Protects smmu pointer */
+
+	struct io_pgtable_ops		*pgtbl_ops;
+	spinlock_t			pgtbl_lock;
+
+	enum arm_smmu_domain_stage	stage;
+	union {
+		struct arm_smmu_s1_cfg	s1_cfg;
+		struct arm_smmu_s2_cfg	s2_cfg;
+	};
+
+	struct iommu_domain		domain;
+};
+
+struct arm_smmu_option_prop {
+	u32 opt;
+	const char *prop;
+};
+
+static struct arm_smmu_option_prop arm_smmu_options[] = {
+	{ ARM_SMMU_OPT_SKIP_PREFETCH, "hisilicon,broken-prefetch-cmd" },
+	{ 0, NULL},
+};
+
+static struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom)
+{
+	return container_of(dom, struct arm_smmu_domain, domain);
+}
+
+static void parse_driver_options(struct arm_smmu_device *smmu)
+{
+	int i = 0;
+
+	do {
+		if (of_property_read_bool(smmu->dev->of_node,
+						arm_smmu_options[i].prop)) {
+			smmu->options |= arm_smmu_options[i].opt;
+			dev_notice(smmu->dev, "option %s\n",
+				arm_smmu_options[i].prop);
+		}
+	} while (arm_smmu_options[++i].opt);
+}
+
+/* Low-level queue manipulation functions */
+static bool queue_full(struct arm_smmu_queue *q)
+{
+	return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) &&
+	       Q_WRP(q, q->prod) != Q_WRP(q, q->cons);
+}
+
+static bool queue_empty(struct arm_smmu_queue *q)
+{
+	return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) &&
+	       Q_WRP(q, q->prod) == Q_WRP(q, q->cons);
+}
+
+static void queue_sync_cons(struct arm_smmu_queue *q)
+{
+	q->cons = readl_relaxed(q->cons_reg);
+}
+
+static void queue_inc_cons(struct arm_smmu_queue *q)
+{
+	u32 cons = (Q_WRP(q, q->cons) | Q_IDX(q, q->cons)) + 1;
+
+	q->cons = Q_OVF(q, q->cons) | Q_WRP(q, cons) | Q_IDX(q, cons);
+	writel(q->cons, q->cons_reg);
+}
+
+static int queue_sync_prod(struct arm_smmu_queue *q)
+{
+	int ret = 0;
+	u32 prod = readl_relaxed(q->prod_reg);
+
+	if (Q_OVF(q, prod) != Q_OVF(q, q->prod))
+		ret = -EOVERFLOW;
+
+	q->prod = prod;
+	return ret;
+}
+
+static void queue_inc_prod(struct arm_smmu_queue *q)
+{
+	u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + 1;
+
+	q->prod = Q_OVF(q, q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
+	writel(q->prod, q->prod_reg);
+}
+
+static bool __queue_cons_before(struct arm_smmu_queue *q, u32 until)
+{
+	if (Q_WRP(q, q->cons) == Q_WRP(q, until))
+		return Q_IDX(q, q->cons) < Q_IDX(q, until);
+
+	return Q_IDX(q, q->cons) >= Q_IDX(q, until);
+}
+
+static int queue_poll_cons(struct arm_smmu_queue *q, u32 until, bool wfe)
+{
+	ktime_t timeout = ktime_add_us(ktime_get(), ARM_SMMU_POLL_TIMEOUT_US);
+
+	while (queue_sync_cons(q), __queue_cons_before(q, until)) {
+		if (ktime_compare(ktime_get(), timeout) > 0)
+			return -ETIMEDOUT;
+
+		if (wfe) {
+			wfe();
+		} else {
+			cpu_relax();
+			udelay(1);
+		}
+	}
+
+	return 0;
+}
+
+static void queue_write(__le64 *dst, u64 *src, size_t n_dwords)
+{
+	int i;
+
+	for (i = 0; i < n_dwords; ++i)
+		*dst++ = cpu_to_le64(*src++);
+}
+
+static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent)
+{
+	if (queue_full(q))
+		return -ENOSPC;
+
+	queue_write(Q_ENT(q, q->prod), ent, q->ent_dwords);
+	queue_inc_prod(q);
+	return 0;
+}
+
+static void queue_read(__le64 *dst, u64 *src, size_t n_dwords)
+{
+	int i;
+
+	for (i = 0; i < n_dwords; ++i)
+		*dst++ = le64_to_cpu(*src++);
+}
+
+static int queue_remove_raw(struct arm_smmu_queue *q, u64 *ent)
+{
+	if (queue_empty(q))
+		return -EAGAIN;
+
+	queue_read(ent, Q_ENT(q, q->cons), q->ent_dwords);
+	queue_inc_cons(q);
+	return 0;
+}
+
+/* High-level queue accessors */
+static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
+{
+	memset(cmd, 0, CMDQ_ENT_DWORDS << 3);
+	cmd[0] |= (ent->opcode & CMDQ_0_OP_MASK) << CMDQ_0_OP_SHIFT;
+
+	switch (ent->opcode) {
+	case CMDQ_OP_TLBI_EL2_ALL:
+	case CMDQ_OP_TLBI_NSNH_ALL:
+		break;
+	case CMDQ_OP_PREFETCH_CFG:
+		cmd[0] |= (u64)ent->prefetch.sid << CMDQ_PREFETCH_0_SID_SHIFT;
+		cmd[1] |= ent->prefetch.size << CMDQ_PREFETCH_1_SIZE_SHIFT;
+		cmd[1] |= ent->prefetch.addr & CMDQ_PREFETCH_1_ADDR_MASK;
+		break;
+	case CMDQ_OP_CFGI_STE:
+		cmd[0] |= (u64)ent->cfgi.sid << CMDQ_CFGI_0_SID_SHIFT;
+		cmd[1] |= ent->cfgi.leaf ? CMDQ_CFGI_1_LEAF : 0;
+		break;
+	case CMDQ_OP_CFGI_ALL:
+		/* Cover the entire SID range */
+		cmd[1] |= CMDQ_CFGI_1_RANGE_MASK << CMDQ_CFGI_1_RANGE_SHIFT;
+		break;
+	case CMDQ_OP_TLBI_NH_VA:
+		cmd[0] |= (u64)ent->tlbi.asid << CMDQ_TLBI_0_ASID_SHIFT;
+		cmd[1] |= ent->tlbi.leaf ? CMDQ_TLBI_1_LEAF : 0;
+		cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_VA_MASK;
+		break;
+	case CMDQ_OP_TLBI_S2_IPA:
+		cmd[0] |= (u64)ent->tlbi.vmid << CMDQ_TLBI_0_VMID_SHIFT;
+		cmd[1] |= ent->tlbi.leaf ? CMDQ_TLBI_1_LEAF : 0;
+		cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_IPA_MASK;
+		break;
+	case CMDQ_OP_TLBI_NH_ASID:
+		cmd[0] |= (u64)ent->tlbi.asid << CMDQ_TLBI_0_ASID_SHIFT;
+		/* Fallthrough */
+	case CMDQ_OP_TLBI_S12_VMALL:
+		cmd[0] |= (u64)ent->tlbi.vmid << CMDQ_TLBI_0_VMID_SHIFT;
+		break;
+	case CMDQ_OP_PRI_RESP:
+		cmd[0] |= ent->substream_valid ? CMDQ_0_SSV : 0;
+		cmd[0] |= ent->pri.ssid << CMDQ_PRI_0_SSID_SHIFT;
+		cmd[0] |= (u64)ent->pri.sid << CMDQ_PRI_0_SID_SHIFT;
+		cmd[1] |= ent->pri.grpid << CMDQ_PRI_1_GRPID_SHIFT;
+		switch (ent->pri.resp) {
+		case PRI_RESP_DENY:
+			cmd[1] |= CMDQ_PRI_1_RESP_DENY;
+			break;
+		case PRI_RESP_FAIL:
+			cmd[1] |= CMDQ_PRI_1_RESP_FAIL;
+			break;
+		case PRI_RESP_SUCC:
+			cmd[1] |= CMDQ_PRI_1_RESP_SUCC;
+			break;
+		default:
+			return -EINVAL;
+		}
+		break;
+	case CMDQ_OP_CMD_SYNC:
+		cmd[0] |= CMDQ_SYNC_0_CS_SEV;
+		break;
+	default:
+		return -ENOENT;
+	}
+
+	return 0;
+}
+
+static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
+{
+	static const char *cerror_str[] = {
+		[CMDQ_ERR_CERROR_NONE_IDX]	= "No error",
+		[CMDQ_ERR_CERROR_ILL_IDX]	= "Illegal command",
+		[CMDQ_ERR_CERROR_ABT_IDX]	= "Abort on command fetch",
+	};
+
+	int i;
+	u64 cmd[CMDQ_ENT_DWORDS];
+	struct arm_smmu_queue *q = &smmu->cmdq.q;
+	u32 cons = readl_relaxed(q->cons_reg);
+	u32 idx = cons >> CMDQ_ERR_SHIFT & CMDQ_ERR_MASK;
+	struct arm_smmu_cmdq_ent cmd_sync = {
+		.opcode = CMDQ_OP_CMD_SYNC,
+	};
+
+	dev_err(smmu->dev, "CMDQ error (cons 0x%08x): %s\n", cons,
+		cerror_str[idx]);
+
+	switch (idx) {
+	case CMDQ_ERR_CERROR_ILL_IDX:
+		break;
+	case CMDQ_ERR_CERROR_ABT_IDX:
+		dev_err(smmu->dev, "retrying command fetch\n");
+	case CMDQ_ERR_CERROR_NONE_IDX:
+		return;
+	}
+
+	/*
+	 * We may have concurrent producers, so we need to be careful
+	 * not to touch any of the shadow cmdq state.
+	 */
+	queue_read(cmd, Q_ENT(q, idx), q->ent_dwords);
+	dev_err(smmu->dev, "skipping command in error state:\n");
+	for (i = 0; i < ARRAY_SIZE(cmd); ++i)
+		dev_err(smmu->dev, "\t0x%016llx\n", (unsigned long long)cmd[i]);
+
+	/* Convert the erroneous command into a CMD_SYNC */
+	if (arm_smmu_cmdq_build_cmd(cmd, &cmd_sync)) {
+		dev_err(smmu->dev, "failed to convert to CMD_SYNC\n");
+		return;
+	}
+
+	queue_write(cmd, Q_ENT(q, idx), q->ent_dwords);
+}
+
+static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
+				    struct arm_smmu_cmdq_ent *ent)
+{
+	u32 until;
+	u64 cmd[CMDQ_ENT_DWORDS];
+	bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
+	struct arm_smmu_queue *q = &smmu->cmdq.q;
+
+	if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
+		dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
+			 ent->opcode);
+		return;
+	}
+
+	spin_lock(&smmu->cmdq.lock);
+	while (until = q->prod + 1, queue_insert_raw(q, cmd) == -ENOSPC) {
+		/*
+		 * Keep the queue locked, otherwise the producer could wrap
+		 * twice and we could see a future consumer pointer that looks
+		 * like it's behind us.
+		 */
+		if (queue_poll_cons(q, until, wfe))
+			dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
+	}
+
+	if (ent->opcode == CMDQ_OP_CMD_SYNC && queue_poll_cons(q, until, wfe))
+		dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n");
+	spin_unlock(&smmu->cmdq.lock);
+}
+
+/* Context descriptor manipulation functions */
+static u64 arm_smmu_cpu_tcr_to_cd(u64 tcr)
+{
+	u64 val = 0;
+
+	/* Repack the TCR. Just care about TTBR0 for now */
+	val |= ARM_SMMU_TCR2CD(tcr, T0SZ);
+	val |= ARM_SMMU_TCR2CD(tcr, TG0);
+	val |= ARM_SMMU_TCR2CD(tcr, IRGN0);
+	val |= ARM_SMMU_TCR2CD(tcr, ORGN0);
+	val |= ARM_SMMU_TCR2CD(tcr, SH0);
+	val |= ARM_SMMU_TCR2CD(tcr, EPD0);
+	val |= ARM_SMMU_TCR2CD(tcr, EPD1);
+	val |= ARM_SMMU_TCR2CD(tcr, IPS);
+	val |= ARM_SMMU_TCR2CD(tcr, TBI0);
+
+	return val;
+}
+
+static void arm_smmu_write_ctx_desc(struct arm_smmu_device *smmu,
+				    struct arm_smmu_s1_cfg *cfg)
+{
+	u64 val;
+
+	/*
+	 * We don't need to issue any invalidation here, as we'll invalidate
+	 * the STE when installing the new entry anyway.
+	 */
+	val = arm_smmu_cpu_tcr_to_cd(cfg->cd.tcr) |
+#ifdef __BIG_ENDIAN
+	      CTXDESC_CD_0_ENDI |
+#endif
+	      CTXDESC_CD_0_R | CTXDESC_CD_0_A | CTXDESC_CD_0_ASET_PRIVATE |
+	      CTXDESC_CD_0_AA64 | (u64)cfg->cd.asid << CTXDESC_CD_0_ASID_SHIFT |
+	      CTXDESC_CD_0_V;
+	cfg->cdptr[0] = cpu_to_le64(val);
+
+	val = cfg->cd.ttbr & CTXDESC_CD_1_TTB0_MASK << CTXDESC_CD_1_TTB0_SHIFT;
+	cfg->cdptr[1] = cpu_to_le64(val);
+
+	cfg->cdptr[3] = cpu_to_le64(cfg->cd.mair << CTXDESC_CD_3_MAIR_SHIFT);
+}
+
+/* Stream table manipulation functions */
+static void
+arm_smmu_write_strtab_l1_desc(__le64 *dst, struct arm_smmu_strtab_l1_desc *desc)
+{
+	u64 val = 0;
+
+	val |= (desc->span & STRTAB_L1_DESC_SPAN_MASK)
+		<< STRTAB_L1_DESC_SPAN_SHIFT;
+	val |= desc->l2ptr_dma &
+	       STRTAB_L1_DESC_L2PTR_MASK << STRTAB_L1_DESC_L2PTR_SHIFT;
+
+	*dst = cpu_to_le64(val);
+}
+
+static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu, u32 sid)
+{
+	struct arm_smmu_cmdq_ent cmd = {
+		.opcode	= CMDQ_OP_CFGI_STE,
+		.cfgi	= {
+			.sid	= sid,
+			.leaf	= true,
+		},
+	};
+
+	arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+	cmd.opcode = CMDQ_OP_CMD_SYNC;
+	arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+}
+
+static void arm_smmu_write_strtab_ent(struct arm_smmu_device *smmu, u32 sid,
+				      __le64 *dst, struct arm_smmu_strtab_ent *ste)
+{
+	/*
+	 * This is hideously complicated, but we only really care about
+	 * three cases at the moment:
+	 *
+	 * 1. Invalid (all zero) -> bypass  (init)
+	 * 2. Bypass -> translation (attach)
+	 * 3. Translation -> bypass (detach)
+	 *
+	 * Given that we can't update the STE atomically and the SMMU
+	 * doesn't read the thing in a defined order, that leaves us
+	 * with the following maintenance requirements:
+	 *
+	 * 1. Update Config, return (init time STEs aren't live)
+	 * 2. Write everything apart from dword 0, sync, write dword 0, sync
+	 * 3. Update Config, sync
+	 */
+	u64 val = le64_to_cpu(dst[0]);
+	bool ste_live = false;
+	struct arm_smmu_cmdq_ent prefetch_cmd = {
+		.opcode		= CMDQ_OP_PREFETCH_CFG,
+		.prefetch	= {
+			.sid	= sid,
+		},
+	};
+
+	if (val & STRTAB_STE_0_V) {
+		u64 cfg;
+
+		cfg = val & STRTAB_STE_0_CFG_MASK << STRTAB_STE_0_CFG_SHIFT;
+		switch (cfg) {
+		case STRTAB_STE_0_CFG_BYPASS:
+			break;
+		case STRTAB_STE_0_CFG_S1_TRANS:
+		case STRTAB_STE_0_CFG_S2_TRANS:
+			ste_live = true;
+			break;
+		default:
+			BUG(); /* STE corruption */
+		}
+	}
+
+	/* Nuke the existing Config, as we're going to rewrite it */
+	val &= ~(STRTAB_STE_0_CFG_MASK << STRTAB_STE_0_CFG_SHIFT);
+
+	if (ste->valid)
+		val |= STRTAB_STE_0_V;
+	else
+		val &= ~STRTAB_STE_0_V;
+
+	if (ste->bypass) {
+		val |= disable_bypass ? STRTAB_STE_0_CFG_ABORT
+				      : STRTAB_STE_0_CFG_BYPASS;
+		dst[0] = cpu_to_le64(val);
+		dst[2] = 0; /* Nuke the VMID */
+		if (ste_live)
+			arm_smmu_sync_ste_for_sid(smmu, sid);
+		return;
+	}
+
+	if (ste->s1_cfg) {
+		BUG_ON(ste_live);
+		dst[1] = cpu_to_le64(
+			 STRTAB_STE_1_S1C_CACHE_WBRA
+			 << STRTAB_STE_1_S1CIR_SHIFT |
+			 STRTAB_STE_1_S1C_CACHE_WBRA
+			 << STRTAB_STE_1_S1COR_SHIFT |
+			 STRTAB_STE_1_S1C_SH_ISH << STRTAB_STE_1_S1CSH_SHIFT |
+			 STRTAB_STE_1_S1STALLD |
+#ifdef CONFIG_PCI_ATS
+			 STRTAB_STE_1_EATS_TRANS << STRTAB_STE_1_EATS_SHIFT |
+#endif
+			 STRTAB_STE_1_STRW_NSEL1 << STRTAB_STE_1_STRW_SHIFT);
+
+		val |= (ste->s1_cfg->cdptr_dma & STRTAB_STE_0_S1CTXPTR_MASK
+		        << STRTAB_STE_0_S1CTXPTR_SHIFT) |
+			STRTAB_STE_0_CFG_S1_TRANS;
+
+	}
+
+	if (ste->s2_cfg) {
+		BUG_ON(ste_live);
+		dst[2] = cpu_to_le64(
+			 ste->s2_cfg->vmid << STRTAB_STE_2_S2VMID_SHIFT |
+			 (ste->s2_cfg->vtcr & STRTAB_STE_2_VTCR_MASK)
+			  << STRTAB_STE_2_VTCR_SHIFT |
+#ifdef __BIG_ENDIAN
+			 STRTAB_STE_2_S2ENDI |
+#endif
+			 STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2AA64 |
+			 STRTAB_STE_2_S2R);
+
+		dst[3] = cpu_to_le64(ste->s2_cfg->vttbr &
+			 STRTAB_STE_3_S2TTB_MASK << STRTAB_STE_3_S2TTB_SHIFT);
+
+		val |= STRTAB_STE_0_CFG_S2_TRANS;
+	}
+
+	arm_smmu_sync_ste_for_sid(smmu, sid);
+	dst[0] = cpu_to_le64(val);
+	arm_smmu_sync_ste_for_sid(smmu, sid);
+
+	/* It's likely that we'll want to use the new STE soon */
+	if (!(smmu->options & ARM_SMMU_OPT_SKIP_PREFETCH))
+		arm_smmu_cmdq_issue_cmd(smmu, &prefetch_cmd);
+}
+
+static void arm_smmu_init_bypass_stes(u64 *strtab, unsigned int nent)
+{
+	unsigned int i;
+	struct arm_smmu_strtab_ent ste = {
+		.valid	= true,
+		.bypass	= true,
+	};
+
+	for (i = 0; i < nent; ++i) {
+		arm_smmu_write_strtab_ent(NULL, -1, strtab, &ste);
+		strtab += STRTAB_STE_DWORDS;
+	}
+}
+
+static int arm_smmu_init_l2_strtab(struct arm_smmu_device *smmu, u32 sid)
+{
+	size_t size;
+	void *strtab;
+	struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
+	struct arm_smmu_strtab_l1_desc *desc = &cfg->l1_desc[sid >> STRTAB_SPLIT];
+
+	if (desc->l2ptr)
+		return 0;
+
+	size = 1 << (STRTAB_SPLIT + ilog2(STRTAB_STE_DWORDS) + 3);
+	strtab = &cfg->strtab[(sid >> STRTAB_SPLIT) * STRTAB_L1_DESC_DWORDS];
+
+	desc->span = STRTAB_SPLIT + 1;
+	desc->l2ptr = dma_zalloc_coherent(smmu->dev, size, &desc->l2ptr_dma,
+					  GFP_KERNEL);
+	if (!desc->l2ptr) {
+		dev_err(smmu->dev,
+			"failed to allocate l2 stream table for SID %u\n",
+			sid);
+		return -ENOMEM;
+	}
+
+	arm_smmu_init_bypass_stes(desc->l2ptr, 1 << STRTAB_SPLIT);
+	arm_smmu_write_strtab_l1_desc(strtab, desc);
+	return 0;
+}
+
+/* IRQ and event handlers */
+static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev)
+{
+	int i;
+	struct arm_smmu_device *smmu = dev;
+	struct arm_smmu_queue *q = &smmu->evtq.q;
+	u64 evt[EVTQ_ENT_DWORDS];
+
+	while (!queue_remove_raw(q, evt)) {
+		u8 id = evt[0] >> EVTQ_0_ID_SHIFT & EVTQ_0_ID_MASK;
+
+		dev_info(smmu->dev, "event 0x%02x received:\n", id);
+		for (i = 0; i < ARRAY_SIZE(evt); ++i)
+			dev_info(smmu->dev, "\t0x%016llx\n",
+				 (unsigned long long)evt[i]);
+	}
+
+	/* Sync our overflow flag, as we believe we're up to speed */
+	q->cons = Q_OVF(q, q->prod) | Q_WRP(q, q->cons) | Q_IDX(q, q->cons);
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t arm_smmu_evtq_handler(int irq, void *dev)
+{
+	irqreturn_t ret = IRQ_WAKE_THREAD;
+	struct arm_smmu_device *smmu = dev;
+	struct arm_smmu_queue *q = &smmu->evtq.q;
+
+	/*
+	 * Not much we can do on overflow, so scream and pretend we're
+	 * trying harder.
+	 */
+	if (queue_sync_prod(q) == -EOVERFLOW)
+		dev_err(smmu->dev, "EVTQ overflow detected -- events lost\n");
+	else if (queue_empty(q))
+		ret = IRQ_NONE;
+
+	return ret;
+}
+
+static irqreturn_t arm_smmu_priq_thread(int irq, void *dev)
+{
+	struct arm_smmu_device *smmu = dev;
+	struct arm_smmu_queue *q = &smmu->priq.q;
+	u64 evt[PRIQ_ENT_DWORDS];
+
+	while (!queue_remove_raw(q, evt)) {
+		u32 sid, ssid;
+		u16 grpid;
+		bool ssv, last;
+
+		sid = evt[0] >> PRIQ_0_SID_SHIFT & PRIQ_0_SID_MASK;
+		ssv = evt[0] & PRIQ_0_SSID_V;
+		ssid = ssv ? evt[0] >> PRIQ_0_SSID_SHIFT & PRIQ_0_SSID_MASK : 0;
+		last = evt[0] & PRIQ_0_PRG_LAST;
+		grpid = evt[1] >> PRIQ_1_PRG_IDX_SHIFT & PRIQ_1_PRG_IDX_MASK;
+
+		dev_info(smmu->dev, "unexpected PRI request received:\n");
+		dev_info(smmu->dev,
+			 "\tsid 0x%08x.0x%05x: [%u%s] %sprivileged %s%s%s access at iova 0x%016llx\n",
+			 sid, ssid, grpid, last ? "L" : "",
+			 evt[0] & PRIQ_0_PERM_PRIV ? "" : "un",
+			 evt[0] & PRIQ_0_PERM_READ ? "R" : "",
+			 evt[0] & PRIQ_0_PERM_WRITE ? "W" : "",
+			 evt[0] & PRIQ_0_PERM_EXEC ? "X" : "",
+			 evt[1] & PRIQ_1_ADDR_MASK << PRIQ_1_ADDR_SHIFT);
+
+		if (last) {
+			struct arm_smmu_cmdq_ent cmd = {
+				.opcode			= CMDQ_OP_PRI_RESP,
+				.substream_valid	= ssv,
+				.pri			= {
+					.sid	= sid,
+					.ssid	= ssid,
+					.grpid	= grpid,
+					.resp	= PRI_RESP_DENY,
+				},
+			};
+
+			arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+		}
+	}
+
+	/* Sync our overflow flag, as we believe we're up to speed */
+	q->cons = Q_OVF(q, q->prod) | Q_WRP(q, q->cons) | Q_IDX(q, q->cons);
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t arm_smmu_priq_handler(int irq, void *dev)
+{
+	irqreturn_t ret = IRQ_WAKE_THREAD;
+	struct arm_smmu_device *smmu = dev;
+	struct arm_smmu_queue *q = &smmu->priq.q;
+
+	/* PRIQ overflow indicates a programming error */
+	if (queue_sync_prod(q) == -EOVERFLOW)
+		dev_err(smmu->dev, "PRIQ overflow detected -- requests lost\n");
+	else if (queue_empty(q))
+		ret = IRQ_NONE;
+
+	return ret;
+}
+
+static irqreturn_t arm_smmu_cmdq_sync_handler(int irq, void *dev)
+{
+	/* We don't actually use CMD_SYNC interrupts for anything */
+	return IRQ_HANDLED;
+}
+
+static int arm_smmu_device_disable(struct arm_smmu_device *smmu);
+
+static irqreturn_t arm_smmu_gerror_handler(int irq, void *dev)
+{
+	u32 gerror, gerrorn;
+	struct arm_smmu_device *smmu = dev;
+
+	gerror = readl_relaxed(smmu->base + ARM_SMMU_GERROR);
+	gerrorn = readl_relaxed(smmu->base + ARM_SMMU_GERRORN);
+
+	gerror ^= gerrorn;
+	if (!(gerror & GERROR_ERR_MASK))
+		return IRQ_NONE; /* No errors pending */
+
+	dev_warn(smmu->dev,
+		 "unexpected global error reported (0x%08x), this could be serious\n",
+		 gerror);
+
+	if (gerror & GERROR_SFM_ERR) {
+		dev_err(smmu->dev, "device has entered Service Failure Mode!\n");
+		arm_smmu_device_disable(smmu);
+	}
+
+	if (gerror & GERROR_MSI_GERROR_ABT_ERR)
+		dev_warn(smmu->dev, "GERROR MSI write aborted\n");
+
+	if (gerror & GERROR_MSI_PRIQ_ABT_ERR) {
+		dev_warn(smmu->dev, "PRIQ MSI write aborted\n");
+		arm_smmu_priq_handler(irq, smmu->dev);
+	}
+
+	if (gerror & GERROR_MSI_EVTQ_ABT_ERR) {
+		dev_warn(smmu->dev, "EVTQ MSI write aborted\n");
+		arm_smmu_evtq_handler(irq, smmu->dev);
+	}
+
+	if (gerror & GERROR_MSI_CMDQ_ABT_ERR) {
+		dev_warn(smmu->dev, "CMDQ MSI write aborted\n");
+		arm_smmu_cmdq_sync_handler(irq, smmu->dev);
+	}
+
+	if (gerror & GERROR_PRIQ_ABT_ERR)
+		dev_err(smmu->dev, "PRIQ write aborted -- events may have been lost\n");
+
+	if (gerror & GERROR_EVTQ_ABT_ERR)
+		dev_err(smmu->dev, "EVTQ write aborted -- events may have been lost\n");
+
+	if (gerror & GERROR_CMDQ_ERR)
+		arm_smmu_cmdq_skip_err(smmu);
+
+	writel(gerror, smmu->base + ARM_SMMU_GERRORN);
+	return IRQ_HANDLED;
+}
+
+/* IO_PGTABLE API */
+static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu)
+{
+	struct arm_smmu_cmdq_ent cmd;
+
+	cmd.opcode = CMDQ_OP_CMD_SYNC;
+	arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+}
+
+static void arm_smmu_tlb_sync(void *cookie)
+{
+	struct arm_smmu_domain *smmu_domain = cookie;
+	__arm_smmu_tlb_sync(smmu_domain->smmu);
+}
+
+static void arm_smmu_tlb_inv_context(void *cookie)
+{
+	struct arm_smmu_domain *smmu_domain = cookie;
+	struct arm_smmu_device *smmu = smmu_domain->smmu;
+	struct arm_smmu_cmdq_ent cmd;
+
+	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
+		cmd.opcode	= CMDQ_OP_TLBI_NH_ASID;
+		cmd.tlbi.asid	= smmu_domain->s1_cfg.cd.asid;
+		cmd.tlbi.vmid	= 0;
+	} else {
+		cmd.opcode	= CMDQ_OP_TLBI_S12_VMALL;
+		cmd.tlbi.vmid	= smmu_domain->s2_cfg.vmid;
+	}
+
+	arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+	__arm_smmu_tlb_sync(smmu);
+}
+
+static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
+					  bool leaf, void *cookie)
+{
+	struct arm_smmu_domain *smmu_domain = cookie;
+	struct arm_smmu_device *smmu = smmu_domain->smmu;
+	struct arm_smmu_cmdq_ent cmd = {
+		.tlbi = {
+			.leaf	= leaf,
+			.addr	= iova,
+		},
+	};
+
+	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
+		cmd.opcode	= CMDQ_OP_TLBI_NH_VA;
+		cmd.tlbi.asid	= smmu_domain->s1_cfg.cd.asid;
+	} else {
+		cmd.opcode	= CMDQ_OP_TLBI_S2_IPA;
+		cmd.tlbi.vmid	= smmu_domain->s2_cfg.vmid;
+	}
+
+	arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+}
+
+static struct iommu_gather_ops arm_smmu_gather_ops = {
+	.tlb_flush_all	= arm_smmu_tlb_inv_context,
+	.tlb_add_flush	= arm_smmu_tlb_inv_range_nosync,
+	.tlb_sync	= arm_smmu_tlb_sync,
+};
+
+/* IOMMU API */
+static bool arm_smmu_capable(enum iommu_cap cap)
+{
+	switch (cap) {
+	case IOMMU_CAP_CACHE_COHERENCY:
+		return true;
+	case IOMMU_CAP_INTR_REMAP:
+		return true; /* MSIs are just memory writes */
+	case IOMMU_CAP_NOEXEC:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static struct iommu_domain *arm_smmu_domain_alloc(unsigned type)
+{
+	struct arm_smmu_domain *smmu_domain;
+
+	if (type != IOMMU_DOMAIN_UNMANAGED)
+		return NULL;
+
+	/*
+	 * Allocate the domain and initialise some of its data structures.
+	 * We can't really do anything meaningful until we've added a
+	 * master.
+	 */
+	smmu_domain = kzalloc(sizeof(*smmu_domain), GFP_KERNEL);
+	if (!smmu_domain)
+		return NULL;
+
+	mutex_init(&smmu_domain->init_mutex);
+	spin_lock_init(&smmu_domain->pgtbl_lock);
+	return &smmu_domain->domain;
+}
+
+static int arm_smmu_bitmap_alloc(unsigned long *map, int span)
+{
+	int idx, size = 1 << span;
+
+	do {
+		idx = find_first_zero_bit(map, size);
+		if (idx == size)
+			return -ENOSPC;
+	} while (test_and_set_bit(idx, map));
+
+	return idx;
+}
+
+static void arm_smmu_bitmap_free(unsigned long *map, int idx)
+{
+	clear_bit(idx, map);
+}
+
+static void arm_smmu_domain_free(struct iommu_domain *domain)
+{
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+	struct arm_smmu_device *smmu = smmu_domain->smmu;
+
+	free_io_pgtable_ops(smmu_domain->pgtbl_ops);
+
+	/* Free the CD and ASID, if we allocated them */
+	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
+		struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg;
+
+		if (cfg->cdptr) {
+			dma_free_coherent(smmu_domain->smmu->dev,
+					  CTXDESC_CD_DWORDS << 3,
+					  cfg->cdptr,
+					  cfg->cdptr_dma);
+
+			arm_smmu_bitmap_free(smmu->asid_map, cfg->cd.asid);
+		}
+	} else {
+		struct arm_smmu_s2_cfg *cfg = &smmu_domain->s2_cfg;
+		if (cfg->vmid)
+			arm_smmu_bitmap_free(smmu->vmid_map, cfg->vmid);
+	}
+
+	kfree(smmu_domain);
+}
+
+static int arm_smmu_domain_finalise_s1(struct arm_smmu_domain *smmu_domain,
+				       struct io_pgtable_cfg *pgtbl_cfg)
+{
+	int ret;
+	int asid;
+	struct arm_smmu_device *smmu = smmu_domain->smmu;
+	struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg;
+
+	asid = arm_smmu_bitmap_alloc(smmu->asid_map, smmu->asid_bits);
+	if (IS_ERR_VALUE(asid))
+		return asid;
+
+	cfg->cdptr = dma_zalloc_coherent(smmu->dev, CTXDESC_CD_DWORDS << 3,
+					 &cfg->cdptr_dma, GFP_KERNEL);
+	if (!cfg->cdptr) {
+		dev_warn(smmu->dev, "failed to allocate context descriptor\n");
+		ret = -ENOMEM;
+		goto out_free_asid;
+	}
+
+	cfg->cd.asid	= (u16)asid;
+	cfg->cd.ttbr	= pgtbl_cfg->arm_lpae_s1_cfg.ttbr[0];
+	cfg->cd.tcr	= pgtbl_cfg->arm_lpae_s1_cfg.tcr;
+	cfg->cd.mair	= pgtbl_cfg->arm_lpae_s1_cfg.mair[0];
+	return 0;
+
+out_free_asid:
+	arm_smmu_bitmap_free(smmu->asid_map, asid);
+	return ret;
+}
+
+static int arm_smmu_domain_finalise_s2(struct arm_smmu_domain *smmu_domain,
+				       struct io_pgtable_cfg *pgtbl_cfg)
+{
+	int vmid;
+	struct arm_smmu_device *smmu = smmu_domain->smmu;
+	struct arm_smmu_s2_cfg *cfg = &smmu_domain->s2_cfg;
+
+	vmid = arm_smmu_bitmap_alloc(smmu->vmid_map, smmu->vmid_bits);
+	if (IS_ERR_VALUE(vmid))
+		return vmid;
+
+	cfg->vmid	= (u16)vmid;
+	cfg->vttbr	= pgtbl_cfg->arm_lpae_s2_cfg.vttbr;
+	cfg->vtcr	= pgtbl_cfg->arm_lpae_s2_cfg.vtcr;
+	return 0;
+}
+
+static struct iommu_ops arm_smmu_ops;
+
+static int arm_smmu_domain_finalise(struct iommu_domain *domain)
+{
+	int ret;
+	unsigned long ias, oas;
+	enum io_pgtable_fmt fmt;
+	struct io_pgtable_cfg pgtbl_cfg;
+	struct io_pgtable_ops *pgtbl_ops;
+	int (*finalise_stage_fn)(struct arm_smmu_domain *,
+				 struct io_pgtable_cfg *);
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+	struct arm_smmu_device *smmu = smmu_domain->smmu;
+
+	/* Restrict the stage to what we can actually support */
+	if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S1))
+		smmu_domain->stage = ARM_SMMU_DOMAIN_S2;
+	if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S2))
+		smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
+
+	switch (smmu_domain->stage) {
+	case ARM_SMMU_DOMAIN_S1:
+		ias = VA_BITS;
+		oas = smmu->ias;
+		fmt = ARM_64_LPAE_S1;
+		finalise_stage_fn = arm_smmu_domain_finalise_s1;
+		break;
+	case ARM_SMMU_DOMAIN_NESTED:
+	case ARM_SMMU_DOMAIN_S2:
+		ias = smmu->ias;
+		oas = smmu->oas;
+		fmt = ARM_64_LPAE_S2;
+		finalise_stage_fn = arm_smmu_domain_finalise_s2;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	pgtbl_cfg = (struct io_pgtable_cfg) {
+		.pgsize_bitmap	= arm_smmu_ops.pgsize_bitmap,
+		.ias		= ias,
+		.oas		= oas,
+		.tlb		= &arm_smmu_gather_ops,
+		.iommu_dev	= smmu->dev,
+	};
+
+	pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
+	if (!pgtbl_ops)
+		return -ENOMEM;
+
+	arm_smmu_ops.pgsize_bitmap = pgtbl_cfg.pgsize_bitmap;
+	smmu_domain->pgtbl_ops = pgtbl_ops;
+
+	ret = finalise_stage_fn(smmu_domain, &pgtbl_cfg);
+	if (IS_ERR_VALUE(ret))
+		free_io_pgtable_ops(pgtbl_ops);
+
+	return ret;
+}
+
+static struct arm_smmu_group *arm_smmu_group_get(struct device *dev)
+{
+	struct iommu_group *group;
+	struct arm_smmu_group *smmu_group;
+
+	group = iommu_group_get(dev);
+	if (!group)
+		return NULL;
+
+	smmu_group = iommu_group_get_iommudata(group);
+	iommu_group_put(group);
+	return smmu_group;
+}
+
+static __le64 *arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid)
+{
+	__le64 *step;
+	struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
+
+	if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) {
+		struct arm_smmu_strtab_l1_desc *l1_desc;
+		int idx;
+
+		/* Two-level walk */
+		idx = (sid >> STRTAB_SPLIT) * STRTAB_L1_DESC_DWORDS;
+		l1_desc = &cfg->l1_desc[idx];
+		idx = (sid & ((1 << STRTAB_SPLIT) - 1)) * STRTAB_STE_DWORDS;
+		step = &l1_desc->l2ptr[idx];
+	} else {
+		/* Simple linear lookup */
+		step = &cfg->strtab[sid * STRTAB_STE_DWORDS];
+	}
+
+	return step;
+}
+
+static int arm_smmu_install_ste_for_group(struct arm_smmu_group *smmu_group)
+{
+	int i;
+	struct arm_smmu_domain *smmu_domain = smmu_group->domain;
+	struct arm_smmu_strtab_ent *ste = &smmu_group->ste;
+	struct arm_smmu_device *smmu = smmu_group->smmu;
+
+	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
+		ste->s1_cfg = &smmu_domain->s1_cfg;
+		ste->s2_cfg = NULL;
+		arm_smmu_write_ctx_desc(smmu, ste->s1_cfg);
+	} else {
+		ste->s1_cfg = NULL;
+		ste->s2_cfg = &smmu_domain->s2_cfg;
+	}
+
+	for (i = 0; i < smmu_group->num_sids; ++i) {
+		u32 sid = smmu_group->sids[i];
+		__le64 *step = arm_smmu_get_step_for_sid(smmu, sid);
+
+		arm_smmu_write_strtab_ent(smmu, sid, step, ste);
+	}
+
+	return 0;
+}
+
+static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
+{
+	int ret = 0;
+	struct arm_smmu_device *smmu;
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+	struct arm_smmu_group *smmu_group = arm_smmu_group_get(dev);
+
+	if (!smmu_group)
+		return -ENOENT;
+
+	/* Already attached to a different domain? */
+	if (smmu_group->domain && smmu_group->domain != smmu_domain)
+		return -EEXIST;
+
+	smmu = smmu_group->smmu;
+	mutex_lock(&smmu_domain->init_mutex);
+
+	if (!smmu_domain->smmu) {
+		smmu_domain->smmu = smmu;
+		ret = arm_smmu_domain_finalise(domain);
+		if (ret) {
+			smmu_domain->smmu = NULL;
+			goto out_unlock;
+		}
+	} else if (smmu_domain->smmu != smmu) {
+		dev_err(dev,
+			"cannot attach to SMMU %s (upstream of %s)\n",
+			dev_name(smmu_domain->smmu->dev),
+			dev_name(smmu->dev));
+		ret = -ENXIO;
+		goto out_unlock;
+	}
+
+	/* Group already attached to this domain? */
+	if (smmu_group->domain)
+		goto out_unlock;
+
+	smmu_group->domain	= smmu_domain;
+	smmu_group->ste.bypass	= false;
+
+	ret = arm_smmu_install_ste_for_group(smmu_group);
+	if (IS_ERR_VALUE(ret))
+		smmu_group->domain = NULL;
+
+out_unlock:
+	mutex_unlock(&smmu_domain->init_mutex);
+	return ret;
+}
+
+static void arm_smmu_detach_dev(struct iommu_domain *domain, struct device *dev)
+{
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+	struct arm_smmu_group *smmu_group = arm_smmu_group_get(dev);
+
+	BUG_ON(!smmu_domain);
+	BUG_ON(!smmu_group);
+
+	mutex_lock(&smmu_domain->init_mutex);
+	BUG_ON(smmu_group->domain != smmu_domain);
+
+	smmu_group->ste.bypass = true;
+	if (IS_ERR_VALUE(arm_smmu_install_ste_for_group(smmu_group)))
+		dev_warn(dev, "failed to install bypass STE\n");
+
+	smmu_group->domain = NULL;
+	mutex_unlock(&smmu_domain->init_mutex);
+}
+
+static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
+			phys_addr_t paddr, size_t size, int prot)
+{
+	int ret;
+	unsigned long flags;
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+	struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
+
+	if (!ops)
+		return -ENODEV;
+
+	spin_lock_irqsave(&smmu_domain->pgtbl_lock, flags);
+	ret = ops->map(ops, iova, paddr, size, prot);
+	spin_unlock_irqrestore(&smmu_domain->pgtbl_lock, flags);
+	return ret;
+}
+
+static size_t
+arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size)
+{
+	size_t ret;
+	unsigned long flags;
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+	struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
+
+	if (!ops)
+		return 0;
+
+	spin_lock_irqsave(&smmu_domain->pgtbl_lock, flags);
+	ret = ops->unmap(ops, iova, size);
+	spin_unlock_irqrestore(&smmu_domain->pgtbl_lock, flags);
+	return ret;
+}
+
+static phys_addr_t
+arm_smmu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova)
+{
+	phys_addr_t ret;
+	unsigned long flags;
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+	struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
+
+	if (!ops)
+		return 0;
+
+	spin_lock_irqsave(&smmu_domain->pgtbl_lock, flags);
+	ret = ops->iova_to_phys(ops, iova);
+	spin_unlock_irqrestore(&smmu_domain->pgtbl_lock, flags);
+
+	return ret;
+}
+
+static int __arm_smmu_get_pci_sid(struct pci_dev *pdev, u16 alias, void *sidp)
+{
+	*(u32 *)sidp = alias;
+	return 0; /* Continue walking */
+}
+
+static void __arm_smmu_release_pci_iommudata(void *data)
+{
+	kfree(data);
+}
+
+static struct arm_smmu_device *arm_smmu_get_for_pci_dev(struct pci_dev *pdev)
+{
+	struct device_node *of_node;
+	struct platform_device *smmu_pdev;
+	struct arm_smmu_device *smmu = NULL;
+	struct pci_bus *bus = pdev->bus;
+
+	/* Walk up to the root bus */
+	while (!pci_is_root_bus(bus))
+		bus = bus->parent;
+
+	/* Follow the "iommus" phandle from the host controller */
+	of_node = of_parse_phandle(bus->bridge->parent->of_node, "iommus", 0);
+	if (!of_node)
+		return NULL;
+
+	/* See if we can find an SMMU corresponding to the phandle */
+	smmu_pdev = of_find_device_by_node(of_node);
+	if (smmu_pdev)
+		smmu = platform_get_drvdata(smmu_pdev);
+
+	of_node_put(of_node);
+	return smmu;
+}
+
+static bool arm_smmu_sid_in_range(struct arm_smmu_device *smmu, u32 sid)
+{
+	unsigned long limit = smmu->strtab_cfg.num_l1_ents;
+
+	if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB)
+		limit *= 1UL << STRTAB_SPLIT;
+
+	return sid < limit;
+}
+
+static int arm_smmu_add_device(struct device *dev)
+{
+	int i, ret;
+	u32 sid, *sids;
+	struct pci_dev *pdev;
+	struct iommu_group *group;
+	struct arm_smmu_group *smmu_group;
+	struct arm_smmu_device *smmu;
+
+	/* We only support PCI, for now */
+	if (!dev_is_pci(dev))
+		return -ENODEV;
+
+	pdev = to_pci_dev(dev);
+	group = iommu_group_get_for_dev(dev);
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	smmu_group = iommu_group_get_iommudata(group);
+	if (!smmu_group) {
+		smmu = arm_smmu_get_for_pci_dev(pdev);
+		if (!smmu) {
+			ret = -ENOENT;
+			goto out_put_group;
+		}
+
+		smmu_group = kzalloc(sizeof(*smmu_group), GFP_KERNEL);
+		if (!smmu_group) {
+			ret = -ENOMEM;
+			goto out_put_group;
+		}
+
+		smmu_group->ste.valid	= true;
+		smmu_group->smmu	= smmu;
+		iommu_group_set_iommudata(group, smmu_group,
+					  __arm_smmu_release_pci_iommudata);
+	} else {
+		smmu = smmu_group->smmu;
+	}
+
+	/* Assume SID == RID until firmware tells us otherwise */
+	pci_for_each_dma_alias(pdev, __arm_smmu_get_pci_sid, &sid);
+	for (i = 0; i < smmu_group->num_sids; ++i) {
+		/* If we already know about this SID, then we're done */
+		if (smmu_group->sids[i] == sid)
+			return 0;
+	}
+
+	/* Check the SID is in range of the SMMU and our stream table */
+	if (!arm_smmu_sid_in_range(smmu, sid)) {
+		ret = -ERANGE;
+		goto out_put_group;
+	}
+
+	/* Ensure l2 strtab is initialised */
+	if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) {
+		ret = arm_smmu_init_l2_strtab(smmu, sid);
+		if (ret)
+			goto out_put_group;
+	}
+
+	/* Resize the SID array for the group */
+	smmu_group->num_sids++;
+	sids = krealloc(smmu_group->sids, smmu_group->num_sids * sizeof(*sids),
+			GFP_KERNEL);
+	if (!sids) {
+		smmu_group->num_sids--;
+		ret = -ENOMEM;
+		goto out_put_group;
+	}
+
+	/* Add the new SID */
+	sids[smmu_group->num_sids - 1] = sid;
+	smmu_group->sids = sids;
+	return 0;
+
+out_put_group:
+	iommu_group_put(group);
+	return ret;
+}
+
+static void arm_smmu_remove_device(struct device *dev)
+{
+	iommu_group_remove_device(dev);
+}
+
+static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
+				    enum iommu_attr attr, void *data)
+{
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+
+	switch (attr) {
+	case DOMAIN_ATTR_NESTING:
+		*(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED);
+		return 0;
+	default:
+		return -ENODEV;
+	}
+}
+
+static int arm_smmu_domain_set_attr(struct iommu_domain *domain,
+				    enum iommu_attr attr, void *data)
+{
+	int ret = 0;
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+
+	mutex_lock(&smmu_domain->init_mutex);
+
+	switch (attr) {
+	case DOMAIN_ATTR_NESTING:
+		if (smmu_domain->smmu) {
+			ret = -EPERM;
+			goto out_unlock;
+		}
+
+		if (*(int *)data)
+			smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED;
+		else
+			smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
+
+		break;
+	default:
+		ret = -ENODEV;
+	}
+
+out_unlock:
+	mutex_unlock(&smmu_domain->init_mutex);
+	return ret;
+}
+
+static struct iommu_ops arm_smmu_ops = {
+	.capable		= arm_smmu_capable,
+	.domain_alloc		= arm_smmu_domain_alloc,
+	.domain_free		= arm_smmu_domain_free,
+	.attach_dev		= arm_smmu_attach_dev,
+	.detach_dev		= arm_smmu_detach_dev,
+	.map			= arm_smmu_map,
+	.unmap			= arm_smmu_unmap,
+	.iova_to_phys		= arm_smmu_iova_to_phys,
+	.add_device		= arm_smmu_add_device,
+	.remove_device		= arm_smmu_remove_device,
+	.domain_get_attr	= arm_smmu_domain_get_attr,
+	.domain_set_attr	= arm_smmu_domain_set_attr,
+	.pgsize_bitmap		= -1UL, /* Restricted during device attach */
+};
+
+/* Probing and initialisation functions */
+static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
+				   struct arm_smmu_queue *q,
+				   unsigned long prod_off,
+				   unsigned long cons_off,
+				   size_t dwords)
+{
+	size_t qsz = ((1 << q->max_n_shift) * dwords) << 3;
+
+	q->base = dma_alloc_coherent(smmu->dev, qsz, &q->base_dma, GFP_KERNEL);
+	if (!q->base) {
+		dev_err(smmu->dev, "failed to allocate queue (0x%zx bytes)\n",
+			qsz);
+		return -ENOMEM;
+	}
+
+	q->prod_reg	= smmu->base + prod_off;
+	q->cons_reg	= smmu->base + cons_off;
+	q->ent_dwords	= dwords;
+
+	q->q_base  = Q_BASE_RWA;
+	q->q_base |= q->base_dma & Q_BASE_ADDR_MASK << Q_BASE_ADDR_SHIFT;
+	q->q_base |= (q->max_n_shift & Q_BASE_LOG2SIZE_MASK)
+		     << Q_BASE_LOG2SIZE_SHIFT;
+
+	q->prod = q->cons = 0;
+	return 0;
+}
+
+static void arm_smmu_free_one_queue(struct arm_smmu_device *smmu,
+				    struct arm_smmu_queue *q)
+{
+	size_t qsz = ((1 << q->max_n_shift) * q->ent_dwords) << 3;
+
+	dma_free_coherent(smmu->dev, qsz, q->base, q->base_dma);
+}
+
+static void arm_smmu_free_queues(struct arm_smmu_device *smmu)
+{
+	arm_smmu_free_one_queue(smmu, &smmu->cmdq.q);
+	arm_smmu_free_one_queue(smmu, &smmu->evtq.q);
+
+	if (smmu->features & ARM_SMMU_FEAT_PRI)
+		arm_smmu_free_one_queue(smmu, &smmu->priq.q);
+}
+
+static int arm_smmu_init_queues(struct arm_smmu_device *smmu)
+{
+	int ret;
+
+	/* cmdq */
+	spin_lock_init(&smmu->cmdq.lock);
+	ret = arm_smmu_init_one_queue(smmu, &smmu->cmdq.q, ARM_SMMU_CMDQ_PROD,
+				      ARM_SMMU_CMDQ_CONS, CMDQ_ENT_DWORDS);
+	if (ret)
+		goto out;
+
+	/* evtq */
+	ret = arm_smmu_init_one_queue(smmu, &smmu->evtq.q, ARM_SMMU_EVTQ_PROD,
+				      ARM_SMMU_EVTQ_CONS, EVTQ_ENT_DWORDS);
+	if (ret)
+		goto out_free_cmdq;
+
+	/* priq */
+	if (!(smmu->features & ARM_SMMU_FEAT_PRI))
+		return 0;
+
+	ret = arm_smmu_init_one_queue(smmu, &smmu->priq.q, ARM_SMMU_PRIQ_PROD,
+				      ARM_SMMU_PRIQ_CONS, PRIQ_ENT_DWORDS);
+	if (ret)
+		goto out_free_evtq;
+
+	return 0;
+
+out_free_evtq:
+	arm_smmu_free_one_queue(smmu, &smmu->evtq.q);
+out_free_cmdq:
+	arm_smmu_free_one_queue(smmu, &smmu->cmdq.q);
+out:
+	return ret;
+}
+
+static void arm_smmu_free_l2_strtab(struct arm_smmu_device *smmu)
+{
+	int i;
+	size_t size;
+	struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
+
+	size = 1 << (STRTAB_SPLIT + ilog2(STRTAB_STE_DWORDS) + 3);
+	for (i = 0; i < cfg->num_l1_ents; ++i) {
+		struct arm_smmu_strtab_l1_desc *desc = &cfg->l1_desc[i];
+
+		if (!desc->l2ptr)
+			continue;
+
+		dma_free_coherent(smmu->dev, size, desc->l2ptr,
+				  desc->l2ptr_dma);
+	}
+}
+
+static int arm_smmu_init_l1_strtab(struct arm_smmu_device *smmu)
+{
+	unsigned int i;
+	struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
+	size_t size = sizeof(*cfg->l1_desc) * cfg->num_l1_ents;
+	void *strtab = smmu->strtab_cfg.strtab;
+
+	cfg->l1_desc = devm_kzalloc(smmu->dev, size, GFP_KERNEL);
+	if (!cfg->l1_desc) {
+		dev_err(smmu->dev, "failed to allocate l1 stream table desc\n");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < cfg->num_l1_ents; ++i) {
+		arm_smmu_write_strtab_l1_desc(strtab, &cfg->l1_desc[i]);
+		strtab += STRTAB_L1_DESC_DWORDS << 3;
+	}
+
+	return 0;
+}
+
+static int arm_smmu_init_strtab_2lvl(struct arm_smmu_device *smmu)
+{
+	void *strtab;
+	u64 reg;
+	u32 size, l1size;
+	int ret;
+	struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
+
+	/*
+	 * If we can resolve everything with a single L2 table, then we
+	 * just need a single L1 descriptor. Otherwise, calculate the L1
+	 * size, capped to the SIDSIZE.
+	 */
+	if (smmu->sid_bits < STRTAB_SPLIT) {
+		size = 0;
+	} else {
+		size = STRTAB_L1_SZ_SHIFT - (ilog2(STRTAB_L1_DESC_DWORDS) + 3);
+		size = min(size, smmu->sid_bits - STRTAB_SPLIT);
+	}
+	cfg->num_l1_ents = 1 << size;
+
+	size += STRTAB_SPLIT;
+	if (size < smmu->sid_bits)
+		dev_warn(smmu->dev,
+			 "2-level strtab only covers %u/%u bits of SID\n",
+			 size, smmu->sid_bits);
+
+	l1size = cfg->num_l1_ents * (STRTAB_L1_DESC_DWORDS << 3);
+	strtab = dma_zalloc_coherent(smmu->dev, l1size, &cfg->strtab_dma,
+				     GFP_KERNEL);
+	if (!strtab) {
+		dev_err(smmu->dev,
+			"failed to allocate l1 stream table (%u bytes)\n",
+			size);
+		return -ENOMEM;
+	}
+	cfg->strtab = strtab;
+
+	/* Configure strtab_base_cfg for 2 levels */
+	reg  = STRTAB_BASE_CFG_FMT_2LVL;
+	reg |= (size & STRTAB_BASE_CFG_LOG2SIZE_MASK)
+		<< STRTAB_BASE_CFG_LOG2SIZE_SHIFT;
+	reg |= (STRTAB_SPLIT & STRTAB_BASE_CFG_SPLIT_MASK)
+		<< STRTAB_BASE_CFG_SPLIT_SHIFT;
+	cfg->strtab_base_cfg = reg;
+
+	ret = arm_smmu_init_l1_strtab(smmu);
+	if (ret)
+		dma_free_coherent(smmu->dev,
+				  l1size,
+				  strtab,
+				  cfg->strtab_dma);
+	return ret;
+}
+
+static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu)
+{
+	void *strtab;
+	u64 reg;
+	u32 size;
+	struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
+
+	size = (1 << smmu->sid_bits) * (STRTAB_STE_DWORDS << 3);
+	strtab = dma_zalloc_coherent(smmu->dev, size, &cfg->strtab_dma,
+				     GFP_KERNEL);
+	if (!strtab) {
+		dev_err(smmu->dev,
+			"failed to allocate linear stream table (%u bytes)\n",
+			size);
+		return -ENOMEM;
+	}
+	cfg->strtab = strtab;
+	cfg->num_l1_ents = 1 << smmu->sid_bits;
+
+	/* Configure strtab_base_cfg for a linear table covering all SIDs */
+	reg  = STRTAB_BASE_CFG_FMT_LINEAR;
+	reg |= (smmu->sid_bits & STRTAB_BASE_CFG_LOG2SIZE_MASK)
+		<< STRTAB_BASE_CFG_LOG2SIZE_SHIFT;
+	cfg->strtab_base_cfg = reg;
+
+	arm_smmu_init_bypass_stes(strtab, cfg->num_l1_ents);
+	return 0;
+}
+
+static int arm_smmu_init_strtab(struct arm_smmu_device *smmu)
+{
+	u64 reg;
+	int ret;
+
+	if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB)
+		ret = arm_smmu_init_strtab_2lvl(smmu);
+	else
+		ret = arm_smmu_init_strtab_linear(smmu);
+
+	if (ret)
+		return ret;
+
+	/* Set the strtab base address */
+	reg  = smmu->strtab_cfg.strtab_dma &
+	       STRTAB_BASE_ADDR_MASK << STRTAB_BASE_ADDR_SHIFT;
+	reg |= STRTAB_BASE_RA;
+	smmu->strtab_cfg.strtab_base = reg;
+
+	/* Allocate the first VMID for stage-2 bypass STEs */
+	set_bit(0, smmu->vmid_map);
+	return 0;
+}
+
+static void arm_smmu_free_strtab(struct arm_smmu_device *smmu)
+{
+	struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
+	u32 size = cfg->num_l1_ents;
+
+	if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) {
+		arm_smmu_free_l2_strtab(smmu);
+		size *= STRTAB_L1_DESC_DWORDS << 3;
+	} else {
+		size *= STRTAB_STE_DWORDS * 3;
+	}
+
+	dma_free_coherent(smmu->dev, size, cfg->strtab, cfg->strtab_dma);
+}
+
+static int arm_smmu_init_structures(struct arm_smmu_device *smmu)
+{
+	int ret;
+
+	ret = arm_smmu_init_queues(smmu);
+	if (ret)
+		return ret;
+
+	ret = arm_smmu_init_strtab(smmu);
+	if (ret)
+		goto out_free_queues;
+
+	return 0;
+
+out_free_queues:
+	arm_smmu_free_queues(smmu);
+	return ret;
+}
+
+static void arm_smmu_free_structures(struct arm_smmu_device *smmu)
+{
+	arm_smmu_free_strtab(smmu);
+	arm_smmu_free_queues(smmu);
+}
+
+static int arm_smmu_write_reg_sync(struct arm_smmu_device *smmu, u32 val,
+				   unsigned int reg_off, unsigned int ack_off)
+{
+	u32 reg;
+
+	writel_relaxed(val, smmu->base + reg_off);
+	return readl_relaxed_poll_timeout(smmu->base + ack_off, reg, reg == val,
+					  1, ARM_SMMU_POLL_TIMEOUT_US);
+}
+
+static int arm_smmu_setup_irqs(struct arm_smmu_device *smmu)
+{
+	int ret, irq;
+	u32 irqen_flags = IRQ_CTRL_EVTQ_IRQEN | IRQ_CTRL_GERROR_IRQEN;
+
+	/* Disable IRQs first */
+	ret = arm_smmu_write_reg_sync(smmu, 0, ARM_SMMU_IRQ_CTRL,
+				      ARM_SMMU_IRQ_CTRLACK);
+	if (ret) {
+		dev_err(smmu->dev, "failed to disable irqs\n");
+		return ret;
+	}
+
+	/* Clear the MSI address regs */
+	writeq_relaxed(0, smmu->base + ARM_SMMU_GERROR_IRQ_CFG0);
+	writeq_relaxed(0, smmu->base + ARM_SMMU_EVTQ_IRQ_CFG0);
+
+	/* Request wired interrupt lines */
+	irq = smmu->evtq.q.irq;
+	if (irq) {
+		ret = devm_request_threaded_irq(smmu->dev, irq,
+						arm_smmu_evtq_handler,
+						arm_smmu_evtq_thread,
+						0, "arm-smmu-v3-evtq", smmu);
+		if (IS_ERR_VALUE(ret))
+			dev_warn(smmu->dev, "failed to enable evtq irq\n");
+	}
+
+	irq = smmu->cmdq.q.irq;
+	if (irq) {
+		ret = devm_request_irq(smmu->dev, irq,
+				       arm_smmu_cmdq_sync_handler, 0,
+				       "arm-smmu-v3-cmdq-sync", smmu);
+		if (IS_ERR_VALUE(ret))
+			dev_warn(smmu->dev, "failed to enable cmdq-sync irq\n");
+	}
+
+	irq = smmu->gerr_irq;
+	if (irq) {
+		ret = devm_request_irq(smmu->dev, irq, arm_smmu_gerror_handler,
+				       0, "arm-smmu-v3-gerror", smmu);
+		if (IS_ERR_VALUE(ret))
+			dev_warn(smmu->dev, "failed to enable gerror irq\n");
+	}
+
+	if (smmu->features & ARM_SMMU_FEAT_PRI) {
+		writeq_relaxed(0, smmu->base + ARM_SMMU_PRIQ_IRQ_CFG0);
+
+		irq = smmu->priq.q.irq;
+		if (irq) {
+			ret = devm_request_threaded_irq(smmu->dev, irq,
+							arm_smmu_priq_handler,
+							arm_smmu_priq_thread,
+							0, "arm-smmu-v3-priq",
+							smmu);
+			if (IS_ERR_VALUE(ret))
+				dev_warn(smmu->dev,
+					 "failed to enable priq irq\n");
+			else
+				irqen_flags |= IRQ_CTRL_PRIQ_IRQEN;
+		}
+	}
+
+	/* Enable interrupt generation on the SMMU */
+	ret = arm_smmu_write_reg_sync(smmu, irqen_flags,
+				      ARM_SMMU_IRQ_CTRL, ARM_SMMU_IRQ_CTRLACK);
+	if (ret)
+		dev_warn(smmu->dev, "failed to enable irqs\n");
+
+	return 0;
+}
+
+static int arm_smmu_device_disable(struct arm_smmu_device *smmu)
+{
+	int ret;
+
+	ret = arm_smmu_write_reg_sync(smmu, 0, ARM_SMMU_CR0, ARM_SMMU_CR0ACK);
+	if (ret)
+		dev_err(smmu->dev, "failed to clear cr0\n");
+
+	return ret;
+}
+
+static int arm_smmu_device_reset(struct arm_smmu_device *smmu)
+{
+	int ret;
+	u32 reg, enables;
+	struct arm_smmu_cmdq_ent cmd;
+
+	/* Clear CR0 and sync (disables SMMU and queue processing) */
+	reg = readl_relaxed(smmu->base + ARM_SMMU_CR0);
+	if (reg & CR0_SMMUEN)
+		dev_warn(smmu->dev, "SMMU currently enabled! Resetting...\n");
+
+	ret = arm_smmu_device_disable(smmu);
+	if (ret)
+		return ret;
+
+	/* CR1 (table and queue memory attributes) */
+	reg = (CR1_SH_ISH << CR1_TABLE_SH_SHIFT) |
+	      (CR1_CACHE_WB << CR1_TABLE_OC_SHIFT) |
+	      (CR1_CACHE_WB << CR1_TABLE_IC_SHIFT) |
+	      (CR1_SH_ISH << CR1_QUEUE_SH_SHIFT) |
+	      (CR1_CACHE_WB << CR1_QUEUE_OC_SHIFT) |
+	      (CR1_CACHE_WB << CR1_QUEUE_IC_SHIFT);
+	writel_relaxed(reg, smmu->base + ARM_SMMU_CR1);
+
+	/* CR2 (random crap) */
+	reg = CR2_PTM | CR2_RECINVSID | CR2_E2H;
+	writel_relaxed(reg, smmu->base + ARM_SMMU_CR2);
+
+	/* Stream table */
+	writeq_relaxed(smmu->strtab_cfg.strtab_base,
+		       smmu->base + ARM_SMMU_STRTAB_BASE);
+	writel_relaxed(smmu->strtab_cfg.strtab_base_cfg,
+		       smmu->base + ARM_SMMU_STRTAB_BASE_CFG);
+
+	/* Command queue */
+	writeq_relaxed(smmu->cmdq.q.q_base, smmu->base + ARM_SMMU_CMDQ_BASE);
+	writel_relaxed(smmu->cmdq.q.prod, smmu->base + ARM_SMMU_CMDQ_PROD);
+	writel_relaxed(smmu->cmdq.q.cons, smmu->base + ARM_SMMU_CMDQ_CONS);
+
+	enables = CR0_CMDQEN;
+	ret = arm_smmu_write_reg_sync(smmu, enables, ARM_SMMU_CR0,
+				      ARM_SMMU_CR0ACK);
+	if (ret) {
+		dev_err(smmu->dev, "failed to enable command queue\n");
+		return ret;
+	}
+
+	/* Invalidate any cached configuration */
+	cmd.opcode = CMDQ_OP_CFGI_ALL;
+	arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+	cmd.opcode = CMDQ_OP_CMD_SYNC;
+	arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+
+	/* Invalidate any stale TLB entries */
+	if (smmu->features & ARM_SMMU_FEAT_HYP) {
+		cmd.opcode = CMDQ_OP_TLBI_EL2_ALL;
+		arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+	}
+
+	cmd.opcode = CMDQ_OP_TLBI_NSNH_ALL;
+	arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+	cmd.opcode = CMDQ_OP_CMD_SYNC;
+	arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+
+	/* Event queue */
+	writeq_relaxed(smmu->evtq.q.q_base, smmu->base + ARM_SMMU_EVTQ_BASE);
+	writel_relaxed(smmu->evtq.q.prod, smmu->base + ARM_SMMU_EVTQ_PROD);
+	writel_relaxed(smmu->evtq.q.cons, smmu->base + ARM_SMMU_EVTQ_CONS);
+
+	enables |= CR0_EVTQEN;
+	ret = arm_smmu_write_reg_sync(smmu, enables, ARM_SMMU_CR0,
+				      ARM_SMMU_CR0ACK);
+	if (ret) {
+		dev_err(smmu->dev, "failed to enable event queue\n");
+		return ret;
+	}
+
+	/* PRI queue */
+	if (smmu->features & ARM_SMMU_FEAT_PRI) {
+		writeq_relaxed(smmu->priq.q.q_base,
+			       smmu->base + ARM_SMMU_PRIQ_BASE);
+		writel_relaxed(smmu->priq.q.prod,
+			       smmu->base + ARM_SMMU_PRIQ_PROD);
+		writel_relaxed(smmu->priq.q.cons,
+			       smmu->base + ARM_SMMU_PRIQ_CONS);
+
+		enables |= CR0_PRIQEN;
+		ret = arm_smmu_write_reg_sync(smmu, enables, ARM_SMMU_CR0,
+					      ARM_SMMU_CR0ACK);
+		if (ret) {
+			dev_err(smmu->dev, "failed to enable PRI queue\n");
+			return ret;
+		}
+	}
+
+	ret = arm_smmu_setup_irqs(smmu);
+	if (ret) {
+		dev_err(smmu->dev, "failed to setup irqs\n");
+		return ret;
+	}
+
+	/* Enable the SMMU interface */
+	enables |= CR0_SMMUEN;
+	ret = arm_smmu_write_reg_sync(smmu, enables, ARM_SMMU_CR0,
+				      ARM_SMMU_CR0ACK);
+	if (ret) {
+		dev_err(smmu->dev, "failed to enable SMMU interface\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static int arm_smmu_device_probe(struct arm_smmu_device *smmu)
+{
+	u32 reg;
+	bool coherent;
+	unsigned long pgsize_bitmap = 0;
+
+	/* IDR0 */
+	reg = readl_relaxed(smmu->base + ARM_SMMU_IDR0);
+
+	/* 2-level structures */
+	if ((reg & IDR0_ST_LVL_MASK << IDR0_ST_LVL_SHIFT) == IDR0_ST_LVL_2LVL)
+		smmu->features |= ARM_SMMU_FEAT_2_LVL_STRTAB;
+
+	if (reg & IDR0_CD2L)
+		smmu->features |= ARM_SMMU_FEAT_2_LVL_CDTAB;
+
+	/*
+	 * Translation table endianness.
+	 * We currently require the same endianness as the CPU, but this
+	 * could be changed later by adding a new IO_PGTABLE_QUIRK.
+	 */
+	switch (reg & IDR0_TTENDIAN_MASK << IDR0_TTENDIAN_SHIFT) {
+	case IDR0_TTENDIAN_MIXED:
+		smmu->features |= ARM_SMMU_FEAT_TT_LE | ARM_SMMU_FEAT_TT_BE;
+		break;
+#ifdef __BIG_ENDIAN
+	case IDR0_TTENDIAN_BE:
+		smmu->features |= ARM_SMMU_FEAT_TT_BE;
+		break;
+#else
+	case IDR0_TTENDIAN_LE:
+		smmu->features |= ARM_SMMU_FEAT_TT_LE;
+		break;
+#endif
+	default:
+		dev_err(smmu->dev, "unknown/unsupported TT endianness!\n");
+		return -ENXIO;
+	}
+
+	/* Boolean feature flags */
+	if (IS_ENABLED(CONFIG_PCI_PRI) && reg & IDR0_PRI)
+		smmu->features |= ARM_SMMU_FEAT_PRI;
+
+	if (IS_ENABLED(CONFIG_PCI_ATS) && reg & IDR0_ATS)
+		smmu->features |= ARM_SMMU_FEAT_ATS;
+
+	if (reg & IDR0_SEV)
+		smmu->features |= ARM_SMMU_FEAT_SEV;
+
+	if (reg & IDR0_MSI)
+		smmu->features |= ARM_SMMU_FEAT_MSI;
+
+	if (reg & IDR0_HYP)
+		smmu->features |= ARM_SMMU_FEAT_HYP;
+
+	/*
+	 * The dma-coherent property is used in preference to the ID
+	 * register, but warn on mismatch.
+	 */
+	coherent = of_dma_is_coherent(smmu->dev->of_node);
+	if (coherent)
+		smmu->features |= ARM_SMMU_FEAT_COHERENCY;
+
+	if (!!(reg & IDR0_COHACC) != coherent)
+		dev_warn(smmu->dev, "IDR0.COHACC overridden by dma-coherent property (%s)\n",
+			 coherent ? "true" : "false");
+
+	if (reg & IDR0_STALL_MODEL)
+		smmu->features |= ARM_SMMU_FEAT_STALLS;
+
+	if (reg & IDR0_S1P)
+		smmu->features |= ARM_SMMU_FEAT_TRANS_S1;
+
+	if (reg & IDR0_S2P)
+		smmu->features |= ARM_SMMU_FEAT_TRANS_S2;
+
+	if (!(reg & (IDR0_S1P | IDR0_S2P))) {
+		dev_err(smmu->dev, "no translation support!\n");
+		return -ENXIO;
+	}
+
+	/* We only support the AArch64 table format at present */
+	switch (reg & IDR0_TTF_MASK << IDR0_TTF_SHIFT) {
+	case IDR0_TTF_AARCH32_64:
+		smmu->ias = 40;
+		/* Fallthrough */
+	case IDR0_TTF_AARCH64:
+		break;
+	default:
+		dev_err(smmu->dev, "AArch64 table format not supported!\n");
+		return -ENXIO;
+	}
+
+	/* ASID/VMID sizes */
+	smmu->asid_bits = reg & IDR0_ASID16 ? 16 : 8;
+	smmu->vmid_bits = reg & IDR0_VMID16 ? 16 : 8;
+
+	/* IDR1 */
+	reg = readl_relaxed(smmu->base + ARM_SMMU_IDR1);
+	if (reg & (IDR1_TABLES_PRESET | IDR1_QUEUES_PRESET | IDR1_REL)) {
+		dev_err(smmu->dev, "embedded implementation not supported\n");
+		return -ENXIO;
+	}
+
+	/* Queue sizes, capped at 4k */
+	smmu->cmdq.q.max_n_shift = min((u32)CMDQ_MAX_SZ_SHIFT,
+				       reg >> IDR1_CMDQ_SHIFT & IDR1_CMDQ_MASK);
+	if (!smmu->cmdq.q.max_n_shift) {
+		/* Odd alignment restrictions on the base, so ignore for now */
+		dev_err(smmu->dev, "unit-length command queue not supported\n");
+		return -ENXIO;
+	}
+
+	smmu->evtq.q.max_n_shift = min((u32)EVTQ_MAX_SZ_SHIFT,
+				       reg >> IDR1_EVTQ_SHIFT & IDR1_EVTQ_MASK);
+	smmu->priq.q.max_n_shift = min((u32)PRIQ_MAX_SZ_SHIFT,
+				       reg >> IDR1_PRIQ_SHIFT & IDR1_PRIQ_MASK);
+
+	/* SID/SSID sizes */
+	smmu->ssid_bits = reg >> IDR1_SSID_SHIFT & IDR1_SSID_MASK;
+	smmu->sid_bits = reg >> IDR1_SID_SHIFT & IDR1_SID_MASK;
+
+	/* IDR5 */
+	reg = readl_relaxed(smmu->base + ARM_SMMU_IDR5);
+
+	/* Maximum number of outstanding stalls */
+	smmu->evtq.max_stalls = reg >> IDR5_STALL_MAX_SHIFT
+				& IDR5_STALL_MAX_MASK;
+
+	/* Page sizes */
+	if (reg & IDR5_GRAN64K)
+		pgsize_bitmap |= SZ_64K | SZ_512M;
+	if (reg & IDR5_GRAN16K)
+		pgsize_bitmap |= SZ_16K | SZ_32M;
+	if (reg & IDR5_GRAN4K)
+		pgsize_bitmap |= SZ_4K | SZ_2M | SZ_1G;
+
+	arm_smmu_ops.pgsize_bitmap &= pgsize_bitmap;
+
+	/* Output address size */
+	switch (reg & IDR5_OAS_MASK << IDR5_OAS_SHIFT) {
+	case IDR5_OAS_32_BIT:
+		smmu->oas = 32;
+		break;
+	case IDR5_OAS_36_BIT:
+		smmu->oas = 36;
+		break;
+	case IDR5_OAS_40_BIT:
+		smmu->oas = 40;
+		break;
+	case IDR5_OAS_42_BIT:
+		smmu->oas = 42;
+		break;
+	case IDR5_OAS_44_BIT:
+		smmu->oas = 44;
+		break;
+	default:
+		dev_info(smmu->dev,
+			"unknown output address size. Truncating to 48-bit\n");
+		/* Fallthrough */
+	case IDR5_OAS_48_BIT:
+		smmu->oas = 48;
+	}
+
+	/* Set the DMA mask for our table walker */
+	if (dma_set_mask_and_coherent(smmu->dev, DMA_BIT_MASK(smmu->oas)))
+		dev_warn(smmu->dev,
+			 "failed to set DMA mask for table walker\n");
+
+	smmu->ias = max(smmu->ias, smmu->oas);
+
+	dev_info(smmu->dev, "ias %lu-bit, oas %lu-bit (features 0x%08x)\n",
+		 smmu->ias, smmu->oas, smmu->features);
+	return 0;
+}
+
+static int arm_smmu_device_dt_probe(struct platform_device *pdev)
+{
+	int irq, ret;
+	struct resource *res;
+	struct arm_smmu_device *smmu;
+	struct device *dev = &pdev->dev;
+
+	smmu = devm_kzalloc(dev, sizeof(*smmu), GFP_KERNEL);
+	if (!smmu) {
+		dev_err(dev, "failed to allocate arm_smmu_device\n");
+		return -ENOMEM;
+	}
+	smmu->dev = dev;
+
+	/* Base address */
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (resource_size(res) + 1 < SZ_128K) {
+		dev_err(dev, "MMIO region too small (%pr)\n", res);
+		return -EINVAL;
+	}
+
+	smmu->base = devm_ioremap_resource(dev, res);
+	if (IS_ERR(smmu->base))
+		return PTR_ERR(smmu->base);
+
+	/* Interrupt lines */
+	irq = platform_get_irq_byname(pdev, "eventq");
+	if (irq > 0)
+		smmu->evtq.q.irq = irq;
+
+	irq = platform_get_irq_byname(pdev, "priq");
+	if (irq > 0)
+		smmu->priq.q.irq = irq;
+
+	irq = platform_get_irq_byname(pdev, "cmdq-sync");
+	if (irq > 0)
+		smmu->cmdq.q.irq = irq;
+
+	irq = platform_get_irq_byname(pdev, "gerror");
+	if (irq > 0)
+		smmu->gerr_irq = irq;
+
+	parse_driver_options(smmu);
+
+	/* Probe the h/w */
+	ret = arm_smmu_device_probe(smmu);
+	if (ret)
+		return ret;
+
+	/* Initialise in-memory data structures */
+	ret = arm_smmu_init_structures(smmu);
+	if (ret)
+		return ret;
+
+	/* Reset the device */
+	ret = arm_smmu_device_reset(smmu);
+	if (ret)
+		goto out_free_structures;
+
+	/* Record our private device structure */
+	platform_set_drvdata(pdev, smmu);
+	return 0;
+
+out_free_structures:
+	arm_smmu_free_structures(smmu);
+	return ret;
+}
+
+static int arm_smmu_device_remove(struct platform_device *pdev)
+{
+	struct arm_smmu_device *smmu = platform_get_drvdata(pdev);
+
+	arm_smmu_device_disable(smmu);
+	arm_smmu_free_structures(smmu);
+	return 0;
+}
+
+static struct of_device_id arm_smmu_of_match[] = {
+	{ .compatible = "arm,smmu-v3", },
+	{ },
+};
+MODULE_DEVICE_TABLE(of, arm_smmu_of_match);
+
+static struct platform_driver arm_smmu_driver = {
+	.driver	= {
+		.name		= "arm-smmu-v3",
+		.of_match_table	= of_match_ptr(arm_smmu_of_match),
+	},
+	.probe	= arm_smmu_device_dt_probe,
+	.remove	= arm_smmu_device_remove,
+};
+
+static int __init arm_smmu_init(void)
+{
+	struct device_node *np;
+	int ret;
+
+	np = of_find_matching_node(NULL, arm_smmu_of_match);
+	if (!np)
+		return 0;
+
+	of_node_put(np);
+
+	ret = platform_driver_register(&arm_smmu_driver);
+	if (ret)
+		return ret;
+
+	return bus_set_iommu(&pci_bus_type, &arm_smmu_ops);
+}
+
+static void __exit arm_smmu_exit(void)
+{
+	return platform_driver_unregister(&arm_smmu_driver);
+}
+
+subsys_initcall(arm_smmu_init);
+module_exit(arm_smmu_exit);
+
+MODULE_DESCRIPTION("IOMMU API for ARM architected SMMUv3 implementations");
+MODULE_AUTHOR("Will Deacon <will.deacon@arm.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index 65075ef75e2a..7c20a68b5a95 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -37,6 +37,7 @@
 #include <linux/iopoll.h>
 #include <linux/module.h>
 #include <linux/of.h>
+#include <linux/of_address.h>
 #include <linux/pci.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
@@ -69,6 +70,18 @@
 		((smmu->options & ARM_SMMU_OPT_SECURE_CFG_ACCESS)	\
 			? 0x400 : 0))
 
+#ifdef CONFIG_64BIT
+#define smmu_writeq	writeq_relaxed
+#else
+#define smmu_writeq(reg64, addr)				\
+	do {							\
+		u64 __val = (reg64);				\
+		void __iomem *__addr = (addr);			\
+		writel_relaxed(__val >> 32, __addr + 4);	\
+		writel_relaxed(__val, __addr);			\
+	} while (0)
+#endif
+
 /* Configuration registers */
 #define ARM_SMMU_GR0_sCR0		0x0
 #define sCR0_CLIENTPD			(1 << 0)
@@ -184,10 +197,8 @@
 #define ARM_SMMU_CB_SCTLR		0x0
 #define ARM_SMMU_CB_RESUME		0x8
 #define ARM_SMMU_CB_TTBCR2		0x10
-#define ARM_SMMU_CB_TTBR0_LO		0x20
-#define ARM_SMMU_CB_TTBR0_HI		0x24
-#define ARM_SMMU_CB_TTBR1_LO		0x28
-#define ARM_SMMU_CB_TTBR1_HI		0x2c
+#define ARM_SMMU_CB_TTBR0		0x20
+#define ARM_SMMU_CB_TTBR1		0x28
 #define ARM_SMMU_CB_TTBCR		0x30
 #define ARM_SMMU_CB_S1_MAIR0		0x38
 #define ARM_SMMU_CB_S1_MAIR1		0x3c
@@ -202,8 +213,7 @@
 #define ARM_SMMU_CB_S1_TLBIVAL		0x620
 #define ARM_SMMU_CB_S2_TLBIIPAS2	0x630
 #define ARM_SMMU_CB_S2_TLBIIPAS2L	0x638
-#define ARM_SMMU_CB_ATS1PR_LO		0x800
-#define ARM_SMMU_CB_ATS1PR_HI		0x804
+#define ARM_SMMU_CB_ATS1PR		0x800
 #define ARM_SMMU_CB_ATSR		0x8f0
 
 #define SCTLR_S1_ASIDPNE		(1 << 12)
@@ -226,7 +236,7 @@
 #define TTBCR2_SEP_SHIFT		15
 #define TTBCR2_SEP_UPSTREAM		(0x7 << TTBCR2_SEP_SHIFT)
 
-#define TTBRn_HI_ASID_SHIFT            16
+#define TTBRn_ASID_SHIFT		48
 
 #define FSR_MULTI			(1 << 31)
 #define FSR_SS				(1 << 30)
@@ -247,7 +257,7 @@
 #define FSYNR0_WNR			(1 << 4)
 
 static int force_stage;
-module_param_named(force_stage, force_stage, int, S_IRUGO | S_IWUSR);
+module_param_named(force_stage, force_stage, int, S_IRUGO);
 MODULE_PARM_DESC(force_stage,
 	"Force SMMU mappings to be installed at a particular stage of translation. A value of '1' or '2' forces the corresponding stage. All other values are ignored (i.e. no stage is forced). Note that selecting a specific stage will disable support for nested translation.");
 
@@ -608,34 +618,10 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
 	}
 }
 
-static void arm_smmu_flush_pgtable(void *addr, size_t size, void *cookie)
-{
-	struct arm_smmu_domain *smmu_domain = cookie;
-	struct arm_smmu_device *smmu = smmu_domain->smmu;
-	unsigned long offset = (unsigned long)addr & ~PAGE_MASK;
-
-
-	/* Ensure new page tables are visible to the hardware walker */
-	if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK) {
-		dsb(ishst);
-	} else {
-		/*
-		 * If the SMMU can't walk tables in the CPU caches, treat them
-		 * like non-coherent DMA since we need to flush the new entries
-		 * all the way out to memory. There's no possibility of
-		 * recursion here as the SMMU table walker will not be wired
-		 * through another SMMU.
-		 */
-		dma_map_page(smmu->dev, virt_to_page(addr), offset, size,
-			     DMA_TO_DEVICE);
-	}
-}
-
 static struct iommu_gather_ops arm_smmu_gather_ops = {
 	.tlb_flush_all	= arm_smmu_tlb_inv_context,
 	.tlb_add_flush	= arm_smmu_tlb_inv_range_nosync,
 	.tlb_sync	= arm_smmu_tlb_sync,
-	.flush_pgtable	= arm_smmu_flush_pgtable,
 };
 
 static irqreturn_t arm_smmu_context_fault(int irq, void *dev)
@@ -719,12 +705,12 @@ static void arm_smmu_init_context_bank(struct arm_smmu_domain *smmu_domain,
 				       struct io_pgtable_cfg *pgtbl_cfg)
 {
 	u32 reg;
+	u64 reg64;
 	bool stage1;
 	struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
 	struct arm_smmu_device *smmu = smmu_domain->smmu;
-	void __iomem *cb_base, *gr0_base, *gr1_base;
+	void __iomem *cb_base, *gr1_base;
 
-	gr0_base = ARM_SMMU_GR0(smmu);
 	gr1_base = ARM_SMMU_GR1(smmu);
 	stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS;
 	cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
@@ -762,22 +748,17 @@ static void arm_smmu_init_context_bank(struct arm_smmu_domain *smmu_domain,
 
 	/* TTBRs */
 	if (stage1) {
-		reg = pgtbl_cfg->arm_lpae_s1_cfg.ttbr[0];
-		writel_relaxed(reg, cb_base + ARM_SMMU_CB_TTBR0_LO);
-		reg = pgtbl_cfg->arm_lpae_s1_cfg.ttbr[0] >> 32;
-		reg |= ARM_SMMU_CB_ASID(cfg) << TTBRn_HI_ASID_SHIFT;
-		writel_relaxed(reg, cb_base + ARM_SMMU_CB_TTBR0_HI);
-
-		reg = pgtbl_cfg->arm_lpae_s1_cfg.ttbr[1];
-		writel_relaxed(reg, cb_base + ARM_SMMU_CB_TTBR1_LO);
-		reg = pgtbl_cfg->arm_lpae_s1_cfg.ttbr[1] >> 32;
-		reg |= ARM_SMMU_CB_ASID(cfg) << TTBRn_HI_ASID_SHIFT;
-		writel_relaxed(reg, cb_base + ARM_SMMU_CB_TTBR1_HI);
+		reg64 = pgtbl_cfg->arm_lpae_s1_cfg.ttbr[0];
+
+		reg64 |= ((u64)ARM_SMMU_CB_ASID(cfg)) << TTBRn_ASID_SHIFT;
+		smmu_writeq(reg64, cb_base + ARM_SMMU_CB_TTBR0);
+
+		reg64 = pgtbl_cfg->arm_lpae_s1_cfg.ttbr[1];
+		reg64 |= ((u64)ARM_SMMU_CB_ASID(cfg)) << TTBRn_ASID_SHIFT;
+		smmu_writeq(reg64, cb_base + ARM_SMMU_CB_TTBR1);
 	} else {
-		reg = pgtbl_cfg->arm_lpae_s2_cfg.vttbr;
-		writel_relaxed(reg, cb_base + ARM_SMMU_CB_TTBR0_LO);
-		reg = pgtbl_cfg->arm_lpae_s2_cfg.vttbr >> 32;
-		writel_relaxed(reg, cb_base + ARM_SMMU_CB_TTBR0_HI);
+		reg64 = pgtbl_cfg->arm_lpae_s2_cfg.vttbr;
+		smmu_writeq(reg64, cb_base + ARM_SMMU_CB_TTBR0);
 	}
 
 	/* TTBCR */
@@ -899,6 +880,7 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
 		.ias		= ias,
 		.oas		= oas,
 		.tlb		= &arm_smmu_gather_ops,
+		.iommu_dev	= smmu->dev,
 	};
 
 	smmu_domain->smmu = smmu;
@@ -1229,23 +1211,21 @@ static phys_addr_t arm_smmu_iova_to_phys_hard(struct iommu_domain *domain,
 	void __iomem *cb_base;
 	u32 tmp;
 	u64 phys;
+	unsigned long va;
 
 	cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
 
-	if (smmu->version == 1) {
-		u32 reg = iova & ~0xfff;
-		writel_relaxed(reg, cb_base + ARM_SMMU_CB_ATS1PR_LO);
-	} else {
-		u32 reg = iova & ~0xfff;
-		writel_relaxed(reg, cb_base + ARM_SMMU_CB_ATS1PR_LO);
-		reg = ((u64)iova & ~0xfff) >> 32;
-		writel_relaxed(reg, cb_base + ARM_SMMU_CB_ATS1PR_HI);
-	}
+	/* ATS1 registers can only be written atomically */
+	va = iova & ~0xfffUL;
+	if (smmu->version == ARM_SMMU_V2)
+		smmu_writeq(va, cb_base + ARM_SMMU_CB_ATS1PR);
+	else
+		writel_relaxed(va, cb_base + ARM_SMMU_CB_ATS1PR);
 
 	if (readl_poll_timeout_atomic(cb_base + ARM_SMMU_CB_ATSR, tmp,
 				      !(tmp & ATSR_ACTIVE), 5, 50)) {
 		dev_err(dev,
-			"iova to phys timed out on 0x%pad. Falling back to software table walk.\n",
+			"iova to phys timed out on %pad. Falling back to software table walk.\n",
 			&iova);
 		return ops->iova_to_phys(ops, iova);
 	}
@@ -1533,6 +1513,7 @@ static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu)
 	unsigned long size;
 	void __iomem *gr0_base = ARM_SMMU_GR0(smmu);
 	u32 id;
+	bool cttw_dt, cttw_reg;
 
 	dev_notice(smmu->dev, "probing hardware configuration...\n");
 	dev_notice(smmu->dev, "SMMUv%d with:\n", smmu->version);
@@ -1572,10 +1553,22 @@ static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu)
 		dev_notice(smmu->dev, "\taddress translation ops\n");
 	}
 
-	if (id & ID0_CTTW) {
+	/*
+	 * In order for DMA API calls to work properly, we must defer to what
+	 * the DT says about coherency, regardless of what the hardware claims.
+	 * Fortunately, this also opens up a workaround for systems where the
+	 * ID register value has ended up configured incorrectly.
+	 */
+	cttw_dt = of_dma_is_coherent(smmu->dev->of_node);
+	cttw_reg = !!(id & ID0_CTTW);
+	if (cttw_dt)
 		smmu->features |= ARM_SMMU_FEAT_COHERENT_WALK;
-		dev_notice(smmu->dev, "\tcoherent table walk\n");
-	}
+	if (cttw_dt || cttw_reg)
+		dev_notice(smmu->dev, "\t%scoherent table walk\n",
+			   cttw_dt ? "" : "non-");
+	if (cttw_dt != cttw_reg)
+		dev_notice(smmu->dev,
+			   "\t(IDR0.CTTW overridden by dma-coherent property)\n");
 
 	if (id & ID0_SMS) {
 		u32 smr, sid, mask;
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
new file mode 100644
index 000000000000..72d6182666cb
--- /dev/null
+++ b/drivers/iommu/dma-iommu.c
@@ -0,0 +1,529 @@
+/*
+ * A fairly generic DMA-API to IOMMU-API glue layer.
+ *
+ * Copyright (C) 2014-2015 ARM Ltd.
+ *
+ * based in part on arch/arm/mm/dma-mapping.c:
+ * Copyright (C) 2000-2004 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/device.h>
+#include <linux/dma-iommu.h>
+#include <linux/gfp.h>
+#include <linux/huge_mm.h>
+#include <linux/iommu.h>
+#include <linux/iova.h>
+#include <linux/mm.h>
+#include <linux/scatterlist.h>
+#include <linux/vmalloc.h>
+
+int iommu_dma_init(void)
+{
+	return iova_cache_get();
+}
+
+/**
+ * iommu_get_dma_cookie - Acquire DMA-API resources for a domain
+ * @domain: IOMMU domain to prepare for DMA-API usage
+ *
+ * IOMMU drivers should normally call this from their domain_alloc
+ * callback when domain->type == IOMMU_DOMAIN_DMA.
+ */
+int iommu_get_dma_cookie(struct iommu_domain *domain)
+{
+	struct iova_domain *iovad;
+
+	if (domain->iova_cookie)
+		return -EEXIST;
+
+	iovad = kzalloc(sizeof(*iovad), GFP_KERNEL);
+	domain->iova_cookie = iovad;
+
+	return iovad ? 0 : -ENOMEM;
+}
+EXPORT_SYMBOL(iommu_get_dma_cookie);
+
+/**
+ * iommu_put_dma_cookie - Release a domain's DMA mapping resources
+ * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie()
+ *
+ * IOMMU drivers should normally call this from their domain_free callback.
+ */
+void iommu_put_dma_cookie(struct iommu_domain *domain)
+{
+	struct iova_domain *iovad = domain->iova_cookie;
+
+	if (!iovad)
+		return;
+
+	put_iova_domain(iovad);
+	kfree(iovad);
+	domain->iova_cookie = NULL;
+}
+EXPORT_SYMBOL(iommu_put_dma_cookie);
+
+/**
+ * iommu_dma_init_domain - Initialise a DMA mapping domain
+ * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie()
+ * @base: IOVA at which the mappable address space starts
+ * @size: Size of IOVA space
+ *
+ * @base and @size should be exact multiples of IOMMU page granularity to
+ * avoid rounding surprises. If necessary, we reserve the page at address 0
+ * to ensure it is an invalid IOVA. It is safe to reinitialise a domain, but
+ * any change which could make prior IOVAs invalid will fail.
+ */
+int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, u64 size)
+{
+	struct iova_domain *iovad = domain->iova_cookie;
+	unsigned long order, base_pfn, end_pfn;
+
+	if (!iovad)
+		return -ENODEV;
+
+	/* Use the smallest supported page size for IOVA granularity */
+	order = __ffs(domain->ops->pgsize_bitmap);
+	base_pfn = max_t(unsigned long, 1, base >> order);
+	end_pfn = (base + size - 1) >> order;
+
+	/* Check the domain allows at least some access to the device... */
+	if (domain->geometry.force_aperture) {
+		if (base > domain->geometry.aperture_end ||
+		    base + size <= domain->geometry.aperture_start) {
+			pr_warn("specified DMA range outside IOMMU capability\n");
+			return -EFAULT;
+		}
+		/* ...then finally give it a kicking to make sure it fits */
+		base_pfn = max_t(unsigned long, base_pfn,
+				domain->geometry.aperture_start >> order);
+		end_pfn = min_t(unsigned long, end_pfn,
+				domain->geometry.aperture_end >> order);
+	}
+
+	/* All we can safely do with an existing domain is enlarge it */
+	if (iovad->start_pfn) {
+		if (1UL << order != iovad->granule ||
+		    base_pfn != iovad->start_pfn ||
+		    end_pfn < iovad->dma_32bit_pfn) {
+			pr_warn("Incompatible range for DMA domain\n");
+			return -EFAULT;
+		}
+		iovad->dma_32bit_pfn = end_pfn;
+	} else {
+		init_iova_domain(iovad, 1UL << order, base_pfn, end_pfn);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(iommu_dma_init_domain);
+
+/**
+ * dma_direction_to_prot - Translate DMA API directions to IOMMU API page flags
+ * @dir: Direction of DMA transfer
+ * @coherent: Is the DMA master cache-coherent?
+ *
+ * Return: corresponding IOMMU API page protection flags
+ */
+int dma_direction_to_prot(enum dma_data_direction dir, bool coherent)
+{
+	int prot = coherent ? IOMMU_CACHE : 0;
+
+	switch (dir) {
+	case DMA_BIDIRECTIONAL:
+		return prot | IOMMU_READ | IOMMU_WRITE;
+	case DMA_TO_DEVICE:
+		return prot | IOMMU_READ;
+	case DMA_FROM_DEVICE:
+		return prot | IOMMU_WRITE;
+	default:
+		return 0;
+	}
+}
+
+static struct iova *__alloc_iova(struct iova_domain *iovad, size_t size,
+		dma_addr_t dma_limit)
+{
+	unsigned long shift = iova_shift(iovad);
+	unsigned long length = iova_align(iovad, size) >> shift;
+
+	/*
+	 * Enforce size-alignment to be safe - there could perhaps be an
+	 * attribute to control this per-device, or at least per-domain...
+	 */
+	return alloc_iova(iovad, length, dma_limit >> shift, true);
+}
+
+/* The IOVA allocator knows what we mapped, so just unmap whatever that was */
+static void __iommu_dma_unmap(struct iommu_domain *domain, dma_addr_t dma_addr)
+{
+	struct iova_domain *iovad = domain->iova_cookie;
+	unsigned long shift = iova_shift(iovad);
+	unsigned long pfn = dma_addr >> shift;
+	struct iova *iova = find_iova(iovad, pfn);
+	size_t size;
+
+	if (WARN_ON(!iova))
+		return;
+
+	size = iova_size(iova) << shift;
+	size -= iommu_unmap(domain, pfn << shift, size);
+	/* ...and if we can't, then something is horribly, horribly wrong */
+	WARN_ON(size > 0);
+	__free_iova(iovad, iova);
+}
+
+static void __iommu_dma_free_pages(struct page **pages, int count)
+{
+	while (count--)
+		__free_page(pages[count]);
+	kvfree(pages);
+}
+
+static struct page **__iommu_dma_alloc_pages(unsigned int count, gfp_t gfp)
+{
+	struct page **pages;
+	unsigned int i = 0, array_size = count * sizeof(*pages);
+	unsigned int order = MAX_ORDER;
+
+	if (array_size <= PAGE_SIZE)
+		pages = kzalloc(array_size, GFP_KERNEL);
+	else
+		pages = vzalloc(array_size);
+	if (!pages)
+		return NULL;
+
+	/* IOMMU can map any pages, so himem can also be used here */
+	gfp |= __GFP_NOWARN | __GFP_HIGHMEM;
+
+	while (count) {
+		struct page *page = NULL;
+		int j;
+
+		/*
+		 * Higher-order allocations are a convenience rather
+		 * than a necessity, hence using __GFP_NORETRY until
+		 * falling back to single-page allocations.
+		 */
+		for (order = min_t(unsigned int, order, __fls(count));
+		     order > 0; order--) {
+			page = alloc_pages(gfp | __GFP_NORETRY, order);
+			if (!page)
+				continue;
+			if (PageCompound(page)) {
+				if (!split_huge_page(page))
+					break;
+				__free_pages(page, order);
+			} else {
+				split_page(page, order);
+				break;
+			}
+		}
+		if (!page)
+			page = alloc_page(gfp);
+		if (!page) {
+			__iommu_dma_free_pages(pages, i);
+			return NULL;
+		}
+		j = 1 << order;
+		count -= j;
+		while (j--)
+			pages[i++] = page++;
+	}
+	return pages;
+}
+
+/**
+ * iommu_dma_free - Free a buffer allocated by iommu_dma_alloc()
+ * @dev: Device which owns this buffer
+ * @pages: Array of buffer pages as returned by iommu_dma_alloc()
+ * @size: Size of buffer in bytes
+ * @handle: DMA address of buffer
+ *
+ * Frees both the pages associated with the buffer, and the array
+ * describing them
+ */
+void iommu_dma_free(struct device *dev, struct page **pages, size_t size,
+		dma_addr_t *handle)
+{
+	__iommu_dma_unmap(iommu_get_domain_for_dev(dev), *handle);
+	__iommu_dma_free_pages(pages, PAGE_ALIGN(size) >> PAGE_SHIFT);
+	*handle = DMA_ERROR_CODE;
+}
+
+/**
+ * iommu_dma_alloc - Allocate and map a buffer contiguous in IOVA space
+ * @dev: Device to allocate memory for. Must be a real device
+ *	 attached to an iommu_dma_domain
+ * @size: Size of buffer in bytes
+ * @gfp: Allocation flags
+ * @prot: IOMMU mapping flags
+ * @handle: Out argument for allocated DMA handle
+ * @flush_page: Arch callback which must ensure PAGE_SIZE bytes from the
+ *		given VA/PA are visible to the given non-coherent device.
+ *
+ * If @size is less than PAGE_SIZE, then a full CPU page will be allocated,
+ * but an IOMMU which supports smaller pages might not map the whole thing.
+ *
+ * Return: Array of struct page pointers describing the buffer,
+ *	   or NULL on failure.
+ */
+struct page **iommu_dma_alloc(struct device *dev, size_t size,
+		gfp_t gfp, int prot, dma_addr_t *handle,
+		void (*flush_page)(struct device *, const void *, phys_addr_t))
+{
+	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+	struct iova_domain *iovad = domain->iova_cookie;
+	struct iova *iova;
+	struct page **pages;
+	struct sg_table sgt;
+	dma_addr_t dma_addr;
+	unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+	*handle = DMA_ERROR_CODE;
+
+	pages = __iommu_dma_alloc_pages(count, gfp);
+	if (!pages)
+		return NULL;
+
+	iova = __alloc_iova(iovad, size, dev->coherent_dma_mask);
+	if (!iova)
+		goto out_free_pages;
+
+	size = iova_align(iovad, size);
+	if (sg_alloc_table_from_pages(&sgt, pages, count, 0, size, GFP_KERNEL))
+		goto out_free_iova;
+
+	if (!(prot & IOMMU_CACHE)) {
+		struct sg_mapping_iter miter;
+		/*
+		 * The CPU-centric flushing implied by SG_MITER_TO_SG isn't
+		 * sufficient here, so skip it by using the "wrong" direction.
+		 */
+		sg_miter_start(&miter, sgt.sgl, sgt.orig_nents, SG_MITER_FROM_SG);
+		while (sg_miter_next(&miter))
+			flush_page(dev, miter.addr, page_to_phys(miter.page));
+		sg_miter_stop(&miter);
+	}
+
+	dma_addr = iova_dma_addr(iovad, iova);
+	if (iommu_map_sg(domain, dma_addr, sgt.sgl, sgt.orig_nents, prot)
+			< size)
+		goto out_free_sg;
+
+	*handle = dma_addr;
+	sg_free_table(&sgt);
+	return pages;
+
+out_free_sg:
+	sg_free_table(&sgt);
+out_free_iova:
+	__free_iova(iovad, iova);
+out_free_pages:
+	__iommu_dma_free_pages(pages, count);
+	return NULL;
+}
+
+/**
+ * iommu_dma_mmap - Map a buffer into provided user VMA
+ * @pages: Array representing buffer from iommu_dma_alloc()
+ * @size: Size of buffer in bytes
+ * @vma: VMA describing requested userspace mapping
+ *
+ * Maps the pages of the buffer in @pages into @vma. The caller is responsible
+ * for verifying the correct size and protection of @vma beforehand.
+ */
+
+int iommu_dma_mmap(struct page **pages, size_t size, struct vm_area_struct *vma)
+{
+	unsigned long uaddr = vma->vm_start;
+	unsigned int i, count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+	int ret = -ENXIO;
+
+	for (i = vma->vm_pgoff; i < count && uaddr < vma->vm_end; i++) {
+		ret = vm_insert_page(vma, uaddr, pages[i]);
+		if (ret)
+			break;
+		uaddr += PAGE_SIZE;
+	}
+	return ret;
+}
+
+dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
+		unsigned long offset, size_t size, int prot)
+{
+	dma_addr_t dma_addr;
+	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+	struct iova_domain *iovad = domain->iova_cookie;
+	phys_addr_t phys = page_to_phys(page) + offset;
+	size_t iova_off = iova_offset(iovad, phys);
+	size_t len = iova_align(iovad, size + iova_off);
+	struct iova *iova = __alloc_iova(iovad, len, dma_get_mask(dev));
+
+	if (!iova)
+		return DMA_ERROR_CODE;
+
+	dma_addr = iova_dma_addr(iovad, iova);
+	if (iommu_map(domain, dma_addr, phys - iova_off, len, prot)) {
+		__free_iova(iovad, iova);
+		return DMA_ERROR_CODE;
+	}
+	return dma_addr + iova_off;
+}
+
+void iommu_dma_unmap_page(struct device *dev, dma_addr_t handle, size_t size,
+		enum dma_data_direction dir, struct dma_attrs *attrs)
+{
+	__iommu_dma_unmap(iommu_get_domain_for_dev(dev), handle);
+}
+
+/*
+ * Prepare a successfully-mapped scatterlist to give back to the caller.
+ * Handling IOVA concatenation can come later, if needed
+ */
+static int __finalise_sg(struct device *dev, struct scatterlist *sg, int nents,
+		dma_addr_t dma_addr)
+{
+	struct scatterlist *s;
+	int i;
+
+	for_each_sg(sg, s, nents, i) {
+		/* Un-swizzling the fields here, hence the naming mismatch */
+		unsigned int s_offset = sg_dma_address(s);
+		unsigned int s_length = sg_dma_len(s);
+		unsigned int s_dma_len = s->length;
+
+		s->offset = s_offset;
+		s->length = s_length;
+		sg_dma_address(s) = dma_addr + s_offset;
+		dma_addr += s_dma_len;
+	}
+	return i;
+}
+
+/*
+ * If mapping failed, then just restore the original list,
+ * but making sure the DMA fields are invalidated.
+ */
+static void __invalidate_sg(struct scatterlist *sg, int nents)
+{
+	struct scatterlist *s;
+	int i;
+
+	for_each_sg(sg, s, nents, i) {
+		if (sg_dma_address(s) != DMA_ERROR_CODE)
+			s->offset = sg_dma_address(s);
+		if (sg_dma_len(s))
+			s->length = sg_dma_len(s);
+		sg_dma_address(s) = DMA_ERROR_CODE;
+		sg_dma_len(s) = 0;
+	}
+}
+
+/*
+ * The DMA API client is passing in a scatterlist which could describe
+ * any old buffer layout, but the IOMMU API requires everything to be
+ * aligned to IOMMU pages. Hence the need for this complicated bit of
+ * impedance-matching, to be able to hand off a suitably-aligned list,
+ * but still preserve the original offsets and sizes for the caller.
+ */
+int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
+		int nents, int prot)
+{
+	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+	struct iova_domain *iovad = domain->iova_cookie;
+	struct iova *iova;
+	struct scatterlist *s, *prev = NULL;
+	dma_addr_t dma_addr;
+	size_t iova_len = 0;
+	int i;
+
+	/*
+	 * Work out how much IOVA space we need, and align the segments to
+	 * IOVA granules for the IOMMU driver to handle. With some clever
+	 * trickery we can modify the list in-place, but reversibly, by
+	 * hiding the original data in the as-yet-unused DMA fields.
+	 */
+	for_each_sg(sg, s, nents, i) {
+		size_t s_offset = iova_offset(iovad, s->offset);
+		size_t s_length = s->length;
+
+		sg_dma_address(s) = s_offset;
+		sg_dma_len(s) = s_length;
+		s->offset -= s_offset;
+		s_length = iova_align(iovad, s_length + s_offset);
+		s->length = s_length;
+
+		/*
+		 * The simple way to avoid the rare case of a segment
+		 * crossing the boundary mask is to pad the previous one
+		 * to end at a naturally-aligned IOVA for this one's size,
+		 * at the cost of potentially over-allocating a little.
+		 */
+		if (prev) {
+			size_t pad_len = roundup_pow_of_two(s_length);
+
+			pad_len = (pad_len - iova_len) & (pad_len - 1);
+			prev->length += pad_len;
+			iova_len += pad_len;
+		}
+
+		iova_len += s_length;
+		prev = s;
+	}
+
+	iova = __alloc_iova(iovad, iova_len, dma_get_mask(dev));
+	if (!iova)
+		goto out_restore_sg;
+
+	/*
+	 * We'll leave any physical concatenation to the IOMMU driver's
+	 * implementation - it knows better than we do.
+	 */
+	dma_addr = iova_dma_addr(iovad, iova);
+	if (iommu_map_sg(domain, dma_addr, sg, nents, prot) < iova_len)
+		goto out_free_iova;
+
+	return __finalise_sg(dev, sg, nents, dma_addr);
+
+out_free_iova:
+	__free_iova(iovad, iova);
+out_restore_sg:
+	__invalidate_sg(sg, nents);
+	return 0;
+}
+
+void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
+		enum dma_data_direction dir, struct dma_attrs *attrs)
+{
+	/*
+	 * The scatterlist segments are mapped into a single
+	 * contiguous IOVA allocation, so this is incredibly easy.
+	 */
+	__iommu_dma_unmap(iommu_get_domain_for_dev(dev), sg_dma_address(sg));
+}
+
+int iommu_dma_supported(struct device *dev, u64 mask)
+{
+	/*
+	 * 'Special' IOMMUs which don't have the same addressing capability
+	 * as the CPU will have to wait until we have some way to query that
+	 * before they'll be able to use this framework.
+	 */
+	return 1;
+}
+
+int iommu_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+	return dma_addr == DMA_ERROR_CODE;
+}
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index b85a8614c128..0a0dfa881c73 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -3445,7 +3445,7 @@ static inline int iommu_devinfo_cache_init(void)
 static int __init iommu_init_mempool(void)
 {
 	int ret;
-	ret = iommu_iova_cache_init();
+	ret = iova_cache_get();
 	if (ret)
 		return ret;
 
@@ -3459,7 +3459,7 @@ static int __init iommu_init_mempool(void)
 
 	kmem_cache_destroy(iommu_domain_cache);
 domain_error:
-	iommu_iova_cache_destroy();
+	iova_cache_put();
 
 	return -ENOMEM;
 }
@@ -3468,7 +3468,7 @@ static void __init iommu_exit_mempool(void)
 {
 	kmem_cache_destroy(iommu_devinfo_cache);
 	kmem_cache_destroy(iommu_domain_cache);
-	iommu_iova_cache_destroy();
+	iova_cache_put();
 }
 
 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 937832cfa48e..191f7b1eb425 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -26,6 +26,8 @@
 #include <linux/slab.h>
 #include <linux/types.h>
 
+#include <asm/barrier.h>
+
 #include "io-pgtable.h"
 
 #define ARM_LPAE_MAX_ADDR_BITS		48
@@ -203,6 +205,67 @@ static bool selftest_running = false;
 static int __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
 			    unsigned long iova, size_t size, int lvl,
 			    arm_lpae_iopte *ptep);
+static dma_addr_t __arm_lpae_dma_addr(void *pages)
+{
+	return (dma_addr_t)virt_to_phys(pages);
+}
+
+static void *__arm_lpae_alloc_pages(size_t size, gfp_t gfp,
+				    struct io_pgtable_cfg *cfg)
+{
+	struct device *dev = cfg->iommu_dev;
+	dma_addr_t dma;
+	void *pages = alloc_pages_exact(size, gfp | __GFP_ZERO);
+
+	if (!pages)
+		return NULL;
+
+	if (!selftest_running) {
+		dma = dma_map_single(dev, pages, size, DMA_TO_DEVICE);
+		if (dma_mapping_error(dev, dma))
+			goto out_free;
+		/*
+		 * We depend on the IOMMU being able to work with any physical
+		 * address directly, so if the DMA layer suggests otherwise by
+		 * translating or truncating them, that bodes very badly...
+		 */
+		if (dma != virt_to_phys(pages))
+			goto out_unmap;
+	}
+
+	return pages;
+
+out_unmap:
+	dev_err(dev, "Cannot accommodate DMA translation for IOMMU page tables\n");
+	dma_unmap_single(dev, dma, size, DMA_TO_DEVICE);
+out_free:
+	free_pages_exact(pages, size);
+	return NULL;
+}
+
+static void __arm_lpae_free_pages(void *pages, size_t size,
+				  struct io_pgtable_cfg *cfg)
+{
+	if (!selftest_running)
+		dma_unmap_single(cfg->iommu_dev, __arm_lpae_dma_addr(pages),
+				 size, DMA_TO_DEVICE);
+	free_pages_exact(pages, size);
+}
+
+static void __arm_lpae_set_pte(arm_lpae_iopte *ptep, arm_lpae_iopte pte,
+			       struct io_pgtable_cfg *cfg)
+{
+	*ptep = pte;
+
+	if (!selftest_running)
+		dma_sync_single_for_device(cfg->iommu_dev,
+					   __arm_lpae_dma_addr(ptep),
+					   sizeof(pte), DMA_TO_DEVICE);
+}
+
+static int __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
+			    unsigned long iova, size_t size, int lvl,
+			    arm_lpae_iopte *ptep);
 
 static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
 			     unsigned long iova, phys_addr_t paddr,
@@ -210,6 +273,7 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
 			     arm_lpae_iopte *ptep)
 {
 	arm_lpae_iopte pte = prot;
+	struct io_pgtable_cfg *cfg = &data->iop.cfg;
 
 	if (iopte_leaf(*ptep, lvl)) {
 		/* We require an unmap first */
@@ -228,7 +292,7 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
 			return -EINVAL;
 	}
 
-	if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_NS)
+	if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS)
 		pte |= ARM_LPAE_PTE_NS;
 
 	if (lvl == ARM_LPAE_MAX_LEVELS - 1)
@@ -239,8 +303,7 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
 	pte |= ARM_LPAE_PTE_AF | ARM_LPAE_PTE_SH_IS;
 	pte |= pfn_to_iopte(paddr >> data->pg_shift, data);
 
-	*ptep = pte;
-	data->iop.cfg.tlb->flush_pgtable(ptep, sizeof(*ptep), data->iop.cookie);
+	__arm_lpae_set_pte(ptep, pte, cfg);
 	return 0;
 }
 
@@ -249,14 +312,14 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova,
 			  int lvl, arm_lpae_iopte *ptep)
 {
 	arm_lpae_iopte *cptep, pte;
-	void *cookie = data->iop.cookie;
 	size_t block_size = ARM_LPAE_BLOCK_SIZE(lvl, data);
+	struct io_pgtable_cfg *cfg = &data->iop.cfg;
 
 	/* Find our entry at the current level */
 	ptep += ARM_LPAE_LVL_IDX(iova, lvl, data);
 
 	/* If we can install a leaf entry at this level, then do so */
-	if (size == block_size && (size & data->iop.cfg.pgsize_bitmap))
+	if (size == block_size && (size & cfg->pgsize_bitmap))
 		return arm_lpae_init_pte(data, iova, paddr, prot, lvl, ptep);
 
 	/* We can't allocate tables at the final level */
@@ -266,18 +329,15 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova,
 	/* Grab a pointer to the next level */
 	pte = *ptep;
 	if (!pte) {
-		cptep = alloc_pages_exact(1UL << data->pg_shift,
-					 GFP_ATOMIC | __GFP_ZERO);
+		cptep = __arm_lpae_alloc_pages(1UL << data->pg_shift,
+					       GFP_ATOMIC, cfg);
 		if (!cptep)
 			return -ENOMEM;
 
-		data->iop.cfg.tlb->flush_pgtable(cptep, 1UL << data->pg_shift,
-						 cookie);
 		pte = __pa(cptep) | ARM_LPAE_PTE_TYPE_TABLE;
-		if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_NS)
+		if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS)
 			pte |= ARM_LPAE_PTE_NSTABLE;
-		*ptep = pte;
-		data->iop.cfg.tlb->flush_pgtable(ptep, sizeof(*ptep), cookie);
+		__arm_lpae_set_pte(ptep, pte, cfg);
 	} else {
 		cptep = iopte_deref(pte, data);
 	}
@@ -324,7 +384,7 @@ static int arm_lpae_map(struct io_pgtable_ops *ops, unsigned long iova,
 {
 	struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
 	arm_lpae_iopte *ptep = data->pgd;
-	int lvl = ARM_LPAE_START_LVL(data);
+	int ret, lvl = ARM_LPAE_START_LVL(data);
 	arm_lpae_iopte prot;
 
 	/* If no access, then nothing to do */
@@ -332,7 +392,14 @@ static int arm_lpae_map(struct io_pgtable_ops *ops, unsigned long iova,
 		return 0;
 
 	prot = arm_lpae_prot_to_pte(data, iommu_prot);
-	return __arm_lpae_map(data, iova, paddr, size, prot, lvl, ptep);
+	ret = __arm_lpae_map(data, iova, paddr, size, prot, lvl, ptep);
+	/*
+	 * Synchronise all PTE updates for the new mapping before there's
+	 * a chance for anything to kick off a table walk for the new iova.
+	 */
+	wmb();
+
+	return ret;
 }
 
 static void __arm_lpae_free_pgtable(struct arm_lpae_io_pgtable *data, int lvl,
@@ -363,7 +430,7 @@ static void __arm_lpae_free_pgtable(struct arm_lpae_io_pgtable *data, int lvl,
 		__arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data));
 	}
 
-	free_pages_exact(start, table_size);
+	__arm_lpae_free_pages(start, table_size, &data->iop.cfg);
 }
 
 static void arm_lpae_free_pgtable(struct io_pgtable *iop)
@@ -382,8 +449,7 @@ static int arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,
 	unsigned long blk_start, blk_end;
 	phys_addr_t blk_paddr;
 	arm_lpae_iopte table = 0;
-	void *cookie = data->iop.cookie;
-	const struct iommu_gather_ops *tlb = data->iop.cfg.tlb;
+	struct io_pgtable_cfg *cfg = &data->iop.cfg;
 
 	blk_start = iova & ~(blk_size - 1);
 	blk_end = blk_start + blk_size;
@@ -409,10 +475,9 @@ static int arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,
 		}
 	}
 
-	*ptep = table;
-	tlb->flush_pgtable(ptep, sizeof(*ptep), cookie);
+	__arm_lpae_set_pte(ptep, table, cfg);
 	iova &= ~(blk_size - 1);
-	tlb->tlb_add_flush(iova, blk_size, true, cookie);
+	cfg->tlb->tlb_add_flush(iova, blk_size, true, data->iop.cookie);
 	return size;
 }
 
@@ -434,13 +499,12 @@ static int __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
 
 	/* If the size matches this level, we're in the right place */
 	if (size == blk_size) {
-		*ptep = 0;
-		tlb->flush_pgtable(ptep, sizeof(*ptep), cookie);
+		__arm_lpae_set_pte(ptep, 0, &data->iop.cfg);
 
 		if (!iopte_leaf(pte, lvl)) {
 			/* Also flush any partial walks */
 			tlb->tlb_add_flush(iova, size, false, cookie);
-			tlb->tlb_sync(data->iop.cookie);
+			tlb->tlb_sync(cookie);
 			ptep = iopte_deref(pte, data);
 			__arm_lpae_free_pgtable(data, lvl + 1, ptep);
 		} else {
@@ -566,6 +630,11 @@ arm_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg)
 	if (cfg->oas > ARM_LPAE_MAX_ADDR_BITS)
 		return NULL;
 
+	if (!selftest_running && cfg->iommu_dev->dma_pfn_offset) {
+		dev_err(cfg->iommu_dev, "Cannot accommodate DMA offset for IOMMU page tables\n");
+		return NULL;
+	}
+
 	data = kmalloc(sizeof(*data), GFP_KERNEL);
 	if (!data)
 		return NULL;
@@ -656,11 +725,12 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
 	cfg->arm_lpae_s1_cfg.mair[1] = 0;
 
 	/* Looking good; allocate a pgd */
-	data->pgd = alloc_pages_exact(data->pgd_size, GFP_KERNEL | __GFP_ZERO);
+	data->pgd = __arm_lpae_alloc_pages(data->pgd_size, GFP_KERNEL, cfg);
 	if (!data->pgd)
 		goto out_free_data;
 
-	cfg->tlb->flush_pgtable(data->pgd, data->pgd_size, cookie);
+	/* Ensure the empty pgd is visible before any actual TTBR write */
+	wmb();
 
 	/* TTBRs */
 	cfg->arm_lpae_s1_cfg.ttbr[0] = virt_to_phys(data->pgd);
@@ -744,11 +814,12 @@ arm_64_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie)
 	cfg->arm_lpae_s2_cfg.vtcr = reg;
 
 	/* Allocate pgd pages */
-	data->pgd = alloc_pages_exact(data->pgd_size, GFP_KERNEL | __GFP_ZERO);
+	data->pgd = __arm_lpae_alloc_pages(data->pgd_size, GFP_KERNEL, cfg);
 	if (!data->pgd)
 		goto out_free_data;
 
-	cfg->tlb->flush_pgtable(data->pgd, data->pgd_size, cookie);
+	/* Ensure the empty pgd is visible before any actual TTBR write */
+	wmb();
 
 	/* VTTBR */
 	cfg->arm_lpae_s2_cfg.vttbr = virt_to_phys(data->pgd);
@@ -834,16 +905,10 @@ static void dummy_tlb_sync(void *cookie)
 	WARN_ON(cookie != cfg_cookie);
 }
 
-static void dummy_flush_pgtable(void *ptr, size_t size, void *cookie)
-{
-	WARN_ON(cookie != cfg_cookie);
-}
-
 static struct iommu_gather_ops dummy_tlb_ops __initdata = {
 	.tlb_flush_all	= dummy_tlb_flush_all,
 	.tlb_add_flush	= dummy_tlb_add_flush,
 	.tlb_sync	= dummy_tlb_sync,
-	.flush_pgtable	= dummy_flush_pgtable,
 };
 
 static void __init arm_lpae_dump_ops(struct io_pgtable_ops *ops)
diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c
index 6436fe24bc2f..6f2e319d4f04 100644
--- a/drivers/iommu/io-pgtable.c
+++ b/drivers/iommu/io-pgtable.c
@@ -24,11 +24,6 @@
 
 #include "io-pgtable.h"
 
-extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s1_init_fns;
-extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s2_init_fns;
-extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s1_init_fns;
-extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns;
-
 static const struct io_pgtable_init_fns *
 io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] =
 {
diff --git a/drivers/iommu/io-pgtable.h b/drivers/iommu/io-pgtable.h
index 10e32f69c668..ac9e2341a633 100644
--- a/drivers/iommu/io-pgtable.h
+++ b/drivers/iommu/io-pgtable.h
@@ -17,8 +17,9 @@ enum io_pgtable_fmt {
  *
  * @tlb_flush_all: Synchronously invalidate the entire TLB context.
  * @tlb_add_flush: Queue up a TLB invalidation for a virtual address range.
- * @tlb_sync:      Ensure any queue TLB invalidation has taken effect.
- * @flush_pgtable: Ensure page table updates are visible to the IOMMU.
+ * @tlb_sync:      Ensure any queued TLB invalidation has taken effect, and
+ *                 any corresponding page table updates are visible to the
+ *                 IOMMU.
  *
  * Note that these can all be called in atomic context and must therefore
  * not block.
@@ -28,7 +29,6 @@ struct iommu_gather_ops {
 	void (*tlb_add_flush)(unsigned long iova, size_t size, bool leaf,
 			      void *cookie);
 	void (*tlb_sync)(void *cookie);
-	void (*flush_pgtable)(void *ptr, size_t size, void *cookie);
 };
 
 /**
@@ -41,6 +41,8 @@ struct iommu_gather_ops {
  * @ias:           Input address (iova) size, in bits.
  * @oas:           Output address (paddr) size, in bits.
  * @tlb:           TLB management callbacks for this set of tables.
+ * @iommu_dev:     The device representing the DMA configuration for the
+ *                 page table walker.
  */
 struct io_pgtable_cfg {
 	#define IO_PGTABLE_QUIRK_ARM_NS	(1 << 0)	/* Set NS bit in PTEs */
@@ -49,6 +51,7 @@ struct io_pgtable_cfg {
 	unsigned int			ias;
 	unsigned int			oas;
 	const struct iommu_gather_ops	*tlb;
+	struct device			*iommu_dev;
 
 	/* Low-level data specific to the table format */
 	union {
@@ -140,4 +143,9 @@ struct io_pgtable_init_fns {
 	void (*free)(struct io_pgtable *iop);
 };
 
+extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s1_init_fns;
+extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s2_init_fns;
+extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s1_init_fns;
+extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns;
+
 #endif /* __IO_PGTABLE_H */
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index d4f527e56679..a67b5a43abc6 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -16,7 +16,7 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  */
 
-#define pr_fmt(fmt)    "%s: " fmt, __func__
+#define pr_fmt(fmt)    "iommu: " fmt
 
 #include <linux/device.h>
 #include <linux/kernel.h>
@@ -51,6 +51,8 @@ struct iommu_group {
 	void (*iommu_data_release)(void *iommu_data);
 	char *name;
 	int id;
+	struct iommu_domain *default_domain;
+	struct iommu_domain *domain;
 };
 
 struct iommu_device {
@@ -75,6 +77,15 @@ struct iommu_group_attribute iommu_group_attr_##_name =		\
 #define to_iommu_group(_kobj)		\
 	container_of(_kobj, struct iommu_group, kobj)
 
+static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus,
+						 unsigned type);
+static int __iommu_attach_device(struct iommu_domain *domain,
+				 struct device *dev);
+static int __iommu_attach_group(struct iommu_domain *domain,
+				struct iommu_group *group);
+static void __iommu_detach_group(struct iommu_domain *domain,
+				 struct iommu_group *group);
+
 static ssize_t iommu_group_attr_show(struct kobject *kobj,
 				     struct attribute *__attr, char *buf)
 {
@@ -128,6 +139,8 @@ static void iommu_group_release(struct kobject *kobj)
 {
 	struct iommu_group *group = to_iommu_group(kobj);
 
+	pr_debug("Releasing group %d\n", group->id);
+
 	if (group->iommu_data_release)
 		group->iommu_data_release(group->iommu_data);
 
@@ -135,6 +148,9 @@ static void iommu_group_release(struct kobject *kobj)
 	ida_remove(&iommu_group_ida, group->id);
 	mutex_unlock(&iommu_group_mutex);
 
+	if (group->default_domain)
+		iommu_domain_free(group->default_domain);
+
 	kfree(group->name);
 	kfree(group);
 }
@@ -207,6 +223,8 @@ again:
 	 */
 	kobject_put(&group->kobj);
 
+	pr_debug("Allocated group %d\n", group->id);
+
 	return group;
 }
 EXPORT_SYMBOL_GPL(iommu_group_alloc);
@@ -307,6 +325,52 @@ int iommu_group_set_name(struct iommu_group *group, const char *name)
 }
 EXPORT_SYMBOL_GPL(iommu_group_set_name);
 
+static int iommu_group_create_direct_mappings(struct iommu_group *group,
+					      struct device *dev)
+{
+	struct iommu_domain *domain = group->default_domain;
+	struct iommu_dm_region *entry;
+	struct list_head mappings;
+	unsigned long pg_size;
+	int ret = 0;
+
+	if (!domain || domain->type != IOMMU_DOMAIN_DMA)
+		return 0;
+
+	BUG_ON(!domain->ops->pgsize_bitmap);
+
+	pg_size = 1UL << __ffs(domain->ops->pgsize_bitmap);
+	INIT_LIST_HEAD(&mappings);
+
+	iommu_get_dm_regions(dev, &mappings);
+
+	/* We need to consider overlapping regions for different devices */
+	list_for_each_entry(entry, &mappings, list) {
+		dma_addr_t start, end, addr;
+
+		start = ALIGN(entry->start, pg_size);
+		end   = ALIGN(entry->start + entry->length, pg_size);
+
+		for (addr = start; addr < end; addr += pg_size) {
+			phys_addr_t phys_addr;
+
+			phys_addr = iommu_iova_to_phys(domain, addr);
+			if (phys_addr)
+				continue;
+
+			ret = iommu_map(domain, addr, addr, pg_size, entry->prot);
+			if (ret)
+				goto out;
+		}
+
+	}
+
+out:
+	iommu_put_dm_regions(dev, &mappings);
+
+	return ret;
+}
+
 /**
  * iommu_group_add_device - add a device to an iommu group
  * @group: the group into which to add the device (reference should be held)
@@ -363,8 +427,12 @@ rename:
 
 	dev->iommu_group = group;
 
+	iommu_group_create_direct_mappings(group, dev);
+
 	mutex_lock(&group->mutex);
 	list_add_tail(&device->list, &group->devices);
+	if (group->domain)
+		__iommu_attach_device(group->domain, dev);
 	mutex_unlock(&group->mutex);
 
 	/* Notify any listeners about change to group. */
@@ -372,6 +440,9 @@ rename:
 				     IOMMU_GROUP_NOTIFY_ADD_DEVICE, dev);
 
 	trace_add_device_to_group(group->id, dev);
+
+	pr_info("Adding device %s to group %d\n", dev_name(dev), group->id);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(iommu_group_add_device);
@@ -388,6 +459,8 @@ void iommu_group_remove_device(struct device *dev)
 	struct iommu_group *group = dev->iommu_group;
 	struct iommu_device *tmp_device, *device = NULL;
 
+	pr_info("Removing device %s from group %d\n", dev_name(dev), group->id);
+
 	/* Pre-notify listeners that a device is being removed. */
 	blocking_notifier_call_chain(&group->notifier,
 				     IOMMU_GROUP_NOTIFY_DEL_DEVICE, dev);
@@ -417,6 +490,17 @@ void iommu_group_remove_device(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(iommu_group_remove_device);
 
+static int iommu_group_device_count(struct iommu_group *group)
+{
+	struct iommu_device *entry;
+	int ret = 0;
+
+	list_for_each_entry(entry, &group->devices, list)
+		ret++;
+
+	return ret;
+}
+
 /**
  * iommu_group_for_each_dev - iterate over each device in the group
  * @group: the group
@@ -428,19 +512,30 @@ EXPORT_SYMBOL_GPL(iommu_group_remove_device);
  * The group->mutex is held across callbacks, which will block calls to
  * iommu_group_add/remove_device.
  */
-int iommu_group_for_each_dev(struct iommu_group *group, void *data,
-			     int (*fn)(struct device *, void *))
+static int __iommu_group_for_each_dev(struct iommu_group *group, void *data,
+				      int (*fn)(struct device *, void *))
 {
 	struct iommu_device *device;
 	int ret = 0;
 
-	mutex_lock(&group->mutex);
 	list_for_each_entry(device, &group->devices, list) {
 		ret = fn(device->dev, data);
 		if (ret)
 			break;
 	}
+	return ret;
+}
+
+
+int iommu_group_for_each_dev(struct iommu_group *group, void *data,
+			     int (*fn)(struct device *, void *))
+{
+	int ret;
+
+	mutex_lock(&group->mutex);
+	ret = __iommu_group_for_each_dev(group, data, fn);
 	mutex_unlock(&group->mutex);
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(iommu_group_for_each_dev);
@@ -692,7 +787,11 @@ static struct iommu_group *iommu_group_get_for_pci_dev(struct pci_dev *pdev)
 		return group;
 
 	/* No shared group found, allocate new */
-	return iommu_group_alloc();
+	group = iommu_group_alloc();
+	if (IS_ERR(group))
+		return NULL;
+
+	return group;
 }
 
 /**
@@ -722,6 +821,16 @@ struct iommu_group *iommu_group_get_for_dev(struct device *dev)
 	if (IS_ERR(group))
 		return group;
 
+	/*
+	 * Try to allocate a default domain - needs support from the
+	 * IOMMU driver.
+	 */
+	if (!group->default_domain) {
+		group->default_domain = __iommu_domain_alloc(dev->bus,
+							     IOMMU_DOMAIN_DMA);
+		group->domain = group->default_domain;
+	}
+
 	ret = iommu_group_add_device(group, dev);
 	if (ret) {
 		iommu_group_put(group);
@@ -731,17 +840,42 @@ struct iommu_group *iommu_group_get_for_dev(struct device *dev)
 	return group;
 }
 
+struct iommu_domain *iommu_group_default_domain(struct iommu_group *group)
+{
+	return group->default_domain;
+}
+
 static int add_iommu_group(struct device *dev, void *data)
 {
 	struct iommu_callback_data *cb = data;
 	const struct iommu_ops *ops = cb->ops;
+	int ret;
 
 	if (!ops->add_device)
 		return 0;
 
 	WARN_ON(dev->iommu_group);
 
-	ops->add_device(dev);
+	ret = ops->add_device(dev);
+
+	/*
+	 * We ignore -ENODEV errors for now, as they just mean that the
+	 * device is not translated by an IOMMU. We still care about
+	 * other errors and fail to initialize when they happen.
+	 */
+	if (ret == -ENODEV)
+		ret = 0;
+
+	return ret;
+}
+
+static int remove_iommu_group(struct device *dev, void *data)
+{
+	struct iommu_callback_data *cb = data;
+	const struct iommu_ops *ops = cb->ops;
+
+	if (ops->remove_device && dev->iommu_group)
+		ops->remove_device(dev);
 
 	return 0;
 }
@@ -761,7 +895,7 @@ static int iommu_bus_notifier(struct notifier_block *nb,
 	if (action == BUS_NOTIFY_ADD_DEVICE) {
 		if (ops->add_device)
 			return ops->add_device(dev);
-	} else if (action == BUS_NOTIFY_DEL_DEVICE) {
+	} else if (action == BUS_NOTIFY_REMOVED_DEVICE) {
 		if (ops->remove_device && dev->iommu_group) {
 			ops->remove_device(dev);
 			return 0;
@@ -814,19 +948,25 @@ static int iommu_bus_init(struct bus_type *bus, const struct iommu_ops *ops)
 	nb->notifier_call = iommu_bus_notifier;
 
 	err = bus_register_notifier(bus, nb);
-	if (err) {
-		kfree(nb);
-		return err;
-	}
+	if (err)
+		goto out_free;
 
 	err = bus_for_each_dev(bus, NULL, &cb, add_iommu_group);
-	if (err) {
-		bus_unregister_notifier(bus, nb);
-		kfree(nb);
-		return err;
-	}
+	if (err)
+		goto out_err;
+
 
 	return 0;
+
+out_err:
+	/* Clean up */
+	bus_for_each_dev(bus, NULL, &cb, remove_iommu_group);
+	bus_unregister_notifier(bus, nb);
+
+out_free:
+	kfree(nb);
+
+	return err;
 }
 
 /**
@@ -898,22 +1038,28 @@ void iommu_set_fault_handler(struct iommu_domain *domain,
 }
 EXPORT_SYMBOL_GPL(iommu_set_fault_handler);
 
-struct iommu_domain *iommu_domain_alloc(struct bus_type *bus)
+static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus,
+						 unsigned type)
 {
 	struct iommu_domain *domain;
 
 	if (bus == NULL || bus->iommu_ops == NULL)
 		return NULL;
 
-	domain = bus->iommu_ops->domain_alloc(IOMMU_DOMAIN_UNMANAGED);
+	domain = bus->iommu_ops->domain_alloc(type);
 	if (!domain)
 		return NULL;
 
 	domain->ops  = bus->iommu_ops;
-	domain->type = IOMMU_DOMAIN_UNMANAGED;
+	domain->type = type;
 
 	return domain;
 }
+
+struct iommu_domain *iommu_domain_alloc(struct bus_type *bus)
+{
+	return __iommu_domain_alloc(bus, IOMMU_DOMAIN_UNMANAGED);
+}
 EXPORT_SYMBOL_GPL(iommu_domain_alloc);
 
 void iommu_domain_free(struct iommu_domain *domain)
@@ -922,7 +1068,8 @@ void iommu_domain_free(struct iommu_domain *domain)
 }
 EXPORT_SYMBOL_GPL(iommu_domain_free);
 
-int iommu_attach_device(struct iommu_domain *domain, struct device *dev)
+static int __iommu_attach_device(struct iommu_domain *domain,
+				 struct device *dev)
 {
 	int ret;
 	if (unlikely(domain->ops->attach_dev == NULL))
@@ -933,9 +1080,38 @@ int iommu_attach_device(struct iommu_domain *domain, struct device *dev)
 		trace_attach_device_to_domain(dev);
 	return ret;
 }
+
+int iommu_attach_device(struct iommu_domain *domain, struct device *dev)
+{
+	struct iommu_group *group;
+	int ret;
+
+	group = iommu_group_get(dev);
+	/* FIXME: Remove this when groups a mandatory for iommu drivers */
+	if (group == NULL)
+		return __iommu_attach_device(domain, dev);
+
+	/*
+	 * We have a group - lock it to make sure the device-count doesn't
+	 * change while we are attaching
+	 */
+	mutex_lock(&group->mutex);
+	ret = -EINVAL;
+	if (iommu_group_device_count(group) != 1)
+		goto out_unlock;
+
+	ret = __iommu_attach_group(domain, group);
+
+out_unlock:
+	mutex_unlock(&group->mutex);
+	iommu_group_put(group);
+
+	return ret;
+}
 EXPORT_SYMBOL_GPL(iommu_attach_device);
 
-void iommu_detach_device(struct iommu_domain *domain, struct device *dev)
+static void __iommu_detach_device(struct iommu_domain *domain,
+				  struct device *dev)
 {
 	if (unlikely(domain->ops->detach_dev == NULL))
 		return;
@@ -943,8 +1119,48 @@ void iommu_detach_device(struct iommu_domain *domain, struct device *dev)
 	domain->ops->detach_dev(domain, dev);
 	trace_detach_device_from_domain(dev);
 }
+
+void iommu_detach_device(struct iommu_domain *domain, struct device *dev)
+{
+	struct iommu_group *group;
+
+	group = iommu_group_get(dev);
+	/* FIXME: Remove this when groups a mandatory for iommu drivers */
+	if (group == NULL)
+		return __iommu_detach_device(domain, dev);
+
+	mutex_lock(&group->mutex);
+	if (iommu_group_device_count(group) != 1) {
+		WARN_ON(1);
+		goto out_unlock;
+	}
+
+	__iommu_detach_group(domain, group);
+
+out_unlock:
+	mutex_unlock(&group->mutex);
+	iommu_group_put(group);
+}
 EXPORT_SYMBOL_GPL(iommu_detach_device);
 
+struct iommu_domain *iommu_get_domain_for_dev(struct device *dev)
+{
+	struct iommu_domain *domain;
+	struct iommu_group *group;
+
+	group = iommu_group_get(dev);
+	/* FIXME: Remove this when groups a mandatory for iommu drivers */
+	if (group == NULL)
+		return NULL;
+
+	domain = group->domain;
+
+	iommu_group_put(group);
+
+	return domain;
+}
+EXPORT_SYMBOL_GPL(iommu_get_domain_for_dev);
+
 /*
  * IOMMU groups are really the natrual working unit of the IOMMU, but
  * the IOMMU API works on domains and devices.  Bridge that gap by
@@ -959,13 +1175,34 @@ static int iommu_group_do_attach_device(struct device *dev, void *data)
 {
 	struct iommu_domain *domain = data;
 
-	return iommu_attach_device(domain, dev);
+	return __iommu_attach_device(domain, dev);
+}
+
+static int __iommu_attach_group(struct iommu_domain *domain,
+				struct iommu_group *group)
+{
+	int ret;
+
+	if (group->default_domain && group->domain != group->default_domain)
+		return -EBUSY;
+
+	ret = __iommu_group_for_each_dev(group, domain,
+					 iommu_group_do_attach_device);
+	if (ret == 0)
+		group->domain = domain;
+
+	return ret;
 }
 
 int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group)
 {
-	return iommu_group_for_each_dev(group, domain,
-					iommu_group_do_attach_device);
+	int ret;
+
+	mutex_lock(&group->mutex);
+	ret = __iommu_attach_group(domain, group);
+	mutex_unlock(&group->mutex);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(iommu_attach_group);
 
@@ -973,14 +1210,40 @@ static int iommu_group_do_detach_device(struct device *dev, void *data)
 {
 	struct iommu_domain *domain = data;
 
-	iommu_detach_device(domain, dev);
+	__iommu_detach_device(domain, dev);
 
 	return 0;
 }
 
+static void __iommu_detach_group(struct iommu_domain *domain,
+				 struct iommu_group *group)
+{
+	int ret;
+
+	if (!group->default_domain) {
+		__iommu_group_for_each_dev(group, domain,
+					   iommu_group_do_detach_device);
+		group->domain = NULL;
+		return;
+	}
+
+	if (group->domain == group->default_domain)
+		return;
+
+	/* Detach by re-attaching to the default domain */
+	ret = __iommu_group_for_each_dev(group, group->default_domain,
+					 iommu_group_do_attach_device);
+	if (ret != 0)
+		WARN_ON(1);
+	else
+		group->domain = group->default_domain;
+}
+
 void iommu_detach_group(struct iommu_domain *domain, struct iommu_group *group)
 {
-	iommu_group_for_each_dev(group, domain, iommu_group_do_detach_device);
+	mutex_lock(&group->mutex);
+	__iommu_detach_group(domain, group);
+	mutex_unlock(&group->mutex);
 }
 EXPORT_SYMBOL_GPL(iommu_detach_group);
 
@@ -1207,7 +1470,7 @@ static int __init iommu_init(void)
 
 	return 0;
 }
-arch_initcall(iommu_init);
+core_initcall(iommu_init);
 
 int iommu_domain_get_attr(struct iommu_domain *domain,
 			  enum iommu_attr attr, void *data)
@@ -1273,3 +1536,72 @@ int iommu_domain_set_attr(struct iommu_domain *domain,
 	return ret;
 }
 EXPORT_SYMBOL_GPL(iommu_domain_set_attr);
+
+void iommu_get_dm_regions(struct device *dev, struct list_head *list)
+{
+	const struct iommu_ops *ops = dev->bus->iommu_ops;
+
+	if (ops && ops->get_dm_regions)
+		ops->get_dm_regions(dev, list);
+}
+
+void iommu_put_dm_regions(struct device *dev, struct list_head *list)
+{
+	const struct iommu_ops *ops = dev->bus->iommu_ops;
+
+	if (ops && ops->put_dm_regions)
+		ops->put_dm_regions(dev, list);
+}
+
+/* Request that a device is direct mapped by the IOMMU */
+int iommu_request_dm_for_dev(struct device *dev)
+{
+	struct iommu_domain *dm_domain;
+	struct iommu_group *group;
+	int ret;
+
+	/* Device must already be in a group before calling this function */
+	group = iommu_group_get_for_dev(dev);
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	mutex_lock(&group->mutex);
+
+	/* Check if the default domain is already direct mapped */
+	ret = 0;
+	if (group->default_domain &&
+	    group->default_domain->type == IOMMU_DOMAIN_IDENTITY)
+		goto out;
+
+	/* Don't change mappings of existing devices */
+	ret = -EBUSY;
+	if (iommu_group_device_count(group) != 1)
+		goto out;
+
+	/* Allocate a direct mapped domain */
+	ret = -ENOMEM;
+	dm_domain = __iommu_domain_alloc(dev->bus, IOMMU_DOMAIN_IDENTITY);
+	if (!dm_domain)
+		goto out;
+
+	/* Attach the device to the domain */
+	ret = __iommu_attach_group(dm_domain, group);
+	if (ret) {
+		iommu_domain_free(dm_domain);
+		goto out;
+	}
+
+	/* Make the direct mapped domain the default for this group */
+	if (group->default_domain)
+		iommu_domain_free(group->default_domain);
+	group->default_domain = dm_domain;
+
+	pr_info("Using direct mapping for device %s\n", dev_name(dev));
+
+	ret = 0;
+out:
+	mutex_unlock(&group->mutex);
+	iommu_group_put(group);
+
+	return ret;
+}
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 9dd8208312c2..400f4d1b566f 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -20,40 +20,6 @@
 #include <linux/iova.h>
 #include <linux/slab.h>
 
-static struct kmem_cache *iommu_iova_cache;
-
-int iommu_iova_cache_init(void)
-{
-	int ret = 0;
-
-	iommu_iova_cache = kmem_cache_create("iommu_iova",
-					 sizeof(struct iova),
-					 0,
-					 SLAB_HWCACHE_ALIGN,
-					 NULL);
-	if (!iommu_iova_cache) {
-		pr_err("Couldn't create iova cache\n");
-		ret = -ENOMEM;
-	}
-
-	return ret;
-}
-
-void iommu_iova_cache_destroy(void)
-{
-	kmem_cache_destroy(iommu_iova_cache);
-}
-
-struct iova *alloc_iova_mem(void)
-{
-	return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
-}
-
-void free_iova_mem(struct iova *iova)
-{
-	kmem_cache_free(iommu_iova_cache, iova);
-}
-
 void
 init_iova_domain(struct iova_domain *iovad, unsigned long granule,
 	unsigned long start_pfn, unsigned long pfn_32bit)
@@ -227,6 +193,7 @@ iova_insert_rbtree(struct rb_root *root, struct iova *iova)
 	/* Figure out where to put new node */
 	while (*new) {
 		struct iova *this = container_of(*new, struct iova, node);
+
 		parent = *new;
 
 		if (iova->pfn_lo < this->pfn_lo)
@@ -241,6 +208,55 @@ iova_insert_rbtree(struct rb_root *root, struct iova *iova)
 	rb_insert_color(&iova->node, root);
 }
 
+static struct kmem_cache *iova_cache;
+static unsigned int iova_cache_users;
+static DEFINE_MUTEX(iova_cache_mutex);
+
+struct iova *alloc_iova_mem(void)
+{
+	return kmem_cache_alloc(iova_cache, GFP_ATOMIC);
+}
+EXPORT_SYMBOL(alloc_iova_mem);
+
+void free_iova_mem(struct iova *iova)
+{
+	kmem_cache_free(iova_cache, iova);
+}
+EXPORT_SYMBOL(free_iova_mem);
+
+int iova_cache_get(void)
+{
+	mutex_lock(&iova_cache_mutex);
+	if (!iova_cache_users) {
+		iova_cache = kmem_cache_create(
+			"iommu_iova", sizeof(struct iova), 0,
+			SLAB_HWCACHE_ALIGN, NULL);
+		if (!iova_cache) {
+			mutex_unlock(&iova_cache_mutex);
+			printk(KERN_ERR "Couldn't create iova cache\n");
+			return -ENOMEM;
+		}
+	}
+
+	iova_cache_users++;
+	mutex_unlock(&iova_cache_mutex);
+
+	return 0;
+}
+
+void iova_cache_put(void)
+{
+	mutex_lock(&iova_cache_mutex);
+	if (WARN_ON(!iova_cache_users)) {
+		mutex_unlock(&iova_cache_mutex);
+		return;
+	}
+	iova_cache_users--;
+	if (!iova_cache_users)
+		kmem_cache_destroy(iova_cache);
+	mutex_unlock(&iova_cache_mutex);
+}
+
 /**
  * alloc_iova - allocates an iova
  * @iovad: - iova domain in question
@@ -350,6 +366,7 @@ void
 free_iova(struct iova_domain *iovad, unsigned long pfn)
 {
 	struct iova *iova = find_iova(iovad, pfn);
+
 	if (iova)
 		__free_iova(iovad, iova);
 
@@ -369,6 +386,7 @@ void put_iova_domain(struct iova_domain *iovad)
 	node = rb_first(&iovad->rbroot);
 	while (node) {
 		struct iova *iova = container_of(node, struct iova, node);
+
 		rb_erase(node, &iovad->rbroot);
 		free_iova_mem(iova);
 		node = rb_first(&iovad->rbroot);
@@ -482,6 +500,7 @@ copy_reserved_iova(struct iova_domain *from, struct iova_domain *to)
 	for (node = rb_first(&from->rbroot); node; node = rb_next(node)) {
 		struct iova *iova = container_of(node, struct iova, node);
 		struct iova *new_iova;
+
 		new_iova = reserve_iova(to, iova->pfn_lo, iova->pfn_hi);
 		if (!new_iova)
 			printk(KERN_ERR "Reserve iova range %lx@%lx failed\n",
diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c
index 1a67c531a07e..8cf605fa9946 100644
--- a/drivers/iommu/ipmmu-vmsa.c
+++ b/drivers/iommu/ipmmu-vmsa.c
@@ -283,24 +283,10 @@ static void ipmmu_tlb_add_flush(unsigned long iova, size_t size, bool leaf,
 	/* The hardware doesn't support selective TLB flush. */
 }
 
-static void ipmmu_flush_pgtable(void *ptr, size_t size, void *cookie)
-{
-	unsigned long offset = (unsigned long)ptr & ~PAGE_MASK;
-	struct ipmmu_vmsa_domain *domain = cookie;
-
-	/*
-	 * TODO: Add support for coherent walk through CCI with DVM and remove
-	 * cache handling.
-	 */
-	dma_map_page(domain->mmu->dev, virt_to_page(ptr), offset, size,
-		     DMA_TO_DEVICE);
-}
-
 static struct iommu_gather_ops ipmmu_gather_ops = {
 	.tlb_flush_all = ipmmu_tlb_flush_all,
 	.tlb_add_flush = ipmmu_tlb_add_flush,
 	.tlb_sync = ipmmu_tlb_flush_all,
-	.flush_pgtable = ipmmu_flush_pgtable,
 };
 
 /* -----------------------------------------------------------------------------
@@ -327,6 +313,11 @@ static int ipmmu_domain_init_context(struct ipmmu_vmsa_domain *domain)
 	domain->cfg.ias = 32;
 	domain->cfg.oas = 40;
 	domain->cfg.tlb = &ipmmu_gather_ops;
+	/*
+	 * TODO: Add support for coherent walk through CCI with DVM and remove
+	 * cache handling. For now, delegate it to the io-pgtable code.
+	 */
+	domain->cfg.iommu_dev = domain->mmu->dev;
 
 	domain->iop = alloc_io_pgtable_ops(ARM_32_LPAE_S1, &domain->cfg,
 					   domain);
diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c
index 43429ab62228..60ba238090d9 100644
--- a/drivers/iommu/of_iommu.c
+++ b/drivers/iommu/of_iommu.c
@@ -141,10 +141,12 @@ struct iommu_ops *of_iommu_configure(struct device *dev,
 	struct iommu_ops *ops = NULL;
 	int idx = 0;
 
-	if (dev_is_pci(dev)) {
-		dev_err(dev, "IOMMU is currently not supported for PCI\n");
+	/*
+	 * We can't do much for PCI devices without knowing how
+	 * device IDs are wired up from the PCI bus to the IOMMU.
+	 */
+	if (dev_is_pci(dev))
 		return NULL;
-	}
 
 	/*
 	 * We don't currently walk up the tree looking for a parent IOMMU.
diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index dda4927e47a6..801bd75f7266 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -22,7 +22,7 @@ obj-$(CONFIG_ARCH_SPEAR3XX)		+= spear-shirq.o
 obj-$(CONFIG_ARM_GIC)			+= irq-gic.o irq-gic-common.o
 obj-$(CONFIG_ARM_GIC_V2M)		+= irq-gic-v2m.o
 obj-$(CONFIG_ARM_GIC_V3)		+= irq-gic-v3.o irq-gic-common.o
-obj-$(CONFIG_ARM_GIC_V3_ITS)		+= irq-gic-v3-its.o
+obj-$(CONFIG_ARM_GIC_V3_ITS)		+= irq-gic-v3-its.o irq-gic-v3-its-pci-msi.o
 obj-$(CONFIG_ARM_NVIC)			+= irq-nvic.o
 obj-$(CONFIG_ARM_VIC)			+= irq-vic.o
 obj-$(CONFIG_ATMEL_AIC_IRQ)		+= irq-atmel-aic-common.o irq-atmel-aic.o
diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c
index fdf706555d72..ec9c37674a0b 100644
--- a/drivers/irqchip/irq-gic-v2m.c
+++ b/drivers/irqchip/irq-gic-v2m.c
@@ -45,7 +45,6 @@
 
 struct v2m_data {
 	spinlock_t msi_cnt_lock;
-	struct msi_controller mchip;
 	struct resource res;	/* GICv2m resource */
 	void __iomem *base;	/* GICv2m virt address */
 	u32 spi_start;		/* The SPI number that MSIs start */
@@ -218,6 +217,7 @@ static int __init gicv2m_init_one(struct device_node *node,
 {
 	int ret;
 	struct v2m_data *v2m;
+	struct irq_domain *inner_domain;
 
 	v2m = kzalloc(sizeof(struct v2m_data), GFP_KERNEL);
 	if (!v2m) {
@@ -261,19 +261,18 @@ static int __init gicv2m_init_one(struct device_node *node,
 		goto err_iounmap;
 	}
 
-	v2m->domain = irq_domain_add_tree(NULL, &gicv2m_domain_ops, v2m);
-	if (!v2m->domain) {
+	inner_domain = irq_domain_add_tree(node, &gicv2m_domain_ops, v2m);
+	if (!inner_domain) {
 		pr_err("Failed to create GICv2m domain\n");
 		ret = -ENOMEM;
 		goto err_free_bm;
 	}
 
-	v2m->domain->parent = parent;
-	v2m->mchip.of_node = node;
-	v2m->mchip.domain = pci_msi_create_irq_domain(node,
-						      &gicv2m_msi_domain_info,
-						      v2m->domain);
-	if (!v2m->mchip.domain) {
+	inner_domain->bus_token = DOMAIN_BUS_NEXUS;
+	inner_domain->parent = parent;
+	v2m->domain = pci_msi_create_irq_domain(node, &gicv2m_msi_domain_info,
+						inner_domain);
+	if (!v2m->domain) {
 		pr_err("Failed to create MSI domain\n");
 		ret = -ENOMEM;
 		goto err_free_domains;
@@ -281,12 +280,6 @@ static int __init gicv2m_init_one(struct device_node *node,
 
 	spin_lock_init(&v2m->msi_cnt_lock);
 
-	ret = of_pci_msi_chip_add(&v2m->mchip);
-	if (ret) {
-		pr_err("Failed to add msi_chip.\n");
-		goto err_free_domains;
-	}
-
 	pr_info("Node %s: range[%#lx:%#lx], SPI[%d:%d]\n", node->name,
 		(unsigned long)v2m->res.start, (unsigned long)v2m->res.end,
 		v2m->spi_start, (v2m->spi_start + v2m->nr_spis));
@@ -294,10 +287,10 @@ static int __init gicv2m_init_one(struct device_node *node,
 	return 0;
 
 err_free_domains:
-	if (v2m->mchip.domain)
-		irq_domain_remove(v2m->mchip.domain);
 	if (v2m->domain)
 		irq_domain_remove(v2m->domain);
+	if (inner_domain)
+		irq_domain_remove(inner_domain);
 err_free_bm:
 	kfree(v2m->bm);
 err_iounmap:
diff --git a/drivers/irqchip/irq-gic-v3-its-pci-msi.c b/drivers/irqchip/irq-gic-v3-its-pci-msi.c
new file mode 100644
index 000000000000..cf351c637464
--- /dev/null
+++ b/drivers/irqchip/irq-gic-v3-its-pci-msi.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (C) 2013-2015 ARM Limited, All Rights Reserved.
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/msi.h>
+#include <linux/of.h>
+#include <linux/of_irq.h>
+#include <linux/of_pci.h>
+
+static void its_mask_msi_irq(struct irq_data *d)
+{
+	pci_msi_mask_irq(d);
+	irq_chip_mask_parent(d);
+}
+
+static void its_unmask_msi_irq(struct irq_data *d)
+{
+	pci_msi_unmask_irq(d);
+	irq_chip_unmask_parent(d);
+}
+
+static struct irq_chip its_msi_irq_chip = {
+	.name			= "ITS-MSI",
+	.irq_unmask		= its_unmask_msi_irq,
+	.irq_mask		= its_mask_msi_irq,
+	.irq_eoi		= irq_chip_eoi_parent,
+	.irq_write_msi_msg	= pci_msi_domain_write_msg,
+};
+
+struct its_pci_alias {
+	struct pci_dev	*pdev;
+	u32		dev_id;
+	u32		count;
+};
+
+static int its_pci_msi_vec_count(struct pci_dev *pdev)
+{
+	int msi, msix;
+
+	msi = max(pci_msi_vec_count(pdev), 0);
+	msix = max(pci_msix_vec_count(pdev), 0);
+
+	return max(msi, msix);
+}
+
+static int its_get_pci_alias(struct pci_dev *pdev, u16 alias, void *data)
+{
+	struct its_pci_alias *dev_alias = data;
+
+	dev_alias->dev_id = alias;
+	if (pdev != dev_alias->pdev)
+		dev_alias->count += its_pci_msi_vec_count(dev_alias->pdev);
+
+	return 0;
+}
+
+static int its_pci_msi_prepare(struct irq_domain *domain, struct device *dev,
+			       int nvec, msi_alloc_info_t *info)
+{
+	struct pci_dev *pdev;
+	struct its_pci_alias dev_alias;
+	struct msi_domain_info *msi_info;
+
+	if (!dev_is_pci(dev))
+		return -EINVAL;
+
+	msi_info = msi_get_domain_info(domain->parent);
+
+	pdev = to_pci_dev(dev);
+	dev_alias.pdev = pdev;
+	dev_alias.count = nvec;
+
+	pci_for_each_dma_alias(pdev, its_get_pci_alias, &dev_alias);
+
+	/* ITS specific DeviceID, as the core ITS ignores dev. */
+	info->scratchpad[0].ul = dev_alias.dev_id;
+
+	return msi_info->ops->msi_prepare(domain->parent,
+					  dev, dev_alias.count, info);
+}
+
+static struct msi_domain_ops its_pci_msi_ops = {
+	.msi_prepare	= its_pci_msi_prepare,
+};
+
+static struct msi_domain_info its_pci_msi_domain_info = {
+	.flags	= (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+		   MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX),
+	.ops	= &its_pci_msi_ops,
+	.chip	= &its_msi_irq_chip,
+};
+
+static struct of_device_id its_device_id[] = {
+	{	.compatible	= "arm,gic-v3-its",	},
+	{},
+};
+
+static int __init its_pci_msi_init(void)
+{
+	struct device_node *np;
+	struct irq_domain *parent;
+
+	for (np = of_find_matching_node(NULL, its_device_id); np;
+	     np = of_find_matching_node(np, its_device_id)) {
+		if (!of_property_read_bool(np, "msi-controller"))
+			continue;
+
+		parent = irq_find_matching_host(np, DOMAIN_BUS_NEXUS);
+		if (!parent || !msi_get_domain_info(parent)) {
+			pr_err("%s: unable to locate ITS domain\n",
+			       np->full_name);
+			continue;
+		}
+
+		if (!pci_msi_create_irq_domain(np, &its_pci_msi_domain_info,
+					       parent)) {
+			pr_err("%s: unable to create PCI domain\n",
+			       np->full_name);
+			continue;
+		}
+
+		pr_info("PCI/MSI: %s domain created\n", np->full_name);
+	}
+
+	return 0;
+}
+early_initcall(its_pci_msi_init);
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 9a791dd52199..78db25293384 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -54,14 +54,12 @@ struct its_collection {
 
 /*
  * The ITS structure - contains most of the infrastructure, with the
- * msi_controller, the command queue, the collections, and the list of
- * devices writing to it.
+ * top-level MSI domain, the command queue, the collections, and the
+ * list of devices writing to it.
  */
 struct its_node {
 	raw_spinlock_t		lock;
 	struct list_head	entry;
-	struct msi_controller	msi_chip;
-	struct irq_domain	*domain;
 	void __iomem		*base;
 	unsigned long		phys_base;
 	struct its_cmd_block	*cmd_base;
@@ -643,26 +641,6 @@ static struct irq_chip its_irq_chip = {
 	.irq_compose_msi_msg	= its_irq_compose_msi_msg,
 };
 
-static void its_mask_msi_irq(struct irq_data *d)
-{
-	pci_msi_mask_irq(d);
-	irq_chip_mask_parent(d);
-}
-
-static void its_unmask_msi_irq(struct irq_data *d)
-{
-	pci_msi_unmask_irq(d);
-	irq_chip_unmask_parent(d);
-}
-
-static struct irq_chip its_msi_irq_chip = {
-	.name			= "ITS-MSI",
-	.irq_unmask		= its_unmask_msi_irq,
-	.irq_mask		= its_mask_msi_irq,
-	.irq_eoi		= irq_chip_eoi_parent,
-	.irq_write_msi_msg	= pci_msi_domain_write_msg,
-};
-
 /*
  * How we allocate LPIs:
  *
@@ -742,6 +720,9 @@ static unsigned long *its_lpi_alloc_chunks(int nr_irqs, int *base, int *nr_ids)
 out:
 	spin_unlock(&lpi_lock);
 
+	if (!bitmap)
+		*base = *nr_ids = 0;
+
 	return bitmap;
 }
 
@@ -831,7 +812,7 @@ static void its_free_tables(struct its_node *its)
 	}
 }
 
-static int its_alloc_tables(struct its_node *its)
+static int its_alloc_tables(const char *node_name, struct its_node *its)
 {
 	int err;
 	int i;
@@ -874,7 +855,7 @@ static int its_alloc_tables(struct its_node *its)
 			if (order >= MAX_ORDER) {
 				order = MAX_ORDER - 1;
 				pr_warn("%s: Device Table too large, reduce its page order to %u\n",
-					its->msi_chip.of_node->full_name, order);
+					node_name, order);
 			}
 		}
 
@@ -946,7 +927,7 @@ retry_baser:
 
 		if (val != tmp) {
 			pr_err("ITS: %s: GITS_BASER%d doesn't stick: %lx %lx\n",
-			       its->msi_chip.of_node->full_name, i,
+			       node_name, i,
 			       (unsigned long) val, (unsigned long) tmp);
 			err = -ENXIO;
 			goto out_free;
@@ -1213,85 +1194,50 @@ static int its_alloc_device_irq(struct its_device *dev, irq_hw_number_t *hwirq)
 	return 0;
 }
 
-struct its_pci_alias {
-	struct pci_dev	*pdev;
-	u32		dev_id;
-	u32		count;
-};
-
-static int its_pci_msi_vec_count(struct pci_dev *pdev)
-{
-	int msi, msix;
-
-	msi = max(pci_msi_vec_count(pdev), 0);
-	msix = max(pci_msix_vec_count(pdev), 0);
-
-	return max(msi, msix);
-}
-
-static int its_get_pci_alias(struct pci_dev *pdev, u16 alias, void *data)
-{
-	struct its_pci_alias *dev_alias = data;
-
-	dev_alias->dev_id = alias;
-	if (pdev != dev_alias->pdev)
-		dev_alias->count += its_pci_msi_vec_count(dev_alias->pdev);
-
-	return 0;
-}
-
 static int its_msi_prepare(struct irq_domain *domain, struct device *dev,
 			   int nvec, msi_alloc_info_t *info)
 {
-	struct pci_dev *pdev;
 	struct its_node *its;
 	struct its_device *its_dev;
-	struct its_pci_alias dev_alias;
-
-	if (!dev_is_pci(dev))
-		return -EINVAL;
+	struct msi_domain_info *msi_info;
+	u32 dev_id;
 
-	pdev = to_pci_dev(dev);
-	dev_alias.pdev = pdev;
-	dev_alias.count = nvec;
+	/*
+	 * We ignore "dev" entierely, and rely on the dev_id that has
+	 * been passed via the scratchpad. This limits this domain's
+	 * usefulness to upper layers that definitely know that they
+	 * are built on top of the ITS.
+	 */
+	dev_id = info->scratchpad[0].ul;
 
-	pci_for_each_dma_alias(pdev, its_get_pci_alias, &dev_alias);
-	its = domain->parent->host_data;
+	msi_info = msi_get_domain_info(domain);
+	its = msi_info->data;
 
-	its_dev = its_find_device(its, dev_alias.dev_id);
+	its_dev = its_find_device(its, dev_id);
 	if (its_dev) {
 		/*
 		 * We already have seen this ID, probably through
 		 * another alias (PCI bridge of some sort). No need to
 		 * create the device.
 		 */
-		dev_dbg(dev, "Reusing ITT for devID %x\n", dev_alias.dev_id);
+		pr_debug("Reusing ITT for devID %x\n", dev_id);
 		goto out;
 	}
 
-	its_dev = its_create_device(its, dev_alias.dev_id, dev_alias.count);
+	its_dev = its_create_device(its, dev_id, nvec);
 	if (!its_dev)
 		return -ENOMEM;
 
-	dev_dbg(&pdev->dev, "ITT %d entries, %d bits\n",
-		dev_alias.count, ilog2(dev_alias.count));
+	pr_debug("ITT %d entries, %d bits\n", nvec, ilog2(nvec));
 out:
 	info->scratchpad[0].ptr = its_dev;
-	info->scratchpad[1].ptr = dev;
 	return 0;
 }
 
-static struct msi_domain_ops its_pci_msi_ops = {
+static struct msi_domain_ops its_msi_domain_ops = {
 	.msi_prepare	= its_msi_prepare,
 };
 
-static struct msi_domain_info its_pci_msi_domain_info = {
-	.flags	= (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
-		   MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX),
-	.ops	= &its_pci_msi_ops,
-	.chip	= &its_msi_irq_chip,
-};
-
 static int its_irq_gic_domain_alloc(struct irq_domain *domain,
 				    unsigned int virq,
 				    irq_hw_number_t hwirq)
@@ -1327,9 +1273,9 @@ static int its_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
 
 		irq_domain_set_hwirq_and_chip(domain, virq + i,
 					      hwirq, &its_irq_chip, its_dev);
-		dev_dbg(info->scratchpad[1].ptr, "ID:%d pID:%d vID:%d\n",
-			(int)(hwirq - its_dev->event_map.lpi_base),
-			(int)hwirq, virq + i);
+		pr_debug("ID:%d pID:%d vID:%d\n",
+			 (int)(hwirq - its_dev->event_map.lpi_base),
+			 (int) hwirq, virq + i);
 	}
 
 	return 0;
@@ -1430,6 +1376,7 @@ static int its_probe(struct device_node *node, struct irq_domain *parent)
 	struct resource res;
 	struct its_node *its;
 	void __iomem *its_base;
+	struct irq_domain *inner_domain;
 	u32 val;
 	u64 baser, tmp;
 	int err;
@@ -1473,7 +1420,6 @@ static int its_probe(struct device_node *node, struct irq_domain *parent)
 	INIT_LIST_HEAD(&its->its_device_list);
 	its->base = its_base;
 	its->phys_base = res.start;
-	its->msi_chip.of_node = node;
 	its->ite_size = ((readl_relaxed(its_base + GITS_TYPER) >> 4) & 0xf) + 1;
 
 	its->cmd_base = kzalloc(ITS_CMD_QUEUE_SZ, GFP_KERNEL);
@@ -1483,7 +1429,7 @@ static int its_probe(struct device_node *node, struct irq_domain *parent)
 	}
 	its->cmd_write = its->cmd_base;
 
-	err = its_alloc_tables(its);
+	err = its_alloc_tables(node->full_name, its);
 	if (err)
 		goto out_free_cmd;
 
@@ -1519,26 +1465,27 @@ static int its_probe(struct device_node *node, struct irq_domain *parent)
 	writeq_relaxed(0, its->base + GITS_CWRITER);
 	writel_relaxed(GITS_CTLR_ENABLE, its->base + GITS_CTLR);
 
-	if (of_property_read_bool(its->msi_chip.of_node, "msi-controller")) {
-		its->domain = irq_domain_add_tree(NULL, &its_domain_ops, its);
-		if (!its->domain) {
+	if (of_property_read_bool(node, "msi-controller")) {
+		struct msi_domain_info *info;
+
+		info = kzalloc(sizeof(*info), GFP_KERNEL);
+		if (!info) {
 			err = -ENOMEM;
 			goto out_free_tables;
 		}
 
-		its->domain->parent = parent;
-
-		its->msi_chip.domain = pci_msi_create_irq_domain(node,
-								 &its_pci_msi_domain_info,
-								 its->domain);
-		if (!its->msi_chip.domain) {
+		inner_domain = irq_domain_add_tree(node, &its_domain_ops, its);
+		if (!inner_domain) {
 			err = -ENOMEM;
-			goto out_free_domains;
+			kfree(info);
+			goto out_free_tables;
 		}
 
-		err = of_pci_msi_chip_add(&its->msi_chip);
-		if (err)
-			goto out_free_domains;
+		inner_domain->parent = parent;
+		inner_domain->bus_token = DOMAIN_BUS_NEXUS;
+		info->ops = &its_msi_domain_ops;
+		info->data = its;
+		inner_domain->host_data = info;
 	}
 
 	spin_lock(&its_lock);
@@ -1547,11 +1494,6 @@ static int its_probe(struct device_node *node, struct irq_domain *parent)
 
 	return 0;
 
-out_free_domains:
-	if (its->msi_chip.domain)
-		irq_domain_remove(its->msi_chip.domain);
-	if (its->domain)
-		irq_domain_remove(its->domain);
 out_free_tables:
 	its_free_tables(its);
 out_free_cmd:
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index ab43faddb447..9c083b9d02d3 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/hash.h>
 #include <linux/random.h>
+#include <linux/backing-dev.h>
 
 #include <trace/events/bcache.h>
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 87de9a0848b7..277f977d0101 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2166,7 +2166,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
 			 * the query about congestion status of request_queue
 			 */
 			if (dm_request_based(md))
-				r = md->queue->backing_dev_info.state &
+				r = md->queue->backing_dev_info.wb.state &
 				    bdi_bits;
 			else
 				r = dm_table_any_congested(map, bdi_bits);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 6123c2bf9150..4e984993d40a 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -14,6 +14,7 @@
 #include <linux/device-mapper.h>
 #include <linux/list.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/hdreg.h>
 #include <linux/completion.h>
 #include <linux/kobject.h>
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 4046a6c6f223..7da6e9c3cb53 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -16,6 +16,7 @@
 #define _MD_MD_H
 
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/kobject.h>
 #include <linux/list.h>
 #include <linux/mm.h>
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index bff6c1c7fecb..1342cd2ccb24 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -745,7 +745,7 @@ static int raid1_congested(struct mddev *mddev, int bits)
 	struct r1conf *conf = mddev->private;
 	int i, ret = 0;
 
-	if ((bits & (1 << BDI_async_congested)) &&
+	if ((bits & (1 << WB_async_congested)) &&
 	    conf->pending_count >= max_queued_requests)
 		return 1;
 
@@ -760,7 +760,7 @@ static int raid1_congested(struct mddev *mddev, int bits)
 			/* Note the '|| 1' - when read_balance prefers
 			 * non-congested targets, it can be removed
 			 */
-			if ((bits & (1<<BDI_async_congested)) || 1)
+			if ((bits & (1 << WB_async_congested)) || 1)
 				ret |= bdi_congested(&q->backing_dev_info, bits);
 			else
 				ret &= bdi_congested(&q->backing_dev_info, bits);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index adfc83a0f023..646074d33ceb 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -914,7 +914,7 @@ static int raid10_congested(struct mddev *mddev, int bits)
 	struct r10conf *conf = mddev->private;
 	int i, ret = 0;
 
-	if ((bits & (1 << BDI_async_congested)) &&
+	if ((bits & (1 << WB_async_congested)) &&
 	    conf->pending_count >= max_queued_requests)
 		return 1;
 
diff --git a/drivers/misc/lkdtm.c b/drivers/misc/lkdtm.c
index b5abe34120b8..da9fb01cd441 100644
--- a/drivers/misc/lkdtm.c
+++ b/drivers/misc/lkdtm.c
@@ -103,6 +103,7 @@ enum ctype {
 	CT_EXEC_USERSPACE,
 	CT_ACCESS_USERSPACE,
 	CT_WRITE_RO,
+	CT_WRITE_RO_AFTER_INIT,
 	CT_WRITE_KERN,
 };
 
@@ -140,6 +141,7 @@ static char* cp_type[] = {
 	"EXEC_USERSPACE",
 	"ACCESS_USERSPACE",
 	"WRITE_RO",
+	"WRITE_RO_AFTER_INIT",
 	"WRITE_KERN",
 };
 
@@ -162,6 +164,7 @@ static DEFINE_SPINLOCK(lock_me_up);
 static u8 data_area[EXEC_SIZE];
 
 static const unsigned long rodata = 0xAA55AA55;
+static unsigned long ro_after_init __ro_after_init = 0x55AA5500;
 
 module_param(recur_count, int, 0644);
 MODULE_PARM_DESC(recur_count, " Recursion level for the stack overflow test");
@@ -497,11 +500,28 @@ static void lkdtm_do_action(enum ctype which)
 		break;
 	}
 	case CT_WRITE_RO: {
-		unsigned long *ptr;
+		/* Explicitly cast away "const" for the test. */
+		unsigned long *ptr = (unsigned long *)&rodata;
 
-		ptr = (unsigned long *)&rodata;
+		pr_info("attempting bad rodata write at %p\n", ptr);
+		*ptr ^= 0xabcd1234;
 
-		pr_info("attempting bad write at %p\n", ptr);
+		break;
+	}
+	case CT_WRITE_RO_AFTER_INIT: {
+		unsigned long *ptr = &ro_after_init;
+
+		/*
+		 * Verify we were written to during init. Since an Oops
+		 * is considered a "success", a failure is to just skip the
+		 * real test.
+		 */
+		if ((*ptr & 0xAA) != 0xAA) {
+			pr_info("%p was NOT written during init!?\n", ptr);
+			break;
+		}
+
+		pr_info("attempting bad ro_after_init write at %p\n", ptr);
 		*ptr ^= 0xabcd1234;
 
 		break;
@@ -811,6 +831,9 @@ static int __init lkdtm_module_init(void)
 	int n_debugfs_entries = 1; /* Assume only the direct entry */
 	int i;
 
+	/* Make sure we can write to __ro_after_init values during __init */
+	ro_after_init |= 0xAA;
+
 	/* Register debugfs interface */
 	lkdtm_debugfs_root = debugfs_create_dir("provoke-crash", NULL);
 	if (!lkdtm_debugfs_root) {
diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c
index b16f3cda97ff..e2c0057737e6 100644
--- a/drivers/mtd/devices/block2mtd.c
+++ b/drivers/mtd/devices/block2mtd.c
@@ -20,6 +20,7 @@
 #include <linux/delay.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/bio.h>
 #include <linux/pagemap.h>
 #include <linux/list.h>
diff --git a/drivers/net/ethernet/sfc/mcdi.c b/drivers/net/ethernet/sfc/mcdi.c
index d37928f01949..60a672496484 100644
--- a/drivers/net/ethernet/sfc/mcdi.c
+++ b/drivers/net/ethernet/sfc/mcdi.c
@@ -9,6 +9,8 @@
 
 #include <linux/delay.h>
 #include <asm/cmpxchg.h>
+#include <linux/moduleparam.h>
+#include <linux/atomic.h>
 #include "net_driver.h"
 #include "nic.h"
 #include "io.h"
diff --git a/drivers/net/wireless/ath/ath10k/core.h b/drivers/net/wireless/ath/ath10k/core.h
index f65310c3ba5f..9134e79fa8db 100644
--- a/drivers/net/wireless/ath/ath10k/core.h
+++ b/drivers/net/wireless/ath/ath10k/core.h
@@ -598,7 +598,7 @@ struct ath10k {
 	bool monitor_started;
 	unsigned int filter_flags;
 	unsigned long dev_flags;
-	u32 dfs_block_radar_events;
+	bool dfs_block_radar_events;
 
 	/* protected by conf_mutex */
 	bool radar_enabled;
diff --git a/drivers/net/wireless/ath/ath5k/ath5k.h b/drivers/net/wireless/ath/ath5k/ath5k.h
index 7ca0d6f930fd..cce76b2dabce 100644
--- a/drivers/net/wireless/ath/ath5k/ath5k.h
+++ b/drivers/net/wireless/ath/ath5k/ath5k.h
@@ -1366,7 +1366,7 @@ struct ath5k_hw {
 	u8			ah_retry_long;
 	u8			ah_retry_short;
 
-	u32			ah_use_32khz_clock;
+	bool			ah_use_32khz_clock;
 
 	u8			ah_coverage_class;
 	bool			ah_ack_bitrate_high;
diff --git a/drivers/net/wireless/ath/ath9k/hw.c b/drivers/net/wireless/ath/ath9k/hw.c
index 5e15e8e10ed3..1ad05024d8a3 100644
--- a/drivers/net/wireless/ath/ath9k/hw.c
+++ b/drivers/net/wireless/ath/ath9k/hw.c
@@ -384,7 +384,7 @@ static void ath9k_hw_init_config(struct ath_hw *ah)
 
 	ah->config.dma_beacon_response_time = 1;
 	ah->config.sw_beacon_response_time = 6;
-	ah->config.cwm_ignore_extcca = 0;
+	ah->config.cwm_ignore_extcca = false;
 	ah->config.analog_shiftreg = 1;
 
 	ah->config.rx_intr_mitigation = true;
diff --git a/drivers/net/wireless/ath/ath9k/hw.h b/drivers/net/wireless/ath/ath9k/hw.h
index c1d2d0340feb..6368c664483a 100644
--- a/drivers/net/wireless/ath/ath9k/hw.h
+++ b/drivers/net/wireless/ath/ath9k/hw.h
@@ -332,14 +332,14 @@ enum ath9k_hw_hang_checks {
 struct ath9k_ops_config {
 	int dma_beacon_response_time;
 	int sw_beacon_response_time;
-	u32 cwm_ignore_extcca;
+	bool cwm_ignore_extcca;
 	u32 pcie_waen;
 	u8 analog_shiftreg;
 	u32 ofdm_trig_low;
 	u32 ofdm_trig_high;
 	u32 cck_trig_high;
 	u32 cck_trig_low;
-	u32 enable_paprd;
+	bool enable_paprd;
 	int serialize_regmode;
 	bool rx_intr_mitigation;
 	bool tx_intr_mitigation;
diff --git a/drivers/net/wireless/b43/debugfs.c b/drivers/net/wireless/b43/debugfs.c
index e807bd930647..b4bcd94aff6c 100644
--- a/drivers/net/wireless/b43/debugfs.c
+++ b/drivers/net/wireless/b43/debugfs.c
@@ -676,15 +676,15 @@ static void b43_add_dynamic_debug(struct b43_wldev *dev)
 		e->dyn_debug_dentries[id] = d;		\
 				} while (0)
 
-	add_dyn_dbg("debug_xmitpower", B43_DBG_XMITPOWER, 0);
-	add_dyn_dbg("debug_dmaoverflow", B43_DBG_DMAOVERFLOW, 0);
-	add_dyn_dbg("debug_dmaverbose", B43_DBG_DMAVERBOSE, 0);
-	add_dyn_dbg("debug_pwork_fast", B43_DBG_PWORK_FAST, 0);
-	add_dyn_dbg("debug_pwork_stop", B43_DBG_PWORK_STOP, 0);
-	add_dyn_dbg("debug_lo", B43_DBG_LO, 0);
-	add_dyn_dbg("debug_firmware", B43_DBG_FIRMWARE, 0);
-	add_dyn_dbg("debug_keys", B43_DBG_KEYS, 0);
-	add_dyn_dbg("debug_verbose_stats", B43_DBG_VERBOSESTATS, 0);
+	add_dyn_dbg("debug_xmitpower", B43_DBG_XMITPOWER, false);
+	add_dyn_dbg("debug_dmaoverflow", B43_DBG_DMAOVERFLOW, false);
+	add_dyn_dbg("debug_dmaverbose", B43_DBG_DMAVERBOSE, false);
+	add_dyn_dbg("debug_pwork_fast", B43_DBG_PWORK_FAST, false);
+	add_dyn_dbg("debug_pwork_stop", B43_DBG_PWORK_STOP, false);
+	add_dyn_dbg("debug_lo", B43_DBG_LO, false);
+	add_dyn_dbg("debug_firmware", B43_DBG_FIRMWARE, false);
+	add_dyn_dbg("debug_keys", B43_DBG_KEYS, false);
+	add_dyn_dbg("debug_verbose_stats", B43_DBG_VERBOSESTATS, false);
 
 #undef add_dyn_dbg
 }
diff --git a/drivers/net/wireless/b43/debugfs.h b/drivers/net/wireless/b43/debugfs.h
index 50517b801cb4..d05377745011 100644
--- a/drivers/net/wireless/b43/debugfs.h
+++ b/drivers/net/wireless/b43/debugfs.h
@@ -68,7 +68,7 @@ struct b43_dfsentry {
 	u32 shm32read_addr_next;
 
 	/* Enabled/Disabled list for the dynamic debugging features. */
-	u32 dyn_debug[__B43_NR_DYNDBG];
+	bool dyn_debug[__B43_NR_DYNDBG];
 	/* Dentries for the dynamic debugging entries. */
 	struct dentry *dyn_debug_dentries[__B43_NR_DYNDBG];
 };
diff --git a/drivers/net/wireless/b43legacy/debugfs.c b/drivers/net/wireless/b43legacy/debugfs.c
index 1965edb765a2..090910ea259e 100644
--- a/drivers/net/wireless/b43legacy/debugfs.c
+++ b/drivers/net/wireless/b43legacy/debugfs.c
@@ -369,11 +369,11 @@ static void b43legacy_add_dynamic_debug(struct b43legacy_wldev *dev)
 		e->dyn_debug_dentries[id] = d;		\
 				} while (0)
 
-	add_dyn_dbg("debug_xmitpower", B43legacy_DBG_XMITPOWER, 0);
-	add_dyn_dbg("debug_dmaoverflow", B43legacy_DBG_DMAOVERFLOW, 0);
-	add_dyn_dbg("debug_dmaverbose", B43legacy_DBG_DMAVERBOSE, 0);
-	add_dyn_dbg("debug_pwork_fast", B43legacy_DBG_PWORK_FAST, 0);
-	add_dyn_dbg("debug_pwork_stop", B43legacy_DBG_PWORK_STOP, 0);
+	add_dyn_dbg("debug_xmitpower", B43legacy_DBG_XMITPOWER, false);
+	add_dyn_dbg("debug_dmaoverflow", B43legacy_DBG_DMAOVERFLOW, false);
+	add_dyn_dbg("debug_dmaverbose", B43legacy_DBG_DMAVERBOSE, false);
+	add_dyn_dbg("debug_pwork_fast", B43legacy_DBG_PWORK_FAST, false);
+	add_dyn_dbg("debug_pwork_stop", B43legacy_DBG_PWORK_STOP, false);
 
 #undef add_dyn_dbg
 }
diff --git a/drivers/net/wireless/b43legacy/debugfs.h b/drivers/net/wireless/b43legacy/debugfs.h
index ae3b0d0fa849..9ee32158b947 100644
--- a/drivers/net/wireless/b43legacy/debugfs.h
+++ b/drivers/net/wireless/b43legacy/debugfs.h
@@ -47,7 +47,7 @@ struct b43legacy_dfsentry {
 	struct b43legacy_txstatus_log txstatlog;
 
 	/* Enabled/Disabled list for the dynamic debugging features. */
-	u32 dyn_debug[__B43legacy_NR_DYNDBG];
+	bool dyn_debug[__B43legacy_NR_DYNDBG];
 	/* Dentries for the dynamic debugging entries. */
 	struct dentry *dyn_debug_dentries[__B43legacy_NR_DYNDBG];
 };
diff --git a/drivers/net/wireless/iwlegacy/common.h b/drivers/net/wireless/iwlegacy/common.h
index 5b972798bdff..ce52cf114fde 100644
--- a/drivers/net/wireless/iwlegacy/common.h
+++ b/drivers/net/wireless/iwlegacy/common.h
@@ -1425,9 +1425,9 @@ struct il_priv {
 #endif				/* CONFIG_IWLEGACY_DEBUGFS */
 
 	struct work_struct txpower_work;
-	u32 disable_sens_cal;
-	u32 disable_chain_noise_cal;
-	u32 disable_tx_power_cal;
+	bool disable_sens_cal;
+	bool disable_chain_noise_cal;
+	bool disable_tx_power_cal;
 	struct work_struct run_time_calib_work;
 	struct timer_list stats_periodic;
 	struct timer_list watchdog;
diff --git a/drivers/net/wireless/iwlwifi/mvm/mvm.h b/drivers/net/wireless/iwlwifi/mvm/mvm.h
index 83273adfabdd..34769e3f5f52 100644
--- a/drivers/net/wireless/iwlwifi/mvm/mvm.h
+++ b/drivers/net/wireless/iwlwifi/mvm/mvm.h
@@ -664,7 +664,7 @@ struct iwl_mvm {
 	const struct iwl_fw_bcast_filter *bcast_filters;
 #ifdef CONFIG_IWLWIFI_DEBUGFS
 	struct {
-		u32 override; /* u32 for debugfs_create_bool */
+		bool override;
 		struct iwl_bcast_filter_cmd cmd;
 	} dbgfs_bcast_filtering;
 #endif
@@ -688,7 +688,7 @@ struct iwl_mvm {
 	bool disable_power_off;
 	bool disable_power_off_d3;
 
-	u32 scan_iter_notif_enabled; /* must be u32 for debugfs_create_bool */
+	bool scan_iter_notif_enabled;
 
 	struct debugfs_blob_wrapper nvm_hw_blob;
 	struct debugfs_blob_wrapper nvm_sw_blob;
@@ -742,7 +742,7 @@ struct iwl_mvm {
 	int n_nd_channels;
 	bool net_detect;
 #ifdef CONFIG_IWLWIFI_DEBUGFS
-	u32 d3_wake_sysassert; /* must be u32 for debugfs_create_bool */
+	bool d3_wake_sysassert;
 	bool d3_test_active;
 	bool store_d3_resume_sram;
 	void *d3_resume_sram;
diff --git a/drivers/of/irq.c b/drivers/of/irq.c
index f5d497989fcd..10145d37931f 100644
--- a/drivers/of/irq.c
+++ b/drivers/of/irq.c
@@ -18,6 +18,7 @@
  * driver.
  */
 
+#include <linux/device.h>
 #include <linux/errno.h>
 #include <linux/list.h>
 #include <linux/module.h>
@@ -578,3 +579,23 @@ err:
 		kfree(desc);
 	}
 }
+
+/**
+ * of_msi_configure - Set the msi_domain field of a device
+ * @dev: device structure to associate with an MSI irq domain
+ * @np: device node for that device
+ */
+void of_msi_configure(struct device *dev, struct device_node *np)
+{
+	struct device_node *msi_np;
+	struct irq_domain *d;
+
+	msi_np = of_parse_phandle(np, "msi-parent", 0);
+	if (!msi_np)
+		return;
+
+	d = irq_find_matching_host(msi_np, DOMAIN_BUS_PLATFORM_MSI);
+	if (!d)
+		d = irq_find_host(msi_np);
+	dev_set_msi_domain(dev, d);
+}
diff --git a/drivers/of/platform.c b/drivers/of/platform.c
index ddf8e42c9367..8a002d6151f2 100644
--- a/drivers/of/platform.c
+++ b/drivers/of/platform.c
@@ -184,6 +184,7 @@ static struct platform_device *of_platform_device_create_pdata(
 	dev->dev.bus = &platform_bus_type;
 	dev->dev.platform_data = platform_data;
 	of_dma_configure(&dev->dev, dev->dev.of_node);
+	of_msi_configure(&dev->dev, dev->dev.of_node);
 
 	if (of_device_add(dev) != 0) {
 		of_dma_deconfigure(&dev->dev);
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 73e4af400a5a..be3f631c3f75 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -33,6 +33,7 @@ obj-$(CONFIG_PCI_IOV) += iov.o
 #
 obj-$(CONFIG_ALPHA) += setup-irq.o
 obj-$(CONFIG_ARM) += setup-irq.o
+obj-$(CONFIG_ARM64) += setup-irq.o
 obj-$(CONFIG_UNICORE32) += setup-irq.o
 obj-$(CONFIG_SUPERH) += setup-irq.o
 obj-$(CONFIG_MIPS) += setup-irq.o
diff --git a/drivers/pci/host/Kconfig b/drivers/pci/host/Kconfig
index 1dfb567b3522..e90d132d3316 100644
--- a/drivers/pci/host/Kconfig
+++ b/drivers/pci/host/Kconfig
@@ -53,7 +53,7 @@ config PCI_RCAR_GEN2_PCIE
 
 config PCI_HOST_GENERIC
 	bool "Generic PCI host controller"
-	depends on ARM && OF
+	depends on (ARM || ARM64) && OF
 	help
 	  Say Y here if you want to support a simple generic PCI host
 	  controller, such as the one emulated by kvmtool.
@@ -89,11 +89,20 @@ config PCI_XGENE
 	depends on ARCH_XGENE
 	depends on OF
 	select PCIEPORTBUS
+	select PCI_MSI_IRQ_DOMAIN if PCI_MSI
 	help
 	  Say Y here if you want internal PCI support on APM X-Gene SoC.
 	  There are 5 internal PCIe ports available. Each port is GEN3 capable
 	  and have varied lanes from x1 to x8.
 
+config PCI_XGENE_MSI
+	bool "X-Gene v1 PCIe MSI feature"
+	depends on PCI_XGENE && PCI_MSI
+	default y
+	help
+	  Say Y here if you want PCIe MSI support for the APM X-Gene v1 SoC.
+	  This MSI driver supports 5 PCIe ports on the APM X-Gene v1 SoC.
+
 config PCI_LAYERSCAPE
 	bool "Freescale Layerscape PCIe controller"
 	depends on OF && ARM
diff --git a/drivers/pci/host/Makefile b/drivers/pci/host/Makefile
index f733b4e27642..1957431d3fcc 100644
--- a/drivers/pci/host/Makefile
+++ b/drivers/pci/host/Makefile
@@ -11,6 +11,7 @@ obj-$(CONFIG_PCIE_SPEAR13XX) += pcie-spear13xx.o
 obj-$(CONFIG_PCI_KEYSTONE) += pci-keystone-dw.o pci-keystone.o
 obj-$(CONFIG_PCIE_XILINX) += pcie-xilinx.o
 obj-$(CONFIG_PCI_XGENE) += pci-xgene.o
+obj-$(CONFIG_PCI_XGENE_MSI) += pci-xgene-msi.o
 obj-$(CONFIG_PCI_LAYERSCAPE) += pci-layerscape.o
 obj-$(CONFIG_PCI_VERSATILE) += pci-versatile.o
 obj-$(CONFIG_PCIE_IPROC) += pcie-iproc.o
diff --git a/drivers/pci/host/pci-host-generic.c b/drivers/pci/host/pci-host-generic.c
index ba46e581db99..265dd25169bf 100644
--- a/drivers/pci/host/pci-host-generic.c
+++ b/drivers/pci/host/pci-host-generic.c
@@ -38,7 +38,16 @@ struct gen_pci_cfg_windows {
 	const struct gen_pci_cfg_bus_ops	*ops;
 };
 
+/*
+ * ARM pcibios functions expect the ARM struct pci_sys_data as the PCI
+ * sysdata.  Add pci_sys_data as the first element in struct gen_pci so
+ * that when we use a gen_pci pointer as sysdata, it is also a pointer to
+ * a struct pci_sys_data.
+ */
 struct gen_pci {
+#ifdef CONFIG_ARM
+	struct pci_sys_data			sys;
+#endif
 	struct pci_host_bridge			host;
 	struct gen_pci_cfg_windows		cfg;
 	struct list_head			resources;
@@ -48,8 +57,7 @@ static void __iomem *gen_pci_map_cfg_bus_cam(struct pci_bus *bus,
 					     unsigned int devfn,
 					     int where)
 {
-	struct pci_sys_data *sys = bus->sysdata;
-	struct gen_pci *pci = sys->private_data;
+	struct gen_pci *pci = bus->sysdata;
 	resource_size_t idx = bus->number - pci->cfg.bus_range->start;
 
 	return pci->cfg.win[idx] + ((devfn << 8) | where);
@@ -64,8 +72,7 @@ static void __iomem *gen_pci_map_cfg_bus_ecam(struct pci_bus *bus,
 					      unsigned int devfn,
 					      int where)
 {
-	struct pci_sys_data *sys = bus->sysdata;
-	struct gen_pci *pci = sys->private_data;
+	struct gen_pci *pci = bus->sysdata;
 	resource_size_t idx = bus->number - pci->cfg.bus_range->start;
 
 	return pci->cfg.win[idx] + ((devfn << 12) | where);
@@ -198,13 +205,6 @@ static int gen_pci_parse_map_cfg_windows(struct gen_pci *pci)
 	return 0;
 }
 
-static int gen_pci_setup(int nr, struct pci_sys_data *sys)
-{
-	struct gen_pci *pci = sys->private_data;
-	list_splice_init(&pci->resources, &sys->resources);
-	return 1;
-}
-
 static int gen_pci_probe(struct platform_device *pdev)
 {
 	int err;
@@ -214,13 +214,7 @@ static int gen_pci_probe(struct platform_device *pdev)
 	struct device *dev = &pdev->dev;
 	struct device_node *np = dev->of_node;
 	struct gen_pci *pci = devm_kzalloc(dev, sizeof(*pci), GFP_KERNEL);
-	struct hw_pci hw = {
-		.nr_controllers	= 1,
-		.private_data	= (void **)&pci,
-		.setup		= gen_pci_setup,
-		.map_irq	= of_irq_parse_and_map_pci,
-		.ops		= &gen_pci_ops,
-	};
+	struct pci_bus *bus, *child;
 
 	if (!pci)
 		return -ENOMEM;
@@ -258,7 +252,27 @@ static int gen_pci_probe(struct platform_device *pdev)
 		return err;
 	}
 
-	pci_common_init_dev(dev, &hw);
+	/* Do not reassign resources if probe only */
+	if (!pci_has_flag(PCI_PROBE_ONLY))
+		pci_add_flags(PCI_REASSIGN_ALL_RSRC | PCI_REASSIGN_ALL_BUS);
+
+	bus = pci_scan_root_bus(dev, 0, &gen_pci_ops, pci, &pci->resources);
+	if (!bus) {
+		dev_err(dev, "Scanning rootbus failed");
+		return -ENODEV;
+	}
+
+	pci_fixup_irqs(pci_common_swizzle, of_irq_parse_and_map_pci);
+
+	if (!pci_has_flag(PCI_PROBE_ONLY)) {
+		pci_bus_size_bridges(bus);
+		pci_bus_assign_resources(bus);
+
+		list_for_each_entry(child, &bus->children, node)
+			pcie_bus_configure_settings(child);
+	}
+
+	pci_bus_add_devices(bus);
 	return 0;
 }
 
diff --git a/drivers/pci/host/pci-xgene-msi.c b/drivers/pci/host/pci-xgene-msi.c
new file mode 100644
index 000000000000..ec5a14b3189b
--- /dev/null
+++ b/drivers/pci/host/pci-xgene-msi.c
@@ -0,0 +1,587 @@
+/*
+ * APM X-Gene MSI Driver
+ *
+ * Copyright (c) 2014, Applied Micro Circuits Corporation
+ * Author: Tanmay Inamdar <tinamdar@apm.com>
+ *	   Duc Dang <dhdang@apm.com>
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#include <linux/cpu.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/msi.h>
+#include <linux/of_irq.h>
+#include <linux/irqchip/chained_irq.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+#include <linux/of_pci.h>
+
+#define MSI_IR0			0x000000
+#define MSI_INT0		0x800000
+#define IDX_PER_GROUP		8
+#define IRQS_PER_IDX		16
+#define NR_HW_IRQS		16
+#define NR_MSI_VEC		(IDX_PER_GROUP * IRQS_PER_IDX * NR_HW_IRQS)
+
+struct xgene_msi_group {
+	struct xgene_msi	*msi;
+	int			gic_irq;
+	u32			msi_grp;
+};
+
+struct xgene_msi {
+	struct device_node	*node;
+	struct irq_domain	*inner_domain;
+	struct irq_domain	*msi_domain;
+	u64			msi_addr;
+	void __iomem		*msi_regs;
+	unsigned long		*bitmap;
+	struct mutex		bitmap_lock;
+	struct xgene_msi_group	*msi_groups;
+	int			num_cpus;
+};
+
+/* Global data */
+static struct xgene_msi xgene_msi_ctrl;
+
+static struct irq_chip xgene_msi_top_irq_chip = {
+	.name		= "X-Gene1 MSI",
+	.irq_enable	= pci_msi_unmask_irq,
+	.irq_disable	= pci_msi_mask_irq,
+	.irq_mask	= pci_msi_mask_irq,
+	.irq_unmask	= pci_msi_unmask_irq,
+};
+
+static struct  msi_domain_info xgene_msi_domain_info = {
+	.flags	= (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+		  MSI_FLAG_PCI_MSIX),
+	.chip	= &xgene_msi_top_irq_chip,
+};
+
+/*
+ * X-Gene v1 has 16 groups of MSI termination registers MSInIRx, where
+ * n is group number (0..F), x is index of registers in each group (0..7)
+ * The register layout is as follows:
+ * MSI0IR0			base_addr
+ * MSI0IR1			base_addr +  0x10000
+ * ...				...
+ * MSI0IR6			base_addr +  0x60000
+ * MSI0IR7			base_addr +  0x70000
+ * MSI1IR0			base_addr +  0x80000
+ * MSI1IR1			base_addr +  0x90000
+ * ...				...
+ * MSI1IR7			base_addr +  0xF0000
+ * MSI2IR0			base_addr + 0x100000
+ * ...				...
+ * MSIFIR0			base_addr + 0x780000
+ * MSIFIR1			base_addr + 0x790000
+ * ...				...
+ * MSIFIR7			base_addr + 0x7F0000
+ * MSIINT0			base_addr + 0x800000
+ * MSIINT1			base_addr + 0x810000
+ * ...				...
+ * MSIINTF			base_addr + 0x8F0000
+ *
+ * Each index register supports 16 MSI vectors (0..15) to generate interrupt.
+ * There are total 16 GIC IRQs assigned for these 16 groups of MSI termination
+ * registers.
+ *
+ * Each MSI termination group has 1 MSIINTn register (n is 0..15) to indicate
+ * the MSI pending status caused by 1 of its 8 index registers.
+ */
+
+/* MSInIRx read helper */
+static u32 xgene_msi_ir_read(struct xgene_msi *msi,
+				    u32 msi_grp, u32 msir_idx)
+{
+	return readl_relaxed(msi->msi_regs + MSI_IR0 +
+			      (msi_grp << 19) + (msir_idx << 16));
+}
+
+/* MSIINTn read helper */
+static u32 xgene_msi_int_read(struct xgene_msi *msi, u32 msi_grp)
+{
+	return readl_relaxed(msi->msi_regs + MSI_INT0 + (msi_grp << 16));
+}
+
+/*
+ * With 2048 MSI vectors supported, the MSI message can be constructed using
+ * following scheme:
+ * - Divide into 8 256-vector groups
+ *		Group 0: 0-255
+ *		Group 1: 256-511
+ *		Group 2: 512-767
+ *		...
+ *		Group 7: 1792-2047
+ * - Each 256-vector group is divided into 16 16-vector groups
+ *	As an example: 16 16-vector groups for 256-vector group 0-255 is
+ *		Group 0: 0-15
+ *		Group 1: 16-32
+ *		...
+ *		Group 15: 240-255
+ * - The termination address of MSI vector in 256-vector group n and 16-vector
+ *   group x is the address of MSIxIRn
+ * - The data for MSI vector in 16-vector group x is x
+ */
+static u32 hwirq_to_reg_set(unsigned long hwirq)
+{
+	return (hwirq / (NR_HW_IRQS * IRQS_PER_IDX));
+}
+
+static u32 hwirq_to_group(unsigned long hwirq)
+{
+	return (hwirq % NR_HW_IRQS);
+}
+
+static u32 hwirq_to_msi_data(unsigned long hwirq)
+{
+	return ((hwirq / NR_HW_IRQS) % IRQS_PER_IDX);
+}
+
+static void xgene_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
+{
+	struct xgene_msi *msi = irq_data_get_irq_chip_data(data);
+	u32 reg_set = hwirq_to_reg_set(data->hwirq);
+	u32 group = hwirq_to_group(data->hwirq);
+	u64 target_addr = msi->msi_addr + (((8 * group) + reg_set) << 16);
+
+	msg->address_hi = upper_32_bits(target_addr);
+	msg->address_lo = lower_32_bits(target_addr);
+	msg->data = hwirq_to_msi_data(data->hwirq);
+}
+
+/*
+ * X-Gene v1 only has 16 MSI GIC IRQs for 2048 MSI vectors.  To maintain
+ * the expected behaviour of .set_affinity for each MSI interrupt, the 16
+ * MSI GIC IRQs are statically allocated to 8 X-Gene v1 cores (2 GIC IRQs
+ * for each core).  The MSI vector is moved fom 1 MSI GIC IRQ to another
+ * MSI GIC IRQ to steer its MSI interrupt to correct X-Gene v1 core.  As a
+ * consequence, the total MSI vectors that X-Gene v1 supports will be
+ * reduced to 256 (2048/8) vectors.
+ */
+static int hwirq_to_cpu(unsigned long hwirq)
+{
+	return (hwirq % xgene_msi_ctrl.num_cpus);
+}
+
+static unsigned long hwirq_to_canonical_hwirq(unsigned long hwirq)
+{
+	return (hwirq - hwirq_to_cpu(hwirq));
+}
+
+static int xgene_msi_set_affinity(struct irq_data *irqdata,
+				  const struct cpumask *mask, bool force)
+{
+	int target_cpu = cpumask_first(mask);
+	int curr_cpu;
+
+	curr_cpu = hwirq_to_cpu(irqdata->hwirq);
+	if (curr_cpu == target_cpu)
+		return IRQ_SET_MASK_OK_DONE;
+
+	/* Update MSI number to target the new CPU */
+	irqdata->hwirq = hwirq_to_canonical_hwirq(irqdata->hwirq) + target_cpu;
+
+	return IRQ_SET_MASK_OK;
+}
+
+static struct irq_chip xgene_msi_bottom_irq_chip = {
+	.name			= "MSI",
+	.irq_set_affinity       = xgene_msi_set_affinity,
+	.irq_compose_msi_msg	= xgene_compose_msi_msg,
+};
+
+static int xgene_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
+				  unsigned int nr_irqs, void *args)
+{
+	struct xgene_msi *msi = domain->host_data;
+	int msi_irq;
+
+	mutex_lock(&msi->bitmap_lock);
+
+	msi_irq = bitmap_find_next_zero_area(msi->bitmap, NR_MSI_VEC, 0,
+					     msi->num_cpus, 0);
+	if (msi_irq < NR_MSI_VEC)
+		bitmap_set(msi->bitmap, msi_irq, msi->num_cpus);
+	else
+		msi_irq = -ENOSPC;
+
+	mutex_unlock(&msi->bitmap_lock);
+
+	if (msi_irq < 0)
+		return msi_irq;
+
+	irq_domain_set_info(domain, virq, msi_irq,
+			    &xgene_msi_bottom_irq_chip, domain->host_data,
+			    handle_simple_irq, NULL, NULL);
+	set_irq_flags(virq, IRQF_VALID);
+
+	return 0;
+}
+
+static void xgene_irq_domain_free(struct irq_domain *domain,
+				  unsigned int virq, unsigned int nr_irqs)
+{
+	struct irq_data *d = irq_domain_get_irq_data(domain, virq);
+	struct xgene_msi *msi = irq_data_get_irq_chip_data(d);
+	u32 hwirq;
+
+	mutex_lock(&msi->bitmap_lock);
+
+	hwirq = hwirq_to_canonical_hwirq(d->hwirq);
+	bitmap_clear(msi->bitmap, hwirq, msi->num_cpus);
+
+	mutex_unlock(&msi->bitmap_lock);
+
+	irq_domain_free_irqs_parent(domain, virq, nr_irqs);
+}
+
+static const struct irq_domain_ops msi_domain_ops = {
+	.alloc  = xgene_irq_domain_alloc,
+	.free   = xgene_irq_domain_free,
+};
+
+static int xgene_allocate_domains(struct xgene_msi *msi)
+{
+	msi->inner_domain = irq_domain_add_linear(NULL, NR_MSI_VEC,
+						  &msi_domain_ops, msi);
+	if (!msi->inner_domain)
+		return -ENOMEM;
+
+	msi->msi_domain = pci_msi_create_irq_domain(msi->node,
+						    &xgene_msi_domain_info,
+						    msi->inner_domain);
+
+	if (!msi->msi_domain) {
+		irq_domain_remove(msi->inner_domain);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void xgene_free_domains(struct xgene_msi *msi)
+{
+	if (msi->msi_domain)
+		irq_domain_remove(msi->msi_domain);
+	if (msi->inner_domain)
+		irq_domain_remove(msi->inner_domain);
+}
+
+static int xgene_msi_init_allocator(struct xgene_msi *xgene_msi)
+{
+	int size = BITS_TO_LONGS(NR_MSI_VEC) * sizeof(long);
+
+	xgene_msi->bitmap = kzalloc(size, GFP_KERNEL);
+	if (!xgene_msi->bitmap)
+		return -ENOMEM;
+
+	mutex_init(&xgene_msi->bitmap_lock);
+
+	xgene_msi->msi_groups = kcalloc(NR_HW_IRQS,
+					sizeof(struct xgene_msi_group),
+					GFP_KERNEL);
+	if (!xgene_msi->msi_groups)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void xgene_msi_isr(unsigned int irq, struct irq_desc *desc)
+{
+	struct irq_chip *chip = irq_desc_get_chip(desc);
+	struct xgene_msi_group *msi_groups;
+	struct xgene_msi *xgene_msi;
+	unsigned int virq;
+	int msir_index, msir_val, hw_irq;
+	u32 intr_index, grp_select, msi_grp;
+
+	chained_irq_enter(chip, desc);
+
+	msi_groups = irq_desc_get_handler_data(desc);
+	xgene_msi = msi_groups->msi;
+	msi_grp = msi_groups->msi_grp;
+
+	/*
+	 * MSIINTn (n is 0..F) indicates if there is a pending MSI interrupt
+	 * If bit x of this register is set (x is 0..7), one or more interupts
+	 * corresponding to MSInIRx is set.
+	 */
+	grp_select = xgene_msi_int_read(xgene_msi, msi_grp);
+	while (grp_select) {
+		msir_index = ffs(grp_select) - 1;
+		/*
+		 * Calculate MSInIRx address to read to check for interrupts
+		 * (refer to termination address and data assignment
+		 * described in xgene_compose_msi_msg() )
+		 */
+		msir_val = xgene_msi_ir_read(xgene_msi, msi_grp, msir_index);
+		while (msir_val) {
+			intr_index = ffs(msir_val) - 1;
+			/*
+			 * Calculate MSI vector number (refer to the termination
+			 * address and data assignment described in
+			 * xgene_compose_msi_msg function)
+			 */
+			hw_irq = (((msir_index * IRQS_PER_IDX) + intr_index) *
+				 NR_HW_IRQS) + msi_grp;
+			/*
+			 * As we have multiple hw_irq that maps to single MSI,
+			 * always look up the virq using the hw_irq as seen from
+			 * CPU0
+			 */
+			hw_irq = hwirq_to_canonical_hwirq(hw_irq);
+			virq = irq_find_mapping(xgene_msi->inner_domain, hw_irq);
+			WARN_ON(!virq);
+			if (virq != 0)
+				generic_handle_irq(virq);
+			msir_val &= ~(1 << intr_index);
+		}
+		grp_select &= ~(1 << msir_index);
+
+		if (!grp_select) {
+			/*
+			 * We handled all interrupts happened in this group,
+			 * resample this group MSI_INTx register in case
+			 * something else has been made pending in the meantime
+			 */
+			grp_select = xgene_msi_int_read(xgene_msi, msi_grp);
+		}
+	}
+
+	chained_irq_exit(chip, desc);
+}
+
+static int xgene_msi_remove(struct platform_device *pdev)
+{
+	int virq, i;
+	struct xgene_msi *msi = platform_get_drvdata(pdev);
+
+	for (i = 0; i < NR_HW_IRQS; i++) {
+		virq = msi->msi_groups[i].gic_irq;
+		if (virq != 0) {
+			irq_set_chained_handler(virq, NULL);
+			irq_set_handler_data(virq, NULL);
+		}
+	}
+	kfree(msi->msi_groups);
+
+	kfree(msi->bitmap);
+	msi->bitmap = NULL;
+
+	xgene_free_domains(msi);
+
+	return 0;
+}
+
+static int xgene_msi_hwirq_alloc(unsigned int cpu)
+{
+	struct xgene_msi *msi = &xgene_msi_ctrl;
+	struct xgene_msi_group *msi_group;
+	cpumask_var_t mask;
+	int i;
+	int err;
+
+	for (i = cpu; i < NR_HW_IRQS; i += msi->num_cpus) {
+		msi_group = &msi->msi_groups[i];
+		if (!msi_group->gic_irq)
+			continue;
+
+		irq_set_chained_handler(msi_group->gic_irq,
+					xgene_msi_isr);
+		err = irq_set_handler_data(msi_group->gic_irq, msi_group);
+		if (err) {
+			pr_err("failed to register GIC IRQ handler\n");
+			return -EINVAL;
+		}
+		/*
+		 * Statically allocate MSI GIC IRQs to each CPU core.
+		 * With 8-core X-Gene v1, 2 MSI GIC IRQs are allocated
+		 * to each core.
+		 */
+		if (alloc_cpumask_var(&mask, GFP_KERNEL)) {
+			cpumask_clear(mask);
+			cpumask_set_cpu(cpu, mask);
+			err = irq_set_affinity(msi_group->gic_irq, mask);
+			if (err)
+				pr_err("failed to set affinity for GIC IRQ");
+			free_cpumask_var(mask);
+		} else {
+			pr_err("failed to alloc CPU mask for affinity\n");
+			err = -EINVAL;
+		}
+
+		if (err) {
+			irq_set_chained_handler(msi_group->gic_irq, NULL);
+			irq_set_handler_data(msi_group->gic_irq, NULL);
+			return err;
+		}
+	}
+
+	return 0;
+}
+
+static void xgene_msi_hwirq_free(unsigned int cpu)
+{
+	struct xgene_msi *msi = &xgene_msi_ctrl;
+	struct xgene_msi_group *msi_group;
+	int i;
+
+	for (i = cpu; i < NR_HW_IRQS; i += msi->num_cpus) {
+		msi_group = &msi->msi_groups[i];
+		if (!msi_group->gic_irq)
+			continue;
+
+		irq_set_chained_handler(msi_group->gic_irq, NULL);
+		irq_set_handler_data(msi_group->gic_irq, NULL);
+	}
+}
+
+static int xgene_msi_cpu_callback(struct notifier_block *nfb,
+				  unsigned long action, void *hcpu)
+{
+	unsigned cpu = (unsigned long)hcpu;
+
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		xgene_msi_hwirq_alloc(cpu);
+		break;
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		xgene_msi_hwirq_free(cpu);
+		break;
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block xgene_msi_cpu_notifier = {
+	.notifier_call = xgene_msi_cpu_callback,
+};
+
+static const struct of_device_id xgene_msi_match_table[] = {
+	{.compatible = "apm,xgene1-msi"},
+	{},
+};
+
+static int xgene_msi_probe(struct platform_device *pdev)
+{
+	struct resource *res;
+	int rc, irq_index;
+	struct xgene_msi *xgene_msi;
+	unsigned int cpu;
+	int virt_msir;
+	u32 msi_val, msi_idx;
+
+	xgene_msi = &xgene_msi_ctrl;
+
+	platform_set_drvdata(pdev, xgene_msi);
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	xgene_msi->msi_regs = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(xgene_msi->msi_regs)) {
+		dev_err(&pdev->dev, "no reg space\n");
+		rc = -EINVAL;
+		goto error;
+	}
+	xgene_msi->msi_addr = res->start;
+	xgene_msi->node = pdev->dev.of_node;
+	xgene_msi->num_cpus = num_possible_cpus();
+
+	rc = xgene_msi_init_allocator(xgene_msi);
+	if (rc) {
+		dev_err(&pdev->dev, "Error allocating MSI bitmap\n");
+		goto error;
+	}
+
+	rc = xgene_allocate_domains(xgene_msi);
+	if (rc) {
+		dev_err(&pdev->dev, "Failed to allocate MSI domain\n");
+		goto error;
+	}
+
+	for (irq_index = 0; irq_index < NR_HW_IRQS; irq_index++) {
+		virt_msir = platform_get_irq(pdev, irq_index);
+		if (virt_msir < 0) {
+			dev_err(&pdev->dev, "Cannot translate IRQ index %d\n",
+				irq_index);
+			rc = -EINVAL;
+			goto error;
+		}
+		xgene_msi->msi_groups[irq_index].gic_irq = virt_msir;
+		xgene_msi->msi_groups[irq_index].msi_grp = irq_index;
+		xgene_msi->msi_groups[irq_index].msi = xgene_msi;
+	}
+
+	/*
+	 * MSInIRx registers are read-to-clear; before registering
+	 * interrupt handlers, read all of them to clear spurious
+	 * interrupts that may occur before the driver is probed.
+	 */
+	for (irq_index = 0; irq_index < NR_HW_IRQS; irq_index++) {
+		for (msi_idx = 0; msi_idx < IDX_PER_GROUP; msi_idx++)
+			msi_val = xgene_msi_ir_read(xgene_msi, irq_index,
+						    msi_idx);
+		/* Read MSIINTn to confirm */
+		msi_val = xgene_msi_int_read(xgene_msi, irq_index);
+		if (msi_val) {
+			dev_err(&pdev->dev, "Failed to clear spurious IRQ\n");
+			rc = -EINVAL;
+			goto error;
+		}
+	}
+
+	cpu_notifier_register_begin();
+
+	for_each_online_cpu(cpu)
+		if (xgene_msi_hwirq_alloc(cpu)) {
+			dev_err(&pdev->dev, "failed to register MSI handlers\n");
+			cpu_notifier_register_done();
+			goto error;
+		}
+
+	rc = __register_hotcpu_notifier(&xgene_msi_cpu_notifier);
+	if (rc) {
+		dev_err(&pdev->dev, "failed to add CPU MSI notifier\n");
+		cpu_notifier_register_done();
+		goto error;
+	}
+
+	cpu_notifier_register_done();
+
+	dev_info(&pdev->dev, "APM X-Gene PCIe MSI driver loaded\n");
+
+	return 0;
+
+error:
+	xgene_msi_remove(pdev);
+	return rc;
+}
+
+static struct platform_driver xgene_msi_driver = {
+	.driver = {
+		.name = "xgene-msi",
+		.owner = THIS_MODULE,
+		.of_match_table = xgene_msi_match_table,
+	},
+	.probe = xgene_msi_probe,
+	.remove = xgene_msi_remove,
+};
+
+static int __init xgene_pcie_msi_init(void)
+{
+	return platform_driver_register(&xgene_msi_driver);
+}
+subsys_initcall(xgene_pcie_msi_init);
diff --git a/drivers/pci/host/pci-xgene.c b/drivers/pci/host/pci-xgene.c
index ee082c0366ec..3e5a636c9a9a 100644
--- a/drivers/pci/host/pci-xgene.c
+++ b/drivers/pci/host/pci-xgene.c
@@ -468,6 +468,23 @@ static int xgene_pcie_setup(struct xgene_pcie_port *port,
 	return 0;
 }
 
+static int xgene_pcie_msi_enable(struct pci_bus *bus)
+{
+	struct device_node *msi_node;
+
+	msi_node = of_parse_phandle(bus->dev.of_node,
+					"msi-parent", 0);
+	if (!msi_node)
+		return -ENODEV;
+
+	bus->msi = of_pci_find_msi_chip_by_node(msi_node);
+	if (!bus->msi)
+		return -ENODEV;
+
+	bus->msi->dev = &bus->dev;
+	return 0;
+}
+
 static int xgene_pcie_probe_bridge(struct platform_device *pdev)
 {
 	struct device_node *dn = pdev->dev.of_node;
@@ -504,6 +521,10 @@ static int xgene_pcie_probe_bridge(struct platform_device *pdev)
 	if (!bus)
 		return -ENOMEM;
 
+	if (IS_ENABLED(CONFIG_PCI_MSI))
+		if (xgene_pcie_msi_enable(bus))
+			dev_info(port->dev, "failed to enable MSI\n");
+
 	pci_scan_child_bus(bus);
 	pci_assign_unassigned_bus_resources(bus);
 	pci_bus_add_devices(bus);
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index c3e7dfcf9ff5..fc54038495da 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -39,14 +39,13 @@ struct irq_domain * __weak arch_get_pci_msi_domain(struct pci_dev *dev)
 
 static struct irq_domain *pci_msi_get_domain(struct pci_dev *dev)
 {
-	struct irq_domain *domain = NULL;
+	struct irq_domain *domain;
 
-	if (dev->bus->msi)
-		domain = dev->bus->msi->domain;
-	if (!domain)
-		domain = arch_get_pci_msi_domain(dev);
+	domain = dev_get_msi_domain(&dev->dev);
+	if (domain)
+		return domain;
 
-	return domain;
+	return arch_get_pci_msi_domain(dev);
 }
 
 static int pci_msi_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
@@ -1302,12 +1301,19 @@ struct irq_domain *pci_msi_create_irq_domain(struct device_node *node,
 					     struct msi_domain_info *info,
 					     struct irq_domain *parent)
 {
+	struct irq_domain *domain;
+
 	if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS)
 		pci_msi_domain_update_dom_ops(info);
 	if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
 		pci_msi_domain_update_chip_ops(info);
 
-	return msi_create_irq_domain(node, info, parent);
+	domain = msi_create_irq_domain(node, info, parent);
+	if (!domain)
+		return NULL;
+
+	domain->bus_token = DOMAIN_BUS_PCI_MSI;
+	return domain;
 }
 
 /**
diff --git a/drivers/pci/of.c b/drivers/pci/of.c
index f0929934bb7a..2e99a500cb83 100644
--- a/drivers/pci/of.c
+++ b/drivers/pci/of.c
@@ -9,6 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/irqdomain.h>
 #include <linux/kernel.h>
 #include <linux/pci.h>
 #include <linux/of.h>
@@ -59,3 +60,32 @@ struct device_node * __weak pcibios_get_phb_of_node(struct pci_bus *bus)
 		return of_node_get(bus->bridge->parent->of_node);
 	return NULL;
 }
+
+struct irq_domain *pci_host_bridge_of_msi_domain(struct pci_bus *bus)
+{
+#ifdef CONFIG_IRQ_DOMAIN
+	struct device_node *np;
+	struct irq_domain *d;
+
+	if (!bus->dev.of_node)
+		return NULL;
+
+	/* Start looking for a phandle to an MSI controller. */
+	np = of_parse_phandle(bus->dev.of_node, "msi-parent", 0);
+
+	/*
+	 * If we don't have an msi-parent property, look for a domain
+	 * directly attached to the host bridge.
+	 */
+	if (!np)
+		np = bus->dev.of_node;
+
+	d = irq_find_matching_host(np, DOMAIN_BUS_PCI_MSI);
+	if (d)
+		return d;
+
+	return irq_find_host(np);
+#else
+	return NULL;
+#endif
+}
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 3c4e709cf9a1..9e3ccfc0c3ce 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -664,6 +664,35 @@ static void pci_set_bus_speed(struct pci_bus *bus)
 	}
 }
 
+static struct irq_domain *pci_host_bridge_msi_domain(struct pci_bus *bus)
+{
+	struct irq_domain *d;
+
+	/*
+	 * Any firmware interface that can resolve the msi_domain
+	 * should be called from here.
+	 */
+	d = pci_host_bridge_of_msi_domain(bus);
+
+	return d;
+}
+
+static void pci_set_bus_msi_domain(struct pci_bus *bus)
+{
+	struct irq_domain *d;
+
+	/*
+	 * Either bus is the root, and we must obtain it from the
+	 * firmware, or we inherit it from the bridge device.
+	 */
+	if (pci_is_root_bus(bus))
+		d = pci_host_bridge_msi_domain(bus);
+	else
+		d = dev_get_msi_domain(&bus->self->dev);
+
+	dev_set_msi_domain(&bus->dev, d);
+}
+
 static struct pci_bus *pci_alloc_child_bus(struct pci_bus *parent,
 					   struct pci_dev *bridge, int busnr)
 {
@@ -717,6 +746,7 @@ static struct pci_bus *pci_alloc_child_bus(struct pci_bus *parent,
 	bridge->subordinate = child;
 
 add_dev:
+	pci_set_bus_msi_domain(child);
 	ret = device_register(&child->dev);
 	WARN_ON(ret < 0);
 
@@ -1542,6 +1572,17 @@ static void pci_init_capabilities(struct pci_dev *dev)
 	pci_enable_acs(dev);
 }
 
+static void pci_set_msi_domain(struct pci_dev *dev)
+{
+	/*
+	 * If no domain has been set through the pcibios_add_device
+	 * callback, inherit the default from the bus device.
+	 */
+	if (!dev_get_msi_domain(&dev->dev))
+		dev_set_msi_domain(&dev->dev,
+				   dev_get_msi_domain(&dev->bus->dev));
+}
+
 void pci_device_add(struct pci_dev *dev, struct pci_bus *bus)
 {
 	int ret;
@@ -1583,6 +1624,9 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus)
 	ret = pcibios_add_device(dev);
 	WARN_ON(ret < 0);
 
+	/* Setup MSI irq domain */
+	pci_set_msi_domain(dev);
+
 	/* Notifier could use PCI capabilities */
 	dev->match_driver = false;
 	ret = device_add(&dev->dev);
@@ -1973,6 +2017,7 @@ struct pci_bus *pci_create_root_bus(struct device *parent, int bus,
 	b->bridge = get_device(&bridge->dev);
 	device_enable_async_suspend(b->bridge);
 	pci_set_bus_of_node(b);
+	pci_set_bus_msi_domain(b);
 
 	if (!parent)
 		set_dev_node(b->bridge, pcibus_to_node(b));
diff --git a/drivers/phy/phy-rcar-gen2.c b/drivers/phy/phy-rcar-gen2.c
index 97d45f47d1ad..8b093d02fc1b 100644
--- a/drivers/phy/phy-rcar-gen2.c
+++ b/drivers/phy/phy-rcar-gen2.c
@@ -17,8 +17,7 @@
 #include <linux/phy/phy.h>
 #include <linux/platform_device.h>
 #include <linux/spinlock.h>
-
-#include <asm/cmpxchg.h>
+#include <linux/atomic.h>
 
 #define USBHS_LPSTS			0x02
 #define USBHS_UGCTRL			0x80
diff --git a/drivers/platform/x86/acerhdf.c b/drivers/platform/x86/acerhdf.c
index 594c918b553d..1ef02daddb60 100644
--- a/drivers/platform/x86/acerhdf.c
+++ b/drivers/platform/x86/acerhdf.c
@@ -372,7 +372,8 @@ static int acerhdf_bind(struct thermal_zone_device *thermal,
 		return 0;
 
 	if (thermal_zone_bind_cooling_device(thermal, 0, cdev,
-			THERMAL_NO_LIMIT, THERMAL_NO_LIMIT)) {
+			THERMAL_NO_LIMIT, THERMAL_NO_LIMIT,
+			THERMAL_WEIGHT_DEFAULT)) {
 		pr_err("error binding cooling dev\n");
 		return -EINVAL;
 	}
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
index d72605864b0a..14562788e4e0 100644
--- a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
@@ -55,9 +55,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
 	if (PagePrivate(page))
 		page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
 
-	if (TestClearPageDirty(page))
-		account_page_cleaned(page, mapping);
-
+	cancel_dirty_page(page);
 	ClearPageMappedToDisk(page);
 	ll_delete_from_page_cache(page);
 }
diff --git a/drivers/staging/speakup/selection.c b/drivers/staging/speakup/selection.c
index ed68b2cfe031..b62fe0961b0b 100644
--- a/drivers/staging/speakup/selection.c
+++ b/drivers/staging/speakup/selection.c
@@ -7,7 +7,7 @@
 #include <linux/workqueue.h>
 #include <linux/tty.h>
 #include <linux/tty_flip.h>
-#include <asm/cmpxchg.h>
+#include <linux/atomic.h>
 
 #include "speakup.h"
 
diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig
index af40db0df58e..0ae6bbf0a89b 100644
--- a/drivers/thermal/Kconfig
+++ b/drivers/thermal/Kconfig
@@ -42,6 +42,17 @@ config THERMAL_OF
 	  Say 'Y' here if you need to build thermal infrastructure
 	  based on device tree.
 
+config THERMAL_WRITABLE_TRIPS
+	bool "Enable writable trip points"
+	help
+	  This option allows the system integrator to choose whether
+	  trip temperatures can be changed from userspace. The
+	  writable trips need to be specified when setting up the
+	  thermal zone but the choice here takes precedence.
+
+	  Say 'Y' here if you would like to allow userspace tools to
+	  change trip temperatures.
+
 choice
 	prompt "Default Thermal governor"
 	default THERMAL_DEFAULT_GOV_STEP_WISE
@@ -71,6 +82,14 @@ config THERMAL_DEFAULT_GOV_USER_SPACE
 	  Select this if you want to let the user space manage the
 	  platform thermals.
 
+config THERMAL_DEFAULT_GOV_POWER_ALLOCATOR
+	bool "power_allocator"
+	select THERMAL_GOV_POWER_ALLOCATOR
+	help
+	  Select this if you want to control temperature based on
+	  system and device power allocation. This governor can only
+	  operate on cooling devices that implement the power API.
+
 endchoice
 
 config THERMAL_GOV_FAIR_SHARE
@@ -99,6 +118,13 @@ config THERMAL_GOV_USER_SPACE
 	help
 	  Enable this to let the user space manage the platform thermals.
 
+config THERMAL_GOV_POWER_ALLOCATOR
+	bool "Power allocator thermal governor"
+	select THERMAL_POWER_ACTOR
+	help
+	  Enable this to manage platform thermals by dynamically
+	  allocating and limiting power to devices.
+
 config CPU_THERMAL
 	bool "generic cpu cooling support"
 	depends on CPU_FREQ
@@ -122,6 +148,20 @@ config CLOCK_THERMAL
 	  device that is configured to use this cooling mechanism will be
 	  controlled to reduce clock frequency whenever temperature is high.
 
+config DEVFREQ_THERMAL
+	bool "Generic device cooling support"
+	depends on PM_DEVFREQ
+	depends on PM_OPP
+	help
+	  This implements the generic devfreq cooling mechanism through
+	  frequency reduction for devices using devfreq.
+
+	  This will throttle the device by limiting the maximum allowed DVFS
+	  frequency corresponding to the cooling level.
+
+	  In order to use the power extensions of the cooling device,
+	  devfreq should use the simple_ondemand governor.
+
 	  If you want this support, you should say Y here.
 
 config THERMAL_EMULATION
diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile
index fa0dc486790f..cb8d8a5961e3 100644
--- a/drivers/thermal/Makefile
+++ b/drivers/thermal/Makefile
@@ -14,6 +14,7 @@ thermal_sys-$(CONFIG_THERMAL_GOV_FAIR_SHARE)	+= fair_share.o
 thermal_sys-$(CONFIG_THERMAL_GOV_BANG_BANG)	+= gov_bang_bang.o
 thermal_sys-$(CONFIG_THERMAL_GOV_STEP_WISE)	+= step_wise.o
 thermal_sys-$(CONFIG_THERMAL_GOV_USER_SPACE)	+= user_space.o
+thermal_sys-$(CONFIG_THERMAL_GOV_POWER_ALLOCATOR)	+= power_allocator.o
 
 # cpufreq cooling
 thermal_sys-$(CONFIG_CPU_THERMAL)	+= cpu_cooling.o
@@ -21,6 +22,9 @@ thermal_sys-$(CONFIG_CPU_THERMAL)	+= cpu_cooling.o
 # clock cooling
 thermal_sys-$(CONFIG_CLOCK_THERMAL)	+= clock_cooling.o
 
+# devfreq cooling
+thermal_sys-$(CONFIG_DEVFREQ_THERMAL) += devfreq_cooling.o
+
 # platform thermal drivers
 obj-$(CONFIG_SPEAR_THERMAL)	+= spear_thermal.o
 obj-$(CONFIG_ROCKCHIP_THERMAL)	+= rockchip_thermal.o
diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c
index 5db5e91fd12f..54c18b2b91e0 100644
--- a/drivers/thermal/cpu_cooling.c
+++ b/drivers/thermal/cpu_cooling.c
@@ -26,10 +26,13 @@
 #include <linux/thermal.h>
 #include <linux/cpufreq.h>
 #include <linux/err.h>
+#include <linux/pm_opp.h>
 #include <linux/slab.h>
 #include <linux/cpu.h>
 #include <linux/cpu_cooling.h>
 
+#include <trace/events/thermal.h>
+
 /*
  * Cooling state <-> CPUFreq frequency
  *
@@ -45,6 +48,19 @@
  */
 
 /**
+ * struct power_table - frequency to power conversion
+ * @frequency:	frequency in KHz
+ * @power:	power in mW
+ *
+ * This structure is built when the cooling device registers and helps
+ * in translating frequency to power and viceversa.
+ */
+struct power_table {
+	u32 frequency;
+	u32 power;
+};
+
+/**
  * struct cpufreq_cooling_device - data for cooling device with cpufreq
  * @id: unique integer value corresponding to each cpufreq_cooling_device
  *	registered.
@@ -58,6 +74,15 @@
  *	cpufreq frequencies.
  * @allowed_cpus: all the cpus involved for this cpufreq_cooling_device.
  * @node: list_head to link all cpufreq_cooling_device together.
+ * @last_load: load measured by the latest call to cpufreq_get_actual_power()
+ * @time_in_idle: previous reading of the absolute time that this cpu was idle
+ * @time_in_idle_timestamp: wall time of the last invocation of
+ *	get_cpu_idle_time_us()
+ * @dyn_power_table: array of struct power_table for frequency to power
+ *	conversion, sorted in ascending order.
+ * @dyn_power_table_entries: number of entries in the @dyn_power_table array
+ * @cpu_dev: the first cpu_device from @allowed_cpus that has OPPs registered
+ * @plat_get_static_power: callback to calculate the static power
  *
  * This structure is required for keeping information of each registered
  * cpufreq_cooling_device.
@@ -71,6 +96,13 @@ struct cpufreq_cooling_device {
 	unsigned int *freq_table;	/* In descending order */
 	struct cpumask allowed_cpus;
 	struct list_head node;
+	u32 last_load;
+	u64 *time_in_idle;
+	u64 *time_in_idle_timestamp;
+	struct power_table *dyn_power_table;
+	int dyn_power_table_entries;
+	struct device *cpu_dev;
+	get_static_t plat_get_static_power;
 };
 static DEFINE_IDR(cpufreq_idr);
 static DEFINE_MUTEX(cooling_cpufreq_lock);
@@ -186,23 +218,237 @@ static int cpufreq_thermal_notifier(struct notifier_block *nb,
 	unsigned long max_freq = 0;
 	struct cpufreq_cooling_device *cpufreq_dev;
 
-	if (event != CPUFREQ_ADJUST)
-		return 0;
+	switch (event) {
 
-	mutex_lock(&cooling_cpufreq_lock);
-	list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) {
-		if (!cpumask_test_cpu(policy->cpu,
-					&cpufreq_dev->allowed_cpus))
+	case CPUFREQ_ADJUST:
+		mutex_lock(&cooling_cpufreq_lock);
+		list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) {
+			if (!cpumask_test_cpu(policy->cpu,
+					      &cpufreq_dev->allowed_cpus))
+				continue;
+
+			max_freq = cpufreq_dev->clipped_freq;
+
+			if (policy->max != max_freq)
+				cpufreq_verify_within_limits(policy, 0,
+							     max_freq);
+		}
+		mutex_unlock(&cooling_cpufreq_lock);
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+
+	return NOTIFY_OK;
+}
+
+/**
+ * build_dyn_power_table() - create a dynamic power to frequency table
+ * @cpufreq_device:	the cpufreq cooling device in which to store the table
+ * @capacitance: dynamic power coefficient for these cpus
+ *
+ * Build a dynamic power to frequency table for this cpu and store it
+ * in @cpufreq_device.  This table will be used in cpu_power_to_freq() and
+ * cpu_freq_to_power() to convert between power and frequency
+ * efficiently.  Power is stored in mW, frequency in KHz.  The
+ * resulting table is in ascending order.
+ *
+ * Return: 0 on success, -E* on error.
+ */
+static int build_dyn_power_table(struct cpufreq_cooling_device *cpufreq_device,
+				 u32 capacitance)
+{
+	struct power_table *power_table;
+	struct dev_pm_opp *opp;
+	struct device *dev = NULL;
+	int num_opps = 0, cpu, i, ret = 0;
+	unsigned long freq;
+
+	rcu_read_lock();
+
+	for_each_cpu(cpu, &cpufreq_device->allowed_cpus) {
+		dev = get_cpu_device(cpu);
+		if (!dev) {
+			dev_warn(&cpufreq_device->cool_dev->device,
+				 "No cpu device for cpu %d\n", cpu);
 			continue;
+		}
+
+		num_opps = dev_pm_opp_get_opp_count(dev);
+		if (num_opps > 0) {
+			break;
+		} else if (num_opps < 0) {
+			ret = num_opps;
+			goto unlock;
+		}
+	}
 
-		max_freq = cpufreq_dev->clipped_freq;
+	if (num_opps == 0) {
+		ret = -EINVAL;
+		goto unlock;
+	}
 
-		if (policy->max != max_freq)
-			cpufreq_verify_within_limits(policy, 0, max_freq);
+	power_table = kcalloc(num_opps, sizeof(*power_table), GFP_KERNEL);
+	if (!power_table) {
+		ret = -ENOMEM;
+		goto unlock;
 	}
-	mutex_unlock(&cooling_cpufreq_lock);
 
-	return 0;
+	for (freq = 0, i = 0;
+	     opp = dev_pm_opp_find_freq_ceil(dev, &freq), !IS_ERR(opp);
+	     freq++, i++) {
+		u32 freq_mhz, voltage_mv;
+		u64 power;
+
+		freq_mhz = freq / 1000000;
+		voltage_mv = dev_pm_opp_get_voltage(opp) / 1000;
+
+		/*
+		 * Do the multiplication with MHz and millivolt so as
+		 * to not overflow.
+		 */
+		power = (u64)capacitance * freq_mhz * voltage_mv * voltage_mv;
+		do_div(power, 1000000000);
+
+		/* frequency is stored in power_table in KHz */
+		power_table[i].frequency = freq / 1000;
+
+		/* power is stored in mW */
+		power_table[i].power = power;
+	}
+
+	if (i == 0) {
+		ret = PTR_ERR(opp);
+		goto unlock;
+	}
+
+	cpufreq_device->cpu_dev = dev;
+	cpufreq_device->dyn_power_table = power_table;
+	cpufreq_device->dyn_power_table_entries = i;
+
+unlock:
+	rcu_read_unlock();
+	return ret;
+}
+
+static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_device,
+			     u32 freq)
+{
+	int i;
+	struct power_table *pt = cpufreq_device->dyn_power_table;
+
+	for (i = 1; i < cpufreq_device->dyn_power_table_entries; i++)
+		if (freq < pt[i].frequency)
+			break;
+
+	return pt[i - 1].power;
+}
+
+static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_device,
+			     u32 power)
+{
+	int i;
+	struct power_table *pt = cpufreq_device->dyn_power_table;
+
+	for (i = 1; i < cpufreq_device->dyn_power_table_entries; i++)
+		if (power < pt[i].power)
+			break;
+
+	return pt[i - 1].frequency;
+}
+
+/**
+ * get_load() - get load for a cpu since last updated
+ * @cpufreq_device:	&struct cpufreq_cooling_device for this cpu
+ * @cpu:	cpu number
+ *
+ * Return: The average load of cpu @cpu in percentage since this
+ * function was last called.
+ */
+static u32 get_load(struct cpufreq_cooling_device *cpufreq_device, int cpu)
+{
+	u32 load;
+	u64 now, now_idle, delta_time, delta_idle;
+
+	now_idle = get_cpu_idle_time(cpu, &now, 0);
+	delta_idle = now_idle - cpufreq_device->time_in_idle[cpu];
+	delta_time = now - cpufreq_device->time_in_idle_timestamp[cpu];
+
+	if (delta_time <= delta_idle)
+		load = 0;
+	else
+		load = div64_u64(100 * (delta_time - delta_idle), delta_time);
+
+	cpufreq_device->time_in_idle[cpu] = now_idle;
+	cpufreq_device->time_in_idle_timestamp[cpu] = now;
+
+	return load;
+}
+
+/**
+ * get_static_power() - calculate the static power consumed by the cpus
+ * @cpufreq_device:	struct &cpufreq_cooling_device for this cpu cdev
+ * @tz:		thermal zone device in which we're operating
+ * @freq:	frequency in KHz
+ * @power:	pointer in which to store the calculated static power
+ *
+ * Calculate the static power consumed by the cpus described by
+ * @cpu_actor running at frequency @freq.  This function relies on a
+ * platform specific function that should have been provided when the
+ * actor was registered.  If it wasn't, the static power is assumed to
+ * be negligible.  The calculated static power is stored in @power.
+ *
+ * Return: 0 on success, -E* on failure.
+ */
+static int get_static_power(struct cpufreq_cooling_device *cpufreq_device,
+			    struct thermal_zone_device *tz, unsigned long freq,
+			    u32 *power)
+{
+	struct dev_pm_opp *opp;
+	unsigned long voltage;
+	struct cpumask *cpumask = &cpufreq_device->allowed_cpus;
+	unsigned long freq_hz = freq * 1000;
+
+	if (!cpufreq_device->plat_get_static_power ||
+	    !cpufreq_device->cpu_dev) {
+		*power = 0;
+		return 0;
+	}
+
+	rcu_read_lock();
+
+	opp = dev_pm_opp_find_freq_exact(cpufreq_device->cpu_dev, freq_hz,
+					 true);
+	voltage = dev_pm_opp_get_voltage(opp);
+
+	rcu_read_unlock();
+
+	if (voltage == 0) {
+		dev_warn_ratelimited(cpufreq_device->cpu_dev,
+				     "Failed to get voltage for frequency %lu: %ld\n",
+				     freq_hz, IS_ERR(opp) ? PTR_ERR(opp) : 0);
+		return -EINVAL;
+	}
+
+	return cpufreq_device->plat_get_static_power(cpumask, tz->passive_delay,
+						     voltage, power);
+}
+
+/**
+ * get_dynamic_power() - calculate the dynamic power
+ * @cpufreq_device:	&cpufreq_cooling_device for this cdev
+ * @freq:	current frequency
+ *
+ * Return: the dynamic power consumed by the cpus described by
+ * @cpufreq_device.
+ */
+static u32 get_dynamic_power(struct cpufreq_cooling_device *cpufreq_device,
+			     unsigned long freq)
+{
+	u32 raw_cpu_power;
+
+	raw_cpu_power = cpu_freq_to_power(cpufreq_device, freq);
+	return (raw_cpu_power * cpufreq_device->last_load) / 100;
 }
 
 /* cpufreq cooling device callback functions are defined below */
@@ -280,8 +526,205 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
 	return 0;
 }
 
+/**
+ * cpufreq_get_requested_power() - get the current power
+ * @cdev:	&thermal_cooling_device pointer
+ * @tz:		a valid thermal zone device pointer
+ * @power:	pointer in which to store the resulting power
+ *
+ * Calculate the current power consumption of the cpus in milliwatts
+ * and store it in @power.  This function should actually calculate
+ * the requested power, but it's hard to get the frequency that
+ * cpufreq would have assigned if there were no thermal limits.
+ * Instead, we calculate the current power on the assumption that the
+ * immediate future will look like the immediate past.
+ *
+ * We use the current frequency and the average load since this
+ * function was last called.  In reality, there could have been
+ * multiple opps since this function was last called and that affects
+ * the load calculation.  While it's not perfectly accurate, this
+ * simplification is good enough and works.  REVISIT this, as more
+ * complex code may be needed if experiments show that it's not
+ * accurate enough.
+ *
+ * Return: 0 on success, -E* if getting the static power failed.
+ */
+static int cpufreq_get_requested_power(struct thermal_cooling_device *cdev,
+				       struct thermal_zone_device *tz,
+				       u32 *power)
+{
+	unsigned long freq;
+	int i = 0, cpu, ret;
+	u32 static_power, dynamic_power, total_load = 0;
+	struct cpufreq_cooling_device *cpufreq_device = cdev->devdata;
+	u32 *load_cpu = NULL;
+
+	cpu = cpumask_any_and(&cpufreq_device->allowed_cpus, cpu_online_mask);
+
+	/*
+	 * All the CPUs are offline, thus the requested power by
+	 * the cdev is 0
+	 */
+	if (cpu >= nr_cpu_ids) {
+		*power = 0;
+		return 0;
+	}
+
+	freq = cpufreq_quick_get(cpu);
+
+	if (trace_thermal_power_cpu_get_power_enabled()) {
+		u32 ncpus = cpumask_weight(&cpufreq_device->allowed_cpus);
+
+		load_cpu = devm_kcalloc(&cdev->device, ncpus, sizeof(*load_cpu),
+					GFP_KERNEL);
+	}
+
+	for_each_cpu(cpu, &cpufreq_device->allowed_cpus) {
+		u32 load;
+
+		if (cpu_online(cpu))
+			load = get_load(cpufreq_device, cpu);
+		else
+			load = 0;
+
+		total_load += load;
+		if (trace_thermal_power_cpu_limit_enabled() && load_cpu)
+			load_cpu[i] = load;
+
+		i++;
+	}
+
+	cpufreq_device->last_load = total_load;
+
+	dynamic_power = get_dynamic_power(cpufreq_device, freq);
+	ret = get_static_power(cpufreq_device, tz, freq, &static_power);
+	if (ret) {
+		if (load_cpu)
+			devm_kfree(&cdev->device, load_cpu);
+		return ret;
+	}
+
+	if (load_cpu) {
+		trace_thermal_power_cpu_get_power(
+			&cpufreq_device->allowed_cpus,
+			freq, load_cpu, i, dynamic_power, static_power);
+
+		devm_kfree(&cdev->device, load_cpu);
+	}
+
+	*power = static_power + dynamic_power;
+	return 0;
+}
+
+/**
+ * cpufreq_state2power() - convert a cpu cdev state to power consumed
+ * @cdev:	&thermal_cooling_device pointer
+ * @tz:		a valid thermal zone device pointer
+ * @state:	cooling device state to be converted
+ * @power:	pointer in which to store the resulting power
+ *
+ * Convert cooling device state @state into power consumption in
+ * milliwatts assuming 100% load.  Store the calculated power in
+ * @power.
+ *
+ * Return: 0 on success, -EINVAL if the cooling device state could not
+ * be converted into a frequency or other -E* if there was an error
+ * when calculating the static power.
+ */
+static int cpufreq_state2power(struct thermal_cooling_device *cdev,
+			       struct thermal_zone_device *tz,
+			       unsigned long state, u32 *power)
+{
+	unsigned int freq, num_cpus;
+	cpumask_t cpumask;
+	u32 static_power, dynamic_power;
+	int ret;
+	struct cpufreq_cooling_device *cpufreq_device = cdev->devdata;
+
+	cpumask_and(&cpumask, &cpufreq_device->allowed_cpus, cpu_online_mask);
+	num_cpus = cpumask_weight(&cpumask);
+
+	/* None of our cpus are online, so no power */
+	if (num_cpus == 0) {
+		*power = 0;
+		return 0;
+	}
+
+	freq = cpufreq_device->freq_table[state];
+	if (!freq)
+		return -EINVAL;
+
+	dynamic_power = cpu_freq_to_power(cpufreq_device, freq) * num_cpus;
+	ret = get_static_power(cpufreq_device, tz, freq, &static_power);
+	if (ret)
+		return ret;
+
+	*power = static_power + dynamic_power;
+	return 0;
+}
+
+/**
+ * cpufreq_power2state() - convert power to a cooling device state
+ * @cdev:	&thermal_cooling_device pointer
+ * @tz:		a valid thermal zone device pointer
+ * @power:	power in milliwatts to be converted
+ * @state:	pointer in which to store the resulting state
+ *
+ * Calculate a cooling device state for the cpus described by @cdev
+ * that would allow them to consume at most @power mW and store it in
+ * @state.  Note that this calculation depends on external factors
+ * such as the cpu load or the current static power.  Calling this
+ * function with the same power as input can yield different cooling
+ * device states depending on those external factors.
+ *
+ * Return: 0 on success, -ENODEV if no cpus are online or -EINVAL if
+ * the calculated frequency could not be converted to a valid state.
+ * The latter should not happen unless the frequencies available to
+ * cpufreq have changed since the initialization of the cpu cooling
+ * device.
+ */
+static int cpufreq_power2state(struct thermal_cooling_device *cdev,
+			       struct thermal_zone_device *tz, u32 power,
+			       unsigned long *state)
+{
+	unsigned int cpu, cur_freq, target_freq;
+	int ret;
+	s32 dyn_power;
+	u32 last_load, normalised_power, static_power;
+	struct cpufreq_cooling_device *cpufreq_device = cdev->devdata;
+
+	cpu = cpumask_any_and(&cpufreq_device->allowed_cpus, cpu_online_mask);
+
+	/* None of our cpus are online */
+	if (cpu >= nr_cpu_ids)
+		return -ENODEV;
+
+	cur_freq = cpufreq_quick_get(cpu);
+	ret = get_static_power(cpufreq_device, tz, cur_freq, &static_power);
+	if (ret)
+		return ret;
+
+	dyn_power = power - static_power;
+	dyn_power = dyn_power > 0 ? dyn_power : 0;
+	last_load = cpufreq_device->last_load ?: 1;
+	normalised_power = (dyn_power * 100) / last_load;
+	target_freq = cpu_power_to_freq(cpufreq_device, normalised_power);
+
+	*state = cpufreq_cooling_get_level(cpu, target_freq);
+	if (*state == THERMAL_CSTATE_INVALID) {
+		dev_warn_ratelimited(&cdev->device,
+				     "Failed to convert %dKHz for cpu %d into a cdev state\n",
+				     target_freq, cpu);
+		return -EINVAL;
+	}
+
+	trace_thermal_power_cpu_limit(&cpufreq_device->allowed_cpus,
+				      target_freq, *state, power);
+	return 0;
+}
+
 /* Bind cpufreq callbacks to thermal cooling device ops */
-static struct thermal_cooling_device_ops const cpufreq_cooling_ops = {
+static struct thermal_cooling_device_ops cpufreq_cooling_ops = {
 	.get_max_state = cpufreq_get_max_state,
 	.get_cur_state = cpufreq_get_cur_state,
 	.set_cur_state = cpufreq_set_cur_state,
@@ -311,6 +754,9 @@ static unsigned int find_next_max(struct cpufreq_frequency_table *table,
  * @np: a valid struct device_node to the cooling device device tree node
  * @clip_cpus: cpumask of cpus where the frequency constraints will happen.
  * Normally this should be same as cpufreq policy->related_cpus.
+ * @capacitance: dynamic power coefficient for these cpus
+ * @plat_static_func: function to calculate the static power consumed by these
+ *                    cpus (optional)
  *
  * This interface function registers the cpufreq cooling device with the name
  * "thermal-cpufreq-%x". This api can support multiple instances of cpufreq
@@ -322,13 +768,14 @@ static unsigned int find_next_max(struct cpufreq_frequency_table *table,
  */
 static struct thermal_cooling_device *
 __cpufreq_cooling_register(struct device_node *np,
-			   const struct cpumask *clip_cpus)
+			const struct cpumask *clip_cpus, u32 capacitance,
+			get_static_t plat_static_func)
 {
 	struct thermal_cooling_device *cool_dev;
 	struct cpufreq_cooling_device *cpufreq_dev;
 	char dev_name[THERMAL_NAME_LENGTH];
 	struct cpufreq_frequency_table *pos, *table;
-	unsigned int freq, i;
+	unsigned int freq, i, num_cpus;
 	int ret;
 
 	table = cpufreq_frequency_get_table(cpumask_first(clip_cpus));
@@ -341,6 +788,23 @@ __cpufreq_cooling_register(struct device_node *np,
 	if (!cpufreq_dev)
 		return ERR_PTR(-ENOMEM);
 
+	num_cpus = cpumask_weight(clip_cpus);
+	cpufreq_dev->time_in_idle = kcalloc(num_cpus,
+					    sizeof(*cpufreq_dev->time_in_idle),
+					    GFP_KERNEL);
+	if (!cpufreq_dev->time_in_idle) {
+		cool_dev = ERR_PTR(-ENOMEM);
+		goto free_cdev;
+	}
+
+	cpufreq_dev->time_in_idle_timestamp =
+		kcalloc(num_cpus, sizeof(*cpufreq_dev->time_in_idle_timestamp),
+			GFP_KERNEL);
+	if (!cpufreq_dev->time_in_idle_timestamp) {
+		cool_dev = ERR_PTR(-ENOMEM);
+		goto free_time_in_idle;
+	}
+
 	/* Find max levels */
 	cpufreq_for_each_valid_entry(pos, table)
 		cpufreq_dev->max_level++;
@@ -349,7 +813,7 @@ __cpufreq_cooling_register(struct device_node *np,
 					  cpufreq_dev->max_level, GFP_KERNEL);
 	if (!cpufreq_dev->freq_table) {
 		cool_dev = ERR_PTR(-ENOMEM);
-		goto free_cdev;
+		goto free_time_in_idle_timestamp;
 	}
 
 	/* max_level is an index, not a counter */
@@ -357,6 +821,20 @@ __cpufreq_cooling_register(struct device_node *np,
 
 	cpumask_copy(&cpufreq_dev->allowed_cpus, clip_cpus);
 
+	if (capacitance) {
+		cpufreq_cooling_ops.get_requested_power =
+			cpufreq_get_requested_power;
+		cpufreq_cooling_ops.state2power = cpufreq_state2power;
+		cpufreq_cooling_ops.power2state = cpufreq_power2state;
+		cpufreq_dev->plat_get_static_power = plat_static_func;
+
+		ret = build_dyn_power_table(cpufreq_dev, capacitance);
+		if (ret) {
+			cool_dev = ERR_PTR(ret);
+			goto free_table;
+		}
+	}
+
 	ret = get_idr(&cpufreq_idr, &cpufreq_dev->id);
 	if (ret) {
 		cool_dev = ERR_PTR(ret);
@@ -402,6 +880,10 @@ remove_idr:
 	release_idr(&cpufreq_idr, cpufreq_dev->id);
 free_table:
 	kfree(cpufreq_dev->freq_table);
+free_time_in_idle_timestamp:
+	kfree(cpufreq_dev->time_in_idle_timestamp);
+free_time_in_idle:
+	kfree(cpufreq_dev->time_in_idle);
 free_cdev:
 	kfree(cpufreq_dev);
 
@@ -422,7 +904,7 @@ free_cdev:
 struct thermal_cooling_device *
 cpufreq_cooling_register(const struct cpumask *clip_cpus)
 {
-	return __cpufreq_cooling_register(NULL, clip_cpus);
+	return __cpufreq_cooling_register(NULL, clip_cpus, 0, NULL);
 }
 EXPORT_SYMBOL_GPL(cpufreq_cooling_register);
 
@@ -446,11 +928,78 @@ of_cpufreq_cooling_register(struct device_node *np,
 	if (!np)
 		return ERR_PTR(-EINVAL);
 
-	return __cpufreq_cooling_register(np, clip_cpus);
+	return __cpufreq_cooling_register(np, clip_cpus, 0, NULL);
 }
 EXPORT_SYMBOL_GPL(of_cpufreq_cooling_register);
 
 /**
+ * cpufreq_power_cooling_register() - create cpufreq cooling device with power extensions
+ * @clip_cpus:	cpumask of cpus where the frequency constraints will happen
+ * @capacitance:	dynamic power coefficient for these cpus
+ * @plat_static_func:	function to calculate the static power consumed by these
+ *			cpus (optional)
+ *
+ * This interface function registers the cpufreq cooling device with
+ * the name "thermal-cpufreq-%x".  This api can support multiple
+ * instances of cpufreq cooling devices.  Using this function, the
+ * cooling device will implement the power extensions by using a
+ * simple cpu power model.  The cpus must have registered their OPPs
+ * using the OPP library.
+ *
+ * An optional @plat_static_func may be provided to calculate the
+ * static power consumed by these cpus.  If the platform's static
+ * power consumption is unknown or negligible, make it NULL.
+ *
+ * Return: a valid struct thermal_cooling_device pointer on success,
+ * on failure, it returns a corresponding ERR_PTR().
+ */
+struct thermal_cooling_device *
+cpufreq_power_cooling_register(const struct cpumask *clip_cpus, u32 capacitance,
+			       get_static_t plat_static_func)
+{
+	return __cpufreq_cooling_register(NULL, clip_cpus, capacitance,
+				plat_static_func);
+}
+EXPORT_SYMBOL(cpufreq_power_cooling_register);
+
+/**
+ * of_cpufreq_power_cooling_register() - create cpufreq cooling device with power extensions
+ * @np:	a valid struct device_node to the cooling device device tree node
+ * @clip_cpus:	cpumask of cpus where the frequency constraints will happen
+ * @capacitance:	dynamic power coefficient for these cpus
+ * @plat_static_func:	function to calculate the static power consumed by these
+ *			cpus (optional)
+ *
+ * This interface function registers the cpufreq cooling device with
+ * the name "thermal-cpufreq-%x".  This api can support multiple
+ * instances of cpufreq cooling devices.  Using this API, the cpufreq
+ * cooling device will be linked to the device tree node provided.
+ * Using this function, the cooling device will implement the power
+ * extensions by using a simple cpu power model.  The cpus must have
+ * registered their OPPs using the OPP library.
+ *
+ * An optional @plat_static_func may be provided to calculate the
+ * static power consumed by these cpus.  If the platform's static
+ * power consumption is unknown or negligible, make it NULL.
+ *
+ * Return: a valid struct thermal_cooling_device pointer on success,
+ * on failure, it returns a corresponding ERR_PTR().
+ */
+struct thermal_cooling_device *
+of_cpufreq_power_cooling_register(struct device_node *np,
+				  const struct cpumask *clip_cpus,
+				  u32 capacitance,
+				  get_static_t plat_static_func)
+{
+	if (!np)
+		return ERR_PTR(-EINVAL);
+
+	return __cpufreq_cooling_register(np, clip_cpus, capacitance,
+				plat_static_func);
+}
+EXPORT_SYMBOL(of_cpufreq_power_cooling_register);
+
+/**
  * cpufreq_cooling_unregister - function to remove cpufreq cooling device.
  * @cdev: thermal cooling device pointer.
  *
@@ -475,6 +1024,8 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
 
 	thermal_cooling_device_unregister(cpufreq_dev->cool_dev);
 	release_idr(&cpufreq_idr, cpufreq_dev->id);
+	kfree(cpufreq_dev->time_in_idle_timestamp);
+	kfree(cpufreq_dev->time_in_idle);
 	kfree(cpufreq_dev->freq_table);
 	kfree(cpufreq_dev);
 }
diff --git a/drivers/thermal/db8500_thermal.c b/drivers/thermal/db8500_thermal.c
index 20adfbe27df1..2fb273c4baa9 100644
--- a/drivers/thermal/db8500_thermal.c
+++ b/drivers/thermal/db8500_thermal.c
@@ -76,7 +76,7 @@ static int db8500_cdev_bind(struct thermal_zone_device *thermal,
 		upper = lower = i > max_state ? max_state : i;
 
 		ret = thermal_zone_bind_cooling_device(thermal, i, cdev,
-			upper, lower);
+			upper, lower, THERMAL_WEIGHT_DEFAULT);
 
 		dev_info(&cdev->device, "%s bind to %d: %d-%s\n", cdev->type,
 			i, ret, ret ? "fail" : "succeed");
diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c
new file mode 100644
index 000000000000..01f0015f80dc
--- /dev/null
+++ b/drivers/thermal/devfreq_cooling.c
@@ -0,0 +1,573 @@
+/*
+ * devfreq_cooling: Thermal cooling device implementation for devices using
+ *                  devfreq
+ *
+ * Copyright (C) 2014-2015 ARM Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed "as is" WITHOUT ANY WARRANTY of any
+ * kind, whether express or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * TODO:
+ *    - If OPPs are added or removed after devfreq cooling has
+ *      registered, the devfreq cooling won't react to it.
+ */
+
+#include <linux/devfreq.h>
+#include <linux/devfreq_cooling.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/pm_opp.h>
+#include <linux/thermal.h>
+
+#include <trace/events/thermal.h>
+
+static DEFINE_MUTEX(devfreq_lock);
+static DEFINE_IDR(devfreq_idr);
+
+/**
+ * struct devfreq_cooling_device - Devfreq cooling device
+ * @id:		unique integer value corresponding to each
+ *		devfreq_cooling_device registered.
+ * @cdev:	Pointer to associated thermal cooling device.
+ * @devfreq:	Pointer to associated devfreq device.
+ * @cooling_state:	Current cooling state.
+ * @power_table:	Pointer to table with maximum power draw for each
+ *			cooling state. State is the index into the table, and
+ *			the power is in mW.
+ * @freq_table:	Pointer to a table with the frequencies sorted in descending
+ *		order.  You can index the table by cooling device state
+ * @freq_table_size:	Size of the @freq_table and @power_table
+ * @power_ops:	Pointer to devfreq_cooling_power, used to generate the
+ *		@power_table.
+ */
+struct devfreq_cooling_device {
+	int id;
+	struct thermal_cooling_device *cdev;
+	struct devfreq *devfreq;
+	unsigned long cooling_state;
+	u32 *power_table;
+	u32 *freq_table;
+	size_t freq_table_size;
+	struct devfreq_cooling_power *power_ops;
+};
+
+/**
+ * get_idr - function to get a unique id.
+ * @idr: struct idr * handle used to create a id.
+ * @id: int * value generated by this function.
+ *
+ * This function will populate @id with an unique
+ * id, using the idr API.
+ *
+ * Return: 0 on success, an error code on failure.
+ */
+static int get_idr(struct idr *idr, int *id)
+{
+	int ret;
+
+	mutex_lock(&devfreq_lock);
+	ret = idr_alloc(idr, NULL, 0, 0, GFP_KERNEL);
+	mutex_unlock(&devfreq_lock);
+	if (unlikely(ret < 0))
+		return ret;
+	*id = ret;
+
+	return 0;
+}
+
+/**
+ * release_idr - function to free the unique id.
+ * @idr: struct idr * handle used for creating the id.
+ * @id: int value representing the unique id.
+ */
+static void release_idr(struct idr *idr, int id)
+{
+	mutex_lock(&devfreq_lock);
+	idr_remove(idr, id);
+	mutex_unlock(&devfreq_lock);
+}
+
+/**
+ * partition_enable_opps() - disable all opps above a given state
+ * @dfc:	Pointer to devfreq we are operating on
+ * @cdev_state:	cooling device state we're setting
+ *
+ * Go through the OPPs of the device, enabling all OPPs until
+ * @cdev_state and disabling those frequencies above it.
+ */
+static int partition_enable_opps(struct devfreq_cooling_device *dfc,
+				 unsigned long cdev_state)
+{
+	int i;
+	struct device *dev = dfc->devfreq->dev.parent;
+
+	for (i = 0; i < dfc->freq_table_size; i++) {
+		struct dev_pm_opp *opp;
+		int ret = 0;
+		unsigned int freq = dfc->freq_table[i];
+		bool want_enable = i >= cdev_state ? true : false;
+
+		rcu_read_lock();
+		opp = dev_pm_opp_find_freq_exact(dev, freq, !want_enable);
+		rcu_read_unlock();
+
+		if (PTR_ERR(opp) == -ERANGE)
+			continue;
+		else if (IS_ERR(opp))
+			return PTR_ERR(opp);
+
+		if (want_enable)
+			ret = dev_pm_opp_enable(dev, freq);
+		else
+			ret = dev_pm_opp_disable(dev, freq);
+
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int devfreq_cooling_get_max_state(struct thermal_cooling_device *cdev,
+					 unsigned long *state)
+{
+	struct devfreq_cooling_device *dfc = cdev->devdata;
+
+	*state = dfc->freq_table_size - 1;
+
+	return 0;
+}
+
+static int devfreq_cooling_get_cur_state(struct thermal_cooling_device *cdev,
+					 unsigned long *state)
+{
+	struct devfreq_cooling_device *dfc = cdev->devdata;
+
+	*state = dfc->cooling_state;
+
+	return 0;
+}
+
+static int devfreq_cooling_set_cur_state(struct thermal_cooling_device *cdev,
+					 unsigned long state)
+{
+	struct devfreq_cooling_device *dfc = cdev->devdata;
+	struct devfreq *df = dfc->devfreq;
+	struct device *dev = df->dev.parent;
+	int ret;
+
+	if (state == dfc->cooling_state)
+		return 0;
+
+	dev_dbg(dev, "Setting cooling state %lu\n", state);
+
+	if (state >= dfc->freq_table_size)
+		return -EINVAL;
+
+	ret = partition_enable_opps(dfc, state);
+	if (ret)
+		return ret;
+
+	dfc->cooling_state = state;
+
+	return 0;
+}
+
+/**
+ * freq_get_state() - get the cooling state corresponding to a frequency
+ * @dfc:	Pointer to devfreq cooling device
+ * @freq:	frequency in Hz
+ *
+ * Return: the cooling state associated with the @freq, or
+ * THERMAL_CSTATE_INVALID if it wasn't found.
+ */
+static unsigned long
+freq_get_state(struct devfreq_cooling_device *dfc, unsigned long freq)
+{
+	int i;
+
+	for (i = 0; i < dfc->freq_table_size; i++) {
+		if (dfc->freq_table[i] == freq)
+			return i;
+	}
+
+	return THERMAL_CSTATE_INVALID;
+}
+
+/**
+ * get_static_power() - calculate the static power
+ * @dfc:	Pointer to devfreq cooling device
+ * @freq:	Frequency in Hz
+ *
+ * Calculate the static power in milliwatts using the supplied
+ * get_static_power().  The current voltage is calculated using the
+ * OPP library.  If no get_static_power() was supplied, assume the
+ * static power is negligible.
+ */
+static unsigned long
+get_static_power(struct devfreq_cooling_device *dfc, unsigned long freq)
+{
+	struct devfreq *df = dfc->devfreq;
+	struct device *dev = df->dev.parent;
+	unsigned long voltage;
+	struct dev_pm_opp *opp;
+
+	if (!dfc->power_ops->get_static_power)
+		return 0;
+
+	rcu_read_lock();
+
+	opp = dev_pm_opp_find_freq_exact(dev, freq, true);
+	if (IS_ERR(opp) && (PTR_ERR(opp) == -ERANGE))
+		opp = dev_pm_opp_find_freq_exact(dev, freq, false);
+
+	voltage = dev_pm_opp_get_voltage(opp) / 1000; /* mV */
+
+	rcu_read_unlock();
+
+	if (voltage == 0) {
+		dev_warn_ratelimited(dev,
+				     "Failed to get voltage for frequency %lu: %ld\n",
+				     freq, IS_ERR(opp) ? PTR_ERR(opp) : 0);
+		return 0;
+	}
+
+	return dfc->power_ops->get_static_power(voltage);
+}
+
+/**
+ * get_dynamic_power - calculate the dynamic power
+ * @dfc:	Pointer to devfreq cooling device
+ * @freq:	Frequency in Hz
+ * @voltage:	Voltage in millivolts
+ *
+ * Calculate the dynamic power in milliwatts consumed by the device at
+ * frequency @freq and voltage @voltage.  If the get_dynamic_power()
+ * was supplied as part of the devfreq_cooling_power struct, then that
+ * function is used.  Otherwise, a simple power model (Pdyn = Coeff *
+ * Voltage^2 * Frequency) is used.
+ */
+static unsigned long
+get_dynamic_power(struct devfreq_cooling_device *dfc, unsigned long freq,
+		  unsigned long voltage)
+{
+	u64 power;
+	u32 freq_mhz;
+	struct devfreq_cooling_power *dfc_power = dfc->power_ops;
+
+	if (dfc_power->get_dynamic_power)
+		return dfc_power->get_dynamic_power(freq, voltage);
+
+	freq_mhz = freq / 1000000;
+	power = (u64)dfc_power->dyn_power_coeff * freq_mhz * voltage * voltage;
+	do_div(power, 1000000000);
+
+	return power;
+}
+
+static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cdev,
+					       struct thermal_zone_device *tz,
+					       u32 *power)
+{
+	struct devfreq_cooling_device *dfc = cdev->devdata;
+	struct devfreq *df = dfc->devfreq;
+	struct devfreq_dev_status *status = &df->last_status;
+	unsigned long state;
+	unsigned long freq = status->current_frequency;
+	u32 dyn_power, static_power;
+
+	/* Get dynamic power for state */
+	state = freq_get_state(dfc, freq);
+	if (state == THERMAL_CSTATE_INVALID)
+		return -EAGAIN;
+
+	dyn_power = dfc->power_table[state];
+
+	/* Scale dynamic power for utilization */
+	dyn_power = (dyn_power * status->busy_time) / status->total_time;
+
+	/* Get static power */
+	static_power = get_static_power(dfc, freq);
+
+	trace_thermal_power_devfreq_get_power(cdev, status, freq, dyn_power,
+					      static_power);
+
+	*power = dyn_power + static_power;
+
+	return 0;
+}
+
+static int devfreq_cooling_state2power(struct thermal_cooling_device *cdev,
+				       struct thermal_zone_device *tz,
+				       unsigned long state,
+				       u32 *power)
+{
+	struct devfreq_cooling_device *dfc = cdev->devdata;
+	unsigned long freq;
+	u32 static_power;
+
+	if (state < 0 || state >= dfc->freq_table_size)
+		return -EINVAL;
+
+	freq = dfc->freq_table[state];
+	static_power = get_static_power(dfc, freq);
+
+	*power = dfc->power_table[state] + static_power;
+	return 0;
+}
+
+static int devfreq_cooling_power2state(struct thermal_cooling_device *cdev,
+				       struct thermal_zone_device *tz,
+				       u32 power, unsigned long *state)
+{
+	struct devfreq_cooling_device *dfc = cdev->devdata;
+	struct devfreq *df = dfc->devfreq;
+	struct devfreq_dev_status *status = &df->last_status;
+	unsigned long freq = status->current_frequency;
+	unsigned long busy_time;
+	s32 dyn_power;
+	u32 static_power;
+	int i;
+
+	static_power = get_static_power(dfc, freq);
+
+	dyn_power = power - static_power;
+	dyn_power = dyn_power > 0 ? dyn_power : 0;
+
+	/* Scale dynamic power for utilization */
+	busy_time = status->busy_time ?: 1;
+	dyn_power = (dyn_power * status->total_time) / busy_time;
+
+	/*
+	 * Find the first cooling state that is within the power
+	 * budget for dynamic power.
+	 */
+	for (i = 0; i < dfc->freq_table_size - 1; i++)
+		if (dyn_power >= dfc->power_table[i])
+			break;
+
+	*state = i;
+	trace_thermal_power_devfreq_limit(cdev, freq, *state, power);
+	return 0;
+}
+
+static struct thermal_cooling_device_ops devfreq_cooling_ops = {
+	.get_max_state = devfreq_cooling_get_max_state,
+	.get_cur_state = devfreq_cooling_get_cur_state,
+	.set_cur_state = devfreq_cooling_set_cur_state,
+};
+
+/**
+ * devfreq_cooling_gen_tables() - Generate power and freq tables.
+ * @dfc: Pointer to devfreq cooling device.
+ *
+ * Generate power and frequency tables: the power table hold the
+ * device's maximum power usage at each cooling state (OPP).  The
+ * static and dynamic power using the appropriate voltage and
+ * frequency for the state, is acquired from the struct
+ * devfreq_cooling_power, and summed to make the maximum power draw.
+ *
+ * The frequency table holds the frequencies in descending order.
+ * That way its indexed by cooling device state.
+ *
+ * The tables are malloced, and pointers put in dfc.  They must be
+ * freed when unregistering the devfreq cooling device.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+static int devfreq_cooling_gen_tables(struct devfreq_cooling_device *dfc)
+{
+	struct devfreq *df = dfc->devfreq;
+	struct device *dev = df->dev.parent;
+	int ret, num_opps;
+	unsigned long freq;
+	u32 *power_table = NULL;
+	u32 *freq_table;
+	int i;
+
+	num_opps = dev_pm_opp_get_opp_count(dev);
+
+	if (dfc->power_ops) {
+		power_table = kcalloc(num_opps, sizeof(*power_table),
+				      GFP_KERNEL);
+		if (!power_table)
+			return -ENOMEM;
+	}
+
+	freq_table = kcalloc(num_opps, sizeof(*freq_table),
+			     GFP_KERNEL);
+	if (!freq_table) {
+		ret = -ENOMEM;
+		goto free_power_table;
+	}
+
+	for (i = 0, freq = ULONG_MAX; i < num_opps; i++, freq--) {
+		unsigned long power_dyn, voltage;
+		struct dev_pm_opp *opp;
+
+		rcu_read_lock();
+
+		opp = dev_pm_opp_find_freq_floor(dev, &freq);
+		if (IS_ERR(opp)) {
+			rcu_read_unlock();
+			ret = PTR_ERR(opp);
+			goto free_tables;
+		}
+
+		voltage = dev_pm_opp_get_voltage(opp) / 1000; /* mV */
+
+		rcu_read_unlock();
+
+		if (dfc->power_ops) {
+			power_dyn = get_dynamic_power(dfc, freq, voltage);
+
+			dev_dbg(dev, "Dynamic power table: %lu MHz @ %lu mV: %lu = %lu mW\n",
+				freq / 1000000, voltage, power_dyn, power_dyn);
+
+			power_table[i] = power_dyn;
+		}
+
+		freq_table[i] = freq;
+	}
+
+	if (dfc->power_ops)
+		dfc->power_table = power_table;
+
+	dfc->freq_table = freq_table;
+	dfc->freq_table_size = num_opps;
+
+	return 0;
+
+free_tables:
+	kfree(freq_table);
+free_power_table:
+	kfree(power_table);
+
+	return ret;
+}
+
+/**
+ * of_devfreq_cooling_register_power() - Register devfreq cooling device,
+ *                                      with OF and power information.
+ * @np:	Pointer to OF device_node.
+ * @df:	Pointer to devfreq device.
+ * @dfc_power:	Pointer to devfreq_cooling_power.
+ *
+ * Register a devfreq cooling device.  The available OPPs must be
+ * registered on the device.
+ *
+ * If @dfc_power is provided, the cooling device is registered with the
+ * power extensions.  For the power extensions to work correctly,
+ * devfreq should use the simple_ondemand governor, other governors
+ * are not currently supported.
+ */
+struct thermal_cooling_device *
+of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df,
+				  struct devfreq_cooling_power *dfc_power)
+{
+	struct thermal_cooling_device *cdev;
+	struct devfreq_cooling_device *dfc;
+	char dev_name[THERMAL_NAME_LENGTH];
+	int err;
+
+	dfc = kzalloc(sizeof(*dfc), GFP_KERNEL);
+	if (!dfc)
+		return ERR_PTR(-ENOMEM);
+
+	dfc->devfreq = df;
+
+	if (dfc_power) {
+		dfc->power_ops = dfc_power;
+
+		devfreq_cooling_ops.get_requested_power =
+			devfreq_cooling_get_requested_power;
+		devfreq_cooling_ops.state2power = devfreq_cooling_state2power;
+		devfreq_cooling_ops.power2state = devfreq_cooling_power2state;
+	}
+
+	err = devfreq_cooling_gen_tables(dfc);
+	if (err)
+		goto free_dfc;
+
+	err = get_idr(&devfreq_idr, &dfc->id);
+	if (err)
+		goto free_tables;
+
+	snprintf(dev_name, sizeof(dev_name), "thermal-devfreq-%d", dfc->id);
+
+	cdev = thermal_of_cooling_device_register(np, dev_name, dfc,
+						  &devfreq_cooling_ops);
+	if (IS_ERR(cdev)) {
+		err = PTR_ERR(cdev);
+		dev_err(df->dev.parent,
+			"Failed to register devfreq cooling device (%d)\n",
+			err);
+		goto release_idr;
+	}
+
+	dfc->cdev = cdev;
+
+	return cdev;
+
+release_idr:
+	release_idr(&devfreq_idr, dfc->id);
+free_tables:
+	kfree(dfc->power_table);
+	kfree(dfc->freq_table);
+free_dfc:
+	kfree(dfc);
+
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(of_devfreq_cooling_register_power);
+
+/**
+ * of_devfreq_cooling_register() - Register devfreq cooling device,
+ *                                with OF information.
+ * @np: Pointer to OF device_node.
+ * @df: Pointer to devfreq device.
+ */
+struct thermal_cooling_device *
+of_devfreq_cooling_register(struct device_node *np, struct devfreq *df)
+{
+	return of_devfreq_cooling_register_power(np, df, NULL);
+}
+EXPORT_SYMBOL_GPL(of_devfreq_cooling_register);
+
+/**
+ * devfreq_cooling_register() - Register devfreq cooling device.
+ * @df: Pointer to devfreq device.
+ */
+struct thermal_cooling_device *devfreq_cooling_register(struct devfreq *df)
+{
+	return of_devfreq_cooling_register(NULL, df);
+}
+EXPORT_SYMBOL_GPL(devfreq_cooling_register);
+
+/**
+ * devfreq_cooling_unregister() - Unregister devfreq cooling device.
+ * @dfc: Pointer to devfreq cooling device to unregister.
+ */
+void devfreq_cooling_unregister(struct thermal_cooling_device *cdev)
+{
+	struct devfreq_cooling_device *dfc;
+
+	if (!cdev)
+		return;
+
+	dfc = cdev->devdata;
+
+	thermal_cooling_device_unregister(dfc->cdev);
+	release_idr(&devfreq_idr, dfc->id);
+	kfree(dfc->power_table);
+	kfree(dfc->freq_table);
+
+	kfree(dfc);
+}
+EXPORT_SYMBOL_GPL(devfreq_cooling_unregister);
diff --git a/drivers/thermal/fair_share.c b/drivers/thermal/fair_share.c
index 6e0a3fbfae86..8c50b8d6afb7 100644
--- a/drivers/thermal/fair_share.c
+++ b/drivers/thermal/fair_share.c
@@ -59,13 +59,13 @@ static int get_trip_level(struct thermal_zone_device *tz)
 }
 
 static long get_target_state(struct thermal_zone_device *tz,
-		struct thermal_cooling_device *cdev, int weight, int level)
+		struct thermal_cooling_device *cdev, int percentage, int level)
 {
 	unsigned long max_state;
 
 	cdev->ops->get_max_state(cdev, &max_state);
 
-	return (long)(weight * level * max_state) / (100 * tz->trips);
+	return (long)(percentage * level * max_state) / (100 * tz->trips);
 }
 
 /**
@@ -77,7 +77,7 @@ static long get_target_state(struct thermal_zone_device *tz,
  *
  * Parameters used for Throttling:
  * P1. max_state: Maximum throttle state exposed by the cooling device.
- * P2. weight[i]/100:
+ * P2. percentage[i]/100:
  *	How 'effective' the 'i'th device is, in cooling the given zone.
  * P3. cur_trip_level/max_no_of_trips:
  *	This describes the extent to which the devices should be throttled.
@@ -88,28 +88,33 @@ static long get_target_state(struct thermal_zone_device *tz,
  */
 static int fair_share_throttle(struct thermal_zone_device *tz, int trip)
 {
-	const struct thermal_zone_params *tzp;
-	struct thermal_cooling_device *cdev;
 	struct thermal_instance *instance;
-	int i;
+	int total_weight = 0;
+	int total_instance = 0;
 	int cur_trip_level = get_trip_level(tz);
 
-	if (!tz->tzp || !tz->tzp->tbp)
-		return -EINVAL;
+	list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+		if (instance->trip != trip)
+			continue;
+
+		total_weight += instance->weight;
+		total_instance++;
+	}
 
-	tzp = tz->tzp;
+	list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+		int percentage;
+		struct thermal_cooling_device *cdev = instance->cdev;
 
-	for (i = 0; i < tzp->num_tbps; i++) {
-		if (!tzp->tbp[i].cdev)
+		if (instance->trip != trip)
 			continue;
 
-		cdev = tzp->tbp[i].cdev;
-		instance = get_thermal_instance(tz, cdev, trip);
-		if (!instance)
-			continue;
+		if (!total_weight)
+			percentage = 100 / total_instance;
+		else
+			percentage = (instance->weight * 100) / total_weight;
 
-		instance->target = get_target_state(tz, cdev,
-					tzp->tbp[i].weight, cur_trip_level);
+		instance->target = get_target_state(tz, cdev, percentage,
+						    cur_trip_level);
 
 		instance->cdev->updated = false;
 		thermal_cdev_update(cdev);
diff --git a/drivers/thermal/imx_thermal.c b/drivers/thermal/imx_thermal.c
index 2ccbc0788353..fde4c2876d14 100644
--- a/drivers/thermal/imx_thermal.c
+++ b/drivers/thermal/imx_thermal.c
@@ -306,7 +306,8 @@ static int imx_bind(struct thermal_zone_device *tz,
 
 	ret = thermal_zone_bind_cooling_device(tz, IMX_TRIP_PASSIVE, cdev,
 					       THERMAL_NO_LIMIT,
-					       THERMAL_NO_LIMIT);
+					       THERMAL_NO_LIMIT,
+					       THERMAL_WEIGHT_DEFAULT);
 	if (ret) {
 		dev_err(&tz->device,
 			"binding zone %s with cdev %s failed:%d\n",
diff --git a/drivers/thermal/of-thermal.c b/drivers/thermal/of-thermal.c
index 668fb1bdea9e..b295b2b6c191 100644
--- a/drivers/thermal/of-thermal.c
+++ b/drivers/thermal/of-thermal.c
@@ -58,6 +58,8 @@ struct __thermal_bind_params {
  * @mode: current thermal zone device mode (enabled/disabled)
  * @passive_delay: polling interval while passive cooling is activated
  * @polling_delay: zone polling interval
+ * @slope: slope of the temperature adjustment curve
+ * @offset: offset of the temperature adjustment curve
  * @ntrips: number of trip points
  * @trips: an array of trip points (0..ntrips - 1)
  * @num_tbps: number of thermal bind params
@@ -70,6 +72,8 @@ struct __thermal_zone {
 	enum thermal_device_mode mode;
 	int passive_delay;
 	int polling_delay;
+	int slope;
+	int offset;
 
 	/* trip data */
 	int ntrips;
@@ -227,7 +231,8 @@ static int of_thermal_bind(struct thermal_zone_device *thermal,
 			ret = thermal_zone_bind_cooling_device(thermal,
 						tbp->trip_id, cdev,
 						tbp->max,
-						tbp->min);
+						tbp->min,
+						tbp->usage);
 			if (ret)
 				return ret;
 		}
@@ -581,7 +586,7 @@ static int thermal_of_populate_bind_params(struct device_node *np,
 	u32 prop;
 
 	/* Default weight. Usage is optional */
-	__tbp->usage = 0;
+	__tbp->usage = THERMAL_WEIGHT_DEFAULT;
 	ret = of_property_read_u32(np, "contribution", &prop);
 	if (ret == 0)
 		__tbp->usage = prop;
@@ -715,7 +720,7 @@ static int thermal_of_populate_trip(struct device_node *np,
  * @np parameter and fills the read data into a __thermal_zone data structure
  * and return this pointer.
  *
- * TODO: Missing properties to parse: thermal-sensor-names and coefficients
+ * TODO: Missing properties to parse: thermal-sensor-names
  *
  * Return: On success returns a valid struct __thermal_zone,
  * otherwise, it returns a corresponding ERR_PTR(). Caller must
@@ -727,7 +732,7 @@ thermal_of_build_thermal_zone(struct device_node *np)
 	struct device_node *child = NULL, *gchild;
 	struct __thermal_zone *tz;
 	int ret, i;
-	u32 prop;
+	u32 prop, coef[2];
 
 	if (!np) {
 		pr_err("no thermal zone np\n");
@@ -752,6 +757,20 @@ thermal_of_build_thermal_zone(struct device_node *np)
 	}
 	tz->polling_delay = prop;
 
+	/*
+	 * REVIST: for now, the thermal framework supports only
+	 * one sensor per thermal zone. Thus, we are considering
+	 * only the first two values as slope and offset.
+	 */
+	ret = of_property_read_u32_array(np, "coefficients", coef, 2);
+	if (ret == 0) {
+		tz->slope = coef[0];
+		tz->offset = coef[1];
+	} else {
+		tz->slope = 1;
+		tz->offset = 0;
+	}
+
 	/* trips */
 	child = of_get_child_by_name(np, "trips");
 
@@ -865,6 +884,8 @@ int __init of_parse_thermal_zones(void)
 	for_each_child_of_node(np, child) {
 		struct thermal_zone_device *zone;
 		struct thermal_zone_params *tzp;
+		int i, mask = 0;
+		u32 prop;
 
 		/* Check whether child is enabled or not */
 		if (!of_device_is_available(child))
@@ -891,8 +912,18 @@ int __init of_parse_thermal_zones(void)
 		/* No hwmon because there might be hwmon drivers registering */
 		tzp->no_hwmon = true;
 
+		if (!of_property_read_u32(child, "sustainable-power", &prop))
+			tzp->sustainable_power = prop;
+
+		for (i = 0; i < tz->ntrips; i++)
+			mask |= 1 << i;
+
+		/* these two are left for temperature drivers to use */
+		tzp->slope = tz->slope;
+		tzp->offset = tz->offset;
+
 		zone = thermal_zone_device_register(child->name, tz->ntrips,
-						    0, tz,
+						    mask, tz,
 						    ops, tzp,
 						    tz->passive_delay,
 						    tz->polling_delay);
diff --git a/drivers/thermal/power_allocator.c b/drivers/thermal/power_allocator.c
new file mode 100644
index 000000000000..251676902869
--- /dev/null
+++ b/drivers/thermal/power_allocator.c
@@ -0,0 +1,544 @@
+/*
+ * A power allocator to manage temperature
+ *
+ * Copyright (C) 2014 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed "as is" WITHOUT ANY WARRANTY of any
+ * kind, whether express or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#define pr_fmt(fmt) "Power allocator: " fmt
+
+#include <linux/rculist.h>
+#include <linux/slab.h>
+#include <linux/thermal.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/thermal_power_allocator.h>
+
+#include "thermal_core.h"
+
+#define FRAC_BITS 10
+#define int_to_frac(x) ((x) << FRAC_BITS)
+#define frac_to_int(x) ((x) >> FRAC_BITS)
+
+/**
+ * mul_frac() - multiply two fixed-point numbers
+ * @x:	first multiplicand
+ * @y:	second multiplicand
+ *
+ * Return: the result of multiplying two fixed-point numbers.  The
+ * result is also a fixed-point number.
+ */
+static inline s64 mul_frac(s64 x, s64 y)
+{
+	return (x * y) >> FRAC_BITS;
+}
+
+/**
+ * div_frac() - divide two fixed-point numbers
+ * @x:	the dividend
+ * @y:	the divisor
+ *
+ * Return: the result of dividing two fixed-point numbers.  The
+ * result is also a fixed-point number.
+ */
+static inline s64 div_frac(s64 x, s64 y)
+{
+	return div_s64(x << FRAC_BITS, y);
+}
+
+/**
+ * struct power_allocator_params - parameters for the power allocator governor
+ * @err_integral:	accumulated error in the PID controller.
+ * @prev_err:	error in the previous iteration of the PID controller.
+ *		Used to calculate the derivative term.
+ * @trip_switch_on:	first passive trip point of the thermal zone.  The
+ *			governor switches on when this trip point is crossed.
+ * @trip_max_desired_temperature:	last passive trip point of the thermal
+ *					zone.  The temperature we are
+ *					controlling for.
+ */
+struct power_allocator_params {
+	s64 err_integral;
+	s32 prev_err;
+	int trip_switch_on;
+	int trip_max_desired_temperature;
+};
+
+/**
+ * pid_controller() - PID controller
+ * @tz:	thermal zone we are operating in
+ * @current_temp:	the current temperature in millicelsius
+ * @control_temp:	the target temperature in millicelsius
+ * @max_allocatable_power:	maximum allocatable power for this thermal zone
+ *
+ * This PID controller increases the available power budget so that the
+ * temperature of the thermal zone gets as close as possible to
+ * @control_temp and limits the power if it exceeds it.  k_po is the
+ * proportional term when we are overshooting, k_pu is the
+ * proportional term when we are undershooting.  integral_cutoff is a
+ * threshold below which we stop accumulating the error.  The
+ * accumulated error is only valid if the requested power will make
+ * the system warmer.  If the system is mostly idle, there's no point
+ * in accumulating positive error.
+ *
+ * Return: The power budget for the next period.
+ */
+static u32 pid_controller(struct thermal_zone_device *tz,
+			  unsigned long current_temp,
+			  unsigned long control_temp,
+			  u32 max_allocatable_power)
+{
+	s64 p, i, d, power_range;
+	s32 err, max_power_frac;
+	struct power_allocator_params *params = tz->governor_data;
+
+	max_power_frac = int_to_frac(max_allocatable_power);
+
+	err = ((s32)control_temp - (s32)current_temp);
+	err = int_to_frac(err);
+
+	/* Calculate the proportional term */
+	p = mul_frac(err < 0 ? tz->tzp->k_po : tz->tzp->k_pu, err);
+
+	/*
+	 * Calculate the integral term
+	 *
+	 * if the error is less than cut off allow integration (but
+	 * the integral is limited to max power)
+	 */
+	i = mul_frac(tz->tzp->k_i, params->err_integral);
+
+	if (err < int_to_frac(tz->tzp->integral_cutoff)) {
+		s64 i_next = i + mul_frac(tz->tzp->k_i, err);
+
+		if (abs64(i_next) < max_power_frac) {
+			i = i_next;
+			params->err_integral += err;
+		}
+	}
+
+	/*
+	 * Calculate the derivative term
+	 *
+	 * We do err - prev_err, so with a positive k_d, a decreasing
+	 * error (i.e. driving closer to the line) results in less
+	 * power being applied, slowing down the controller)
+	 */
+	d = mul_frac(tz->tzp->k_d, err - params->prev_err);
+	d = div_frac(d, tz->passive_delay);
+	params->prev_err = err;
+
+	power_range = p + i + d;
+
+	/* feed-forward the known sustainable dissipatable power */
+	power_range = tz->tzp->sustainable_power + frac_to_int(power_range);
+
+	power_range = clamp(power_range, (s64)0, (s64)max_allocatable_power);
+
+	trace_thermal_power_allocator_pid(tz, frac_to_int(err),
+					  frac_to_int(params->err_integral),
+					  frac_to_int(p), frac_to_int(i),
+					  frac_to_int(d), power_range);
+
+	return power_range;
+}
+
+/**
+ * divvy_up_power() - divvy the allocated power between the actors
+ * @req_power:	each actor's requested power
+ * @max_power:	each actor's maximum available power
+ * @num_actors:	size of the @req_power, @max_power and @granted_power's array
+ * @total_req_power: sum of @req_power
+ * @power_range:	total allocated power
+ * @granted_power:	output array: each actor's granted power
+ * @extra_actor_power:	an appropriately sized array to be used in the
+ *			function as temporary storage of the extra power given
+ *			to the actors
+ *
+ * This function divides the total allocated power (@power_range)
+ * fairly between the actors.  It first tries to give each actor a
+ * share of the @power_range according to how much power it requested
+ * compared to the rest of the actors.  For example, if only one actor
+ * requests power, then it receives all the @power_range.  If
+ * three actors each requests 1mW, each receives a third of the
+ * @power_range.
+ *
+ * If any actor received more than their maximum power, then that
+ * surplus is re-divvied among the actors based on how far they are
+ * from their respective maximums.
+ *
+ * Granted power for each actor is written to @granted_power, which
+ * should've been allocated by the calling function.
+ */
+static void divvy_up_power(u32 *req_power, u32 *max_power, int num_actors,
+			   u32 total_req_power, u32 power_range,
+			   u32 *granted_power, u32 *extra_actor_power)
+{
+	u32 extra_power, capped_extra_power;
+	int i;
+
+	/*
+	 * Prevent division by 0 if none of the actors request power.
+	 */
+	if (!total_req_power)
+		total_req_power = 1;
+
+	capped_extra_power = 0;
+	extra_power = 0;
+	for (i = 0; i < num_actors; i++) {
+		u64 req_range = req_power[i] * power_range;
+
+		granted_power[i] = DIV_ROUND_CLOSEST_ULL(req_range,
+							 total_req_power);
+
+		if (granted_power[i] > max_power[i]) {
+			extra_power += granted_power[i] - max_power[i];
+			granted_power[i] = max_power[i];
+		}
+
+		extra_actor_power[i] = max_power[i] - granted_power[i];
+		capped_extra_power += extra_actor_power[i];
+	}
+
+	if (!extra_power)
+		return;
+
+	/*
+	 * Re-divvy the reclaimed extra among actors based on
+	 * how far they are from the max
+	 */
+	extra_power = min(extra_power, capped_extra_power);
+	if (capped_extra_power > 0)
+		for (i = 0; i < num_actors; i++)
+			granted_power[i] += (extra_actor_power[i] *
+					extra_power) / capped_extra_power;
+}
+
+static int allocate_power(struct thermal_zone_device *tz,
+			  unsigned long current_temp,
+			  unsigned long control_temp)
+{
+	struct thermal_instance *instance;
+	struct power_allocator_params *params = tz->governor_data;
+	u32 *req_power, *max_power, *granted_power, *extra_actor_power;
+	u32 *weighted_req_power;
+	u32 total_req_power, max_allocatable_power, total_weighted_req_power;
+	u32 total_granted_power, power_range;
+	int i, num_actors, total_weight, ret = 0;
+	int trip_max_desired_temperature = params->trip_max_desired_temperature;
+
+	mutex_lock(&tz->lock);
+
+	num_actors = 0;
+	total_weight = 0;
+	list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+		if ((instance->trip == trip_max_desired_temperature) &&
+		    cdev_is_power_actor(instance->cdev)) {
+			num_actors++;
+			total_weight += instance->weight;
+		}
+	}
+
+	/*
+	 * We need to allocate five arrays of the same size:
+	 * req_power, max_power, granted_power, extra_actor_power and
+	 * weighted_req_power.  They are going to be needed until this
+	 * function returns.  Allocate them all in one go to simplify
+	 * the allocation and deallocation logic.
+	 */
+	BUILD_BUG_ON(sizeof(*req_power) != sizeof(*max_power));
+	BUILD_BUG_ON(sizeof(*req_power) != sizeof(*granted_power));
+	BUILD_BUG_ON(sizeof(*req_power) != sizeof(*extra_actor_power));
+	BUILD_BUG_ON(sizeof(*req_power) != sizeof(*weighted_req_power));
+	req_power = kcalloc(num_actors * 5, sizeof(*req_power), GFP_KERNEL);
+	if (!req_power) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
+	max_power = &req_power[num_actors];
+	granted_power = &req_power[2 * num_actors];
+	extra_actor_power = &req_power[3 * num_actors];
+	weighted_req_power = &req_power[4 * num_actors];
+
+	i = 0;
+	total_weighted_req_power = 0;
+	total_req_power = 0;
+	max_allocatable_power = 0;
+
+	list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+		int weight;
+		struct thermal_cooling_device *cdev = instance->cdev;
+
+		if (instance->trip != trip_max_desired_temperature)
+			continue;
+
+		if (!cdev_is_power_actor(cdev))
+			continue;
+
+		if (cdev->ops->get_requested_power(cdev, tz, &req_power[i]))
+			continue;
+
+		if (!total_weight)
+			weight = 1 << FRAC_BITS;
+		else
+			weight = instance->weight;
+
+		weighted_req_power[i] = frac_to_int(weight * req_power[i]);
+
+		if (power_actor_get_max_power(cdev, tz, &max_power[i]))
+			continue;
+
+		total_req_power += req_power[i];
+		max_allocatable_power += max_power[i];
+		total_weighted_req_power += weighted_req_power[i];
+
+		i++;
+	}
+
+	power_range = pid_controller(tz, current_temp, control_temp,
+				     max_allocatable_power);
+
+	divvy_up_power(weighted_req_power, max_power, num_actors,
+		       total_weighted_req_power, power_range, granted_power,
+		       extra_actor_power);
+
+	total_granted_power = 0;
+	i = 0;
+	list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+		if (instance->trip != trip_max_desired_temperature)
+			continue;
+
+		if (!cdev_is_power_actor(instance->cdev))
+			continue;
+
+		power_actor_set_power(instance->cdev, instance,
+				      granted_power[i]);
+		total_granted_power += granted_power[i];
+
+		i++;
+	}
+
+	trace_thermal_power_allocator(tz, req_power, total_req_power,
+				      granted_power, total_granted_power,
+				      num_actors, power_range,
+				      max_allocatable_power, current_temp,
+				      (s32)control_temp - (s32)current_temp);
+
+	kfree(req_power);
+unlock:
+	mutex_unlock(&tz->lock);
+
+	return ret;
+}
+
+static int get_governor_trips(struct thermal_zone_device *tz,
+			      struct power_allocator_params *params)
+{
+	int i, ret, last_passive;
+	bool found_first_passive;
+
+	found_first_passive = false;
+	last_passive = -1;
+	ret = -EINVAL;
+
+	for (i = 0; i < tz->trips; i++) {
+		enum thermal_trip_type type;
+
+		ret = tz->ops->get_trip_type(tz, i, &type);
+		if (ret)
+			return ret;
+
+		if (!found_first_passive) {
+			if (type == THERMAL_TRIP_PASSIVE) {
+				params->trip_switch_on = i;
+				found_first_passive = true;
+			}
+		} else if (type == THERMAL_TRIP_PASSIVE) {
+			last_passive = i;
+		} else {
+			break;
+		}
+	}
+
+	if (last_passive != -1) {
+		params->trip_max_desired_temperature = last_passive;
+		ret = 0;
+	} else {
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+static void reset_pid_controller(struct power_allocator_params *params)
+{
+	params->err_integral = 0;
+	params->prev_err = 0;
+}
+
+static void allow_maximum_power(struct thermal_zone_device *tz)
+{
+	struct thermal_instance *instance;
+	struct power_allocator_params *params = tz->governor_data;
+
+	list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+		if ((instance->trip != params->trip_max_desired_temperature) ||
+		    (!cdev_is_power_actor(instance->cdev)))
+			continue;
+
+		instance->target = 0;
+		instance->cdev->updated = false;
+		thermal_cdev_update(instance->cdev);
+	}
+}
+
+/**
+ * power_allocator_bind() - bind the power_allocator governor to a thermal zone
+ * @tz:	thermal zone to bind it to
+ *
+ * Check that the thermal zone is valid for this governor, that is, it
+ * has two thermal trips.  If so, initialize the PID controller
+ * parameters and bind it to the thermal zone.
+ *
+ * Return: 0 on success, -EINVAL if the trips were invalid or -ENOMEM
+ * if we ran out of memory.
+ */
+static int power_allocator_bind(struct thermal_zone_device *tz)
+{
+	int ret;
+	struct power_allocator_params *params;
+	unsigned long switch_on_temp, control_temp;
+	u32 temperature_threshold;
+
+	if (!tz->tzp || !tz->tzp->sustainable_power) {
+		dev_err(&tz->device,
+			"power_allocator: missing sustainable_power\n");
+		return -EINVAL;
+	}
+
+	params = kzalloc(sizeof(*params), GFP_KERNEL);
+	if (!params)
+		return -ENOMEM;
+
+	ret = get_governor_trips(tz, params);
+	if (ret) {
+		dev_err(&tz->device,
+			"thermal zone %s has wrong trip setup for power allocator\n",
+			tz->type);
+		goto free;
+	}
+
+	ret = tz->ops->get_trip_temp(tz, params->trip_switch_on,
+				     &switch_on_temp);
+	if (ret)
+		goto free;
+
+	ret = tz->ops->get_trip_temp(tz, params->trip_max_desired_temperature,
+				     &control_temp);
+	if (ret)
+		goto free;
+
+	temperature_threshold = control_temp - switch_on_temp;
+
+	tz->tzp->k_po = tz->tzp->k_po ?:
+		int_to_frac(tz->tzp->sustainable_power) / temperature_threshold;
+	tz->tzp->k_pu = tz->tzp->k_pu ?:
+		int_to_frac(2 * tz->tzp->sustainable_power) /
+		temperature_threshold;
+	tz->tzp->k_i = tz->tzp->k_i ?: int_to_frac(10) / 1000;
+	/*
+	 * The default for k_d and integral_cutoff is 0, so we can
+	 * leave them as they are.
+	 */
+
+	reset_pid_controller(params);
+
+	tz->governor_data = params;
+
+	return 0;
+
+free:
+	kfree(params);
+	return ret;
+}
+
+static void power_allocator_unbind(struct thermal_zone_device *tz)
+{
+	dev_dbg(&tz->device, "Unbinding from thermal zone %d\n", tz->id);
+	kfree(tz->governor_data);
+	tz->governor_data = NULL;
+}
+
+static int power_allocator_throttle(struct thermal_zone_device *tz, int trip)
+{
+	int ret;
+	unsigned long switch_on_temp, control_temp, current_temp;
+	struct power_allocator_params *params = tz->governor_data;
+
+	/*
+	 * We get called for every trip point but we only need to do
+	 * our calculations once
+	 */
+	if (trip != params->trip_max_desired_temperature)
+		return 0;
+
+	ret = thermal_zone_get_temp(tz, &current_temp);
+	if (ret) {
+		dev_warn(&tz->device, "Failed to get temperature: %d\n", ret);
+		return ret;
+	}
+
+	ret = tz->ops->get_trip_temp(tz, params->trip_switch_on,
+				     &switch_on_temp);
+	if (ret) {
+		dev_warn(&tz->device,
+			 "Failed to get switch on temperature: %d\n", ret);
+		return ret;
+	}
+
+	if (current_temp < switch_on_temp) {
+		tz->passive = 0;
+		reset_pid_controller(params);
+		allow_maximum_power(tz);
+		return 0;
+	}
+
+	tz->passive = 1;
+
+	ret = tz->ops->get_trip_temp(tz, params->trip_max_desired_temperature,
+				&control_temp);
+	if (ret) {
+		dev_warn(&tz->device,
+			 "Failed to get the maximum desired temperature: %d\n",
+			 ret);
+		return ret;
+	}
+
+	return allocate_power(tz, current_temp, control_temp);
+}
+
+static struct thermal_governor thermal_gov_power_allocator = {
+	.name		= "power_allocator",
+	.bind_to_tz	= power_allocator_bind,
+	.unbind_from_tz	= power_allocator_unbind,
+	.throttle	= power_allocator_throttle,
+};
+
+int thermal_gov_power_allocator_register(void)
+{
+	return thermal_register_governor(&thermal_gov_power_allocator);
+}
+
+void thermal_gov_power_allocator_unregister(void)
+{
+	thermal_unregister_governor(&thermal_gov_power_allocator);
+}
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index a61386ce41ee..20b115711a0c 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -78,6 +78,58 @@ static struct thermal_governor *__find_governor(const char *name)
 	return NULL;
 }
 
+/**
+ * bind_previous_governor() - bind the previous governor of the thermal zone
+ * @tz:		a valid pointer to a struct thermal_zone_device
+ * @failed_gov_name:	the name of the governor that failed to register
+ *
+ * Register the previous governor of the thermal zone after a new
+ * governor has failed to be bound.
+ */
+static void bind_previous_governor(struct thermal_zone_device *tz,
+				   const char *failed_gov_name)
+{
+	if (tz->governor && tz->governor->bind_to_tz) {
+		if (tz->governor->bind_to_tz(tz)) {
+			dev_err(&tz->device,
+				"governor %s failed to bind and the previous one (%s) failed to bind again, thermal zone %s has no governor\n",
+				failed_gov_name, tz->governor->name, tz->type);
+			tz->governor = NULL;
+		}
+	}
+}
+
+/**
+ * thermal_set_governor() - Switch to another governor
+ * @tz:		a valid pointer to a struct thermal_zone_device
+ * @new_gov:	pointer to the new governor
+ *
+ * Change the governor of thermal zone @tz.
+ *
+ * Return: 0 on success, an error if the new governor's bind_to_tz() failed.
+ */
+static int thermal_set_governor(struct thermal_zone_device *tz,
+				struct thermal_governor *new_gov)
+{
+	int ret = 0;
+
+	if (tz->governor && tz->governor->unbind_from_tz)
+		tz->governor->unbind_from_tz(tz);
+
+	if (new_gov && new_gov->bind_to_tz) {
+		ret = new_gov->bind_to_tz(tz);
+		if (ret) {
+			bind_previous_governor(tz, new_gov->name);
+
+			return ret;
+		}
+	}
+
+	tz->governor = new_gov;
+
+	return ret;
+}
+
 int thermal_register_governor(struct thermal_governor *governor)
 {
 	int err;
@@ -110,8 +162,15 @@ int thermal_register_governor(struct thermal_governor *governor)
 
 		name = pos->tzp->governor_name;
 
-		if (!strncasecmp(name, governor->name, THERMAL_NAME_LENGTH))
-			pos->governor = governor;
+		if (!strncasecmp(name, governor->name, THERMAL_NAME_LENGTH)) {
+			int ret;
+
+			ret = thermal_set_governor(pos, governor);
+			if (ret)
+				dev_err(&pos->device,
+					"Failed to set governor %s for thermal zone %s: %d\n",
+					governor->name, pos->type, ret);
+		}
 	}
 
 	mutex_unlock(&thermal_list_lock);
@@ -137,7 +196,7 @@ void thermal_unregister_governor(struct thermal_governor *governor)
 	list_for_each_entry(pos, &thermal_tz_list, node) {
 		if (!strncasecmp(pos->governor->name, governor->name,
 						THERMAL_NAME_LENGTH))
-			pos->governor = NULL;
+			thermal_set_governor(pos, NULL);
 	}
 
 	mutex_unlock(&thermal_list_lock);
@@ -221,7 +280,8 @@ static void print_bind_err_msg(struct thermal_zone_device *tz,
 
 static void __bind(struct thermal_zone_device *tz, int mask,
 			struct thermal_cooling_device *cdev,
-			unsigned long *limits)
+			unsigned long *limits,
+			unsigned int weight)
 {
 	int i, ret;
 
@@ -236,7 +296,8 @@ static void __bind(struct thermal_zone_device *tz, int mask,
 				upper = limits[i * 2 + 1];
 			}
 			ret = thermal_zone_bind_cooling_device(tz, i, cdev,
-							       upper, lower);
+							       upper, lower,
+							       weight);
 			if (ret)
 				print_bind_err_msg(tz, cdev, ret);
 		}
@@ -283,7 +344,8 @@ static void bind_cdev(struct thermal_cooling_device *cdev)
 				continue;
 			tzp->tbp[i].cdev = cdev;
 			__bind(pos, tzp->tbp[i].trip_mask, cdev,
-			       tzp->tbp[i].binding_limits);
+			       tzp->tbp[i].binding_limits,
+			       tzp->tbp[i].weight);
 		}
 	}
 
@@ -322,7 +384,8 @@ static void bind_tz(struct thermal_zone_device *tz)
 				continue;
 			tzp->tbp[i].cdev = pos;
 			__bind(tz, tzp->tbp[i].trip_mask, pos,
-			       tzp->tbp[i].binding_limits);
+			       tzp->tbp[i].binding_limits,
+			       tzp->tbp[i].weight);
 		}
 	}
 exit:
@@ -737,7 +800,8 @@ passive_store(struct device *dev, struct device_attribute *attr,
 				thermal_zone_bind_cooling_device(tz,
 						THERMAL_TRIPS_NONE, cdev,
 						THERMAL_NO_LIMIT,
-						THERMAL_NO_LIMIT);
+						THERMAL_NO_LIMIT,
+						THERMAL_WEIGHT_DEFAULT);
 		}
 		mutex_unlock(&thermal_list_lock);
 		if (!tz->passive_delay)
@@ -789,8 +853,9 @@ policy_store(struct device *dev, struct device_attribute *attr,
 	if (!gov)
 		goto exit;
 
-	tz->governor = gov;
-	ret = count;
+	ret = thermal_set_governor(tz, gov);
+	if (!ret)
+		ret = count;
 
 exit:
 	mutex_unlock(&tz->lock);
@@ -834,6 +899,158 @@ emul_temp_store(struct device *dev, struct device_attribute *attr,
 static DEVICE_ATTR(emul_temp, S_IWUSR, NULL, emul_temp_store);
 #endif/*CONFIG_THERMAL_EMULATION*/
 
+static ssize_t
+sustainable_power_show(struct device *dev, struct device_attribute *devattr,
+		       char *buf)
+{
+	struct thermal_zone_device *tz = to_thermal_zone(dev);
+
+	if (tz->tzp)
+		return sprintf(buf, "%u\n", tz->tzp->sustainable_power);
+	else
+		return -EIO;
+}
+
+static ssize_t
+sustainable_power_store(struct device *dev, struct device_attribute *devattr,
+			const char *buf, size_t count)
+{
+	struct thermal_zone_device *tz = to_thermal_zone(dev);
+	u32 sustainable_power;
+
+	if (!tz->tzp)
+		return -EIO;
+
+	if (kstrtou32(buf, 10, &sustainable_power))
+		return -EINVAL;
+
+	tz->tzp->sustainable_power = sustainable_power;
+
+	return count;
+}
+static DEVICE_ATTR(sustainable_power, S_IWUSR | S_IRUGO, sustainable_power_show,
+		sustainable_power_store);
+
+#define create_s32_tzp_attr(name)					\
+	static ssize_t							\
+	name##_show(struct device *dev, struct device_attribute *devattr, \
+		char *buf)						\
+	{								\
+	struct thermal_zone_device *tz = to_thermal_zone(dev);		\
+									\
+	if (tz->tzp)							\
+		return sprintf(buf, "%u\n", tz->tzp->name);		\
+	else								\
+		return -EIO;						\
+	}								\
+									\
+	static ssize_t							\
+	name##_store(struct device *dev, struct device_attribute *devattr, \
+		const char *buf, size_t count)				\
+	{								\
+		struct thermal_zone_device *tz = to_thermal_zone(dev);	\
+		s32 value;						\
+									\
+		if (!tz->tzp)						\
+			return -EIO;					\
+									\
+		if (kstrtos32(buf, 10, &value))				\
+			return -EINVAL;					\
+									\
+		tz->tzp->name = value;					\
+									\
+		return count;						\
+	}								\
+	static DEVICE_ATTR(name, S_IWUSR | S_IRUGO, name##_show, name##_store)
+
+create_s32_tzp_attr(k_po);
+create_s32_tzp_attr(k_pu);
+create_s32_tzp_attr(k_i);
+create_s32_tzp_attr(k_d);
+create_s32_tzp_attr(integral_cutoff);
+create_s32_tzp_attr(slope);
+create_s32_tzp_attr(offset);
+#undef create_s32_tzp_attr
+
+static struct device_attribute *dev_tzp_attrs[] = {
+	&dev_attr_sustainable_power,
+	&dev_attr_k_po,
+	&dev_attr_k_pu,
+	&dev_attr_k_i,
+	&dev_attr_k_d,
+	&dev_attr_integral_cutoff,
+	&dev_attr_slope,
+	&dev_attr_offset,
+};
+
+static int create_tzp_attrs(struct device *dev)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(dev_tzp_attrs); i++) {
+		int ret;
+		struct device_attribute *dev_attr = dev_tzp_attrs[i];
+
+		ret = device_create_file(dev, dev_attr);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * power_actor_get_max_power() - get the maximum power that a cdev can consume
+ * @cdev:	pointer to &thermal_cooling_device
+ * @tz:		a valid thermal zone device pointer
+ * @max_power:	pointer in which to store the maximum power
+ *
+ * Calculate the maximum power consumption in milliwats that the
+ * cooling device can currently consume and store it in @max_power.
+ *
+ * Return: 0 on success, -EINVAL if @cdev doesn't support the
+ * power_actor API or -E* on other error.
+ */
+int power_actor_get_max_power(struct thermal_cooling_device *cdev,
+			      struct thermal_zone_device *tz, u32 *max_power)
+{
+	if (!cdev_is_power_actor(cdev))
+		return -EINVAL;
+
+	return cdev->ops->state2power(cdev, tz, 0, max_power);
+}
+
+/**
+ * power_actor_set_power() - limit the maximum power that a cooling device can consume
+ * @cdev:	pointer to &thermal_cooling_device
+ * @instance:	thermal instance to update
+ * @power:	the power in milliwatts
+ *
+ * Set the cooling device to consume at most @power milliwatts.
+ *
+ * Return: 0 on success, -EINVAL if the cooling device does not
+ * implement the power actor API or -E* for other failures.
+ */
+int power_actor_set_power(struct thermal_cooling_device *cdev,
+			  struct thermal_instance *instance, u32 power)
+{
+	unsigned long state;
+	int ret;
+
+	if (!cdev_is_power_actor(cdev))
+		return -EINVAL;
+
+	ret = cdev->ops->power2state(cdev, instance->tz, power, &state);
+	if (ret)
+		return ret;
+
+	instance->target = state;
+	cdev->updated = false;
+	thermal_cdev_update(cdev);
+
+	return 0;
+}
+
 static DEVICE_ATTR(type, 0444, type_show, NULL);
 static DEVICE_ATTR(temp, 0444, temp_show, NULL);
 static DEVICE_ATTR(mode, 0644, mode_show, mode_store);
@@ -941,6 +1158,34 @@ static const struct attribute_group *cooling_device_attr_groups[] = {
 	NULL,
 };
 
+static ssize_t
+thermal_cooling_device_weight_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct thermal_instance *instance;
+
+	instance = container_of(attr, struct thermal_instance, weight_attr);
+
+	return sprintf(buf, "%d\n", instance->weight);
+}
+
+static ssize_t
+thermal_cooling_device_weight_store(struct device *dev,
+				    struct device_attribute *attr,
+				    const char *buf, size_t count)
+{
+	struct thermal_instance *instance;
+	int ret, weight;
+
+	ret = kstrtoint(buf, 0, &weight);
+	if (ret)
+		return ret;
+
+	instance = container_of(attr, struct thermal_instance, weight_attr);
+	instance->weight = weight;
+
+	return count;
+}
 /* Device management */
 
 /**
@@ -955,6 +1200,9 @@ static const struct attribute_group *cooling_device_attr_groups[] = {
  * @lower:	the Minimum cooling state can be used for this trip point.
  *		THERMAL_NO_LIMIT means no lower limit,
  *		and the cooling device can be in cooling state 0.
+ * @weight:	The weight of the cooling device to be bound to the
+ *		thermal zone. Use THERMAL_WEIGHT_DEFAULT for the
+ *		default value
  *
  * This interface function bind a thermal cooling device to the certain trip
  * point of a thermal zone device.
@@ -965,7 +1213,8 @@ static const struct attribute_group *cooling_device_attr_groups[] = {
 int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
 				     int trip,
 				     struct thermal_cooling_device *cdev,
-				     unsigned long upper, unsigned long lower)
+				     unsigned long upper, unsigned long lower,
+				     unsigned int weight)
 {
 	struct thermal_instance *dev;
 	struct thermal_instance *pos;
@@ -1010,6 +1259,7 @@ int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
 	dev->upper = upper;
 	dev->lower = lower;
 	dev->target = THERMAL_NO_TARGET;
+	dev->weight = weight;
 
 	result = get_idr(&tz->idr, &tz->lock, &dev->id);
 	if (result)
@@ -1030,6 +1280,16 @@ int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
 	if (result)
 		goto remove_symbol_link;
 
+	sprintf(dev->weight_attr_name, "cdev%d_weight", dev->id);
+	sysfs_attr_init(&dev->weight_attr.attr);
+	dev->weight_attr.attr.name = dev->weight_attr_name;
+	dev->weight_attr.attr.mode = S_IWUSR | S_IRUGO;
+	dev->weight_attr.show = thermal_cooling_device_weight_show;
+	dev->weight_attr.store = thermal_cooling_device_weight_store;
+	result = device_create_file(&tz->device, &dev->weight_attr);
+	if (result)
+		goto remove_trip_file;
+
 	mutex_lock(&tz->lock);
 	mutex_lock(&cdev->lock);
 	list_for_each_entry(pos, &tz->thermal_instances, tz_node)
@@ -1048,6 +1308,8 @@ int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
 	if (!result)
 		return 0;
 
+	device_remove_file(&tz->device, &dev->weight_attr);
+remove_trip_file:
 	device_remove_file(&tz->device, &dev->attr);
 remove_symbol_link:
 	sysfs_remove_link(&tz->device.kobj, dev->name);
@@ -1409,7 +1671,8 @@ static int create_trip_attrs(struct thermal_zone_device *tz, int mask)
 						tz->trip_temp_attrs[indx].name;
 		tz->trip_temp_attrs[indx].attr.attr.mode = S_IRUGO;
 		tz->trip_temp_attrs[indx].attr.show = trip_point_temp_show;
-		if (mask & (1 << indx)) {
+		if (IS_ENABLED(CONFIG_THERMAL_WRITABLE_TRIPS) &&
+		    mask & (1 << indx)) {
 			tz->trip_temp_attrs[indx].attr.attr.mode |= S_IWUSR;
 			tz->trip_temp_attrs[indx].attr.store =
 							trip_point_temp_store;
@@ -1486,7 +1749,7 @@ static void remove_trip_attrs(struct thermal_zone_device *tz)
 struct thermal_zone_device *thermal_zone_device_register(const char *type,
 	int trips, int mask, void *devdata,
 	struct thermal_zone_device_ops *ops,
-	const struct thermal_zone_params *tzp,
+	struct thermal_zone_params *tzp,
 	int passive_delay, int polling_delay)
 {
 	struct thermal_zone_device *tz;
@@ -1495,6 +1758,7 @@ struct thermal_zone_device *thermal_zone_device_register(const char *type,
 	int result;
 	int count;
 	int passive = 0;
+	struct thermal_governor *governor;
 
 	if (type && strlen(type) >= THERMAL_NAME_LENGTH)
 		return ERR_PTR(-EINVAL);
@@ -1589,13 +1853,24 @@ struct thermal_zone_device *thermal_zone_device_register(const char *type,
 	if (result)
 		goto unregister;
 
+	/* Add thermal zone params */
+	result = create_tzp_attrs(&tz->device);
+	if (result)
+		goto unregister;
+
 	/* Update 'this' zone's governor information */
 	mutex_lock(&thermal_governor_lock);
 
 	if (tz->tzp)
-		tz->governor = __find_governor(tz->tzp->governor_name);
+		governor = __find_governor(tz->tzp->governor_name);
 	else
-		tz->governor = def_governor;
+		governor = def_governor;
+
+	result = thermal_set_governor(tz, governor);
+	if (result) {
+		mutex_unlock(&thermal_governor_lock);
+		goto unregister;
+	}
 
 	mutex_unlock(&thermal_governor_lock);
 
@@ -1687,7 +1962,7 @@ void thermal_zone_device_unregister(struct thermal_zone_device *tz)
 		device_remove_file(&tz->device, &dev_attr_mode);
 	device_remove_file(&tz->device, &dev_attr_policy);
 	remove_trip_attrs(tz);
-	tz->governor = NULL;
+	thermal_set_governor(tz, NULL);
 
 	thermal_remove_hwmon_sysfs(tz);
 	release_idr(&thermal_tz_idr, &thermal_idr_lock, tz->id);
@@ -1843,7 +2118,11 @@ static int __init thermal_register_governors(void)
 	if (result)
 		return result;
 
-	return thermal_gov_user_space_register();
+	result = thermal_gov_user_space_register();
+	if (result)
+		return result;
+
+	return thermal_gov_power_allocator_register();
 }
 
 static void thermal_unregister_governors(void)
@@ -1852,6 +2131,7 @@ static void thermal_unregister_governors(void)
 	thermal_gov_fair_share_unregister();
 	thermal_gov_bang_bang_unregister();
 	thermal_gov_user_space_unregister();
+	thermal_gov_power_allocator_unregister();
 }
 
 static int thermal_pm_notify(struct notifier_block *nb,
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index dce86ee8e9d7..749d41abfbab 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -47,8 +47,11 @@ struct thermal_instance {
 	unsigned long target;	/* expected cooling state */
 	char attr_name[THERMAL_NAME_LENGTH];
 	struct device_attribute attr;
+	char weight_attr_name[THERMAL_NAME_LENGTH];
+	struct device_attribute weight_attr;
 	struct list_head tz_node; /* node in tz->thermal_instances */
 	struct list_head cdev_node; /* node in cdev->thermal_instances */
+	unsigned int weight; /* The weight of the cooling device */
 };
 
 int thermal_register_governor(struct thermal_governor *);
@@ -86,6 +89,14 @@ static inline int thermal_gov_user_space_register(void) { return 0; }
 static inline void thermal_gov_user_space_unregister(void) {}
 #endif /* CONFIG_THERMAL_GOV_USER_SPACE */
 
+#ifdef CONFIG_THERMAL_GOV_POWER_ALLOCATOR
+int thermal_gov_power_allocator_register(void);
+void thermal_gov_power_allocator_unregister(void);
+#else
+static inline int thermal_gov_power_allocator_register(void) { return 0; }
+static inline void thermal_gov_power_allocator_unregister(void) {}
+#endif /* CONFIG_THERMAL_GOV_POWER_ALLOCATOR */
+
 /* device tree support */
 #ifdef CONFIG_THERMAL_OF
 int of_parse_thermal_zones(void);
diff --git a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
index a38c1756442a..cb45e729adb5 100644
--- a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
+++ b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
@@ -146,7 +146,8 @@ static int ti_thermal_bind(struct thermal_zone_device *thermal,
 	return thermal_zone_bind_cooling_device(thermal, 0, cdev,
 	/* bind with min and max states defined by cpu_cooling */
 						THERMAL_NO_LIMIT,
-						THERMAL_NO_LIMIT);
+						THERMAL_NO_LIMIT,
+						THERMAL_WEIGHT_DEFAULT);
 }
 
 /* Unbind callback functions for thermal zone */
diff --git a/drivers/uwb/uwb-debug.c b/drivers/uwb/uwb-debug.c
index 0b1e5a9449b5..991374b13571 100644
--- a/drivers/uwb/uwb-debug.c
+++ b/drivers/uwb/uwb-debug.c
@@ -55,7 +55,7 @@
 struct uwb_dbg {
 	struct uwb_pal pal;
 
-	u32 accept;
+	bool accept;
 	struct list_head rsvs;
 
 	struct dentry *root_d;
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 7d092ddc8119..454017928ed0 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -21,7 +21,7 @@ config VFIO_VIRQFD
 menuconfig VFIO
 	tristate "VFIO Non-Privileged userspace driver framework"
 	depends on IOMMU_API
-	select VFIO_IOMMU_TYPE1 if (X86 || S390 || ARM_SMMU)
+	select VFIO_IOMMU_TYPE1 if (X86 || S390 || ARM_SMMU || ARM_SMMU_V3)
 	select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES)
 	select VFIO_SPAPR_EEH if (PPC_POWERNV || PPC_PSERIES)
 	select ANON_INODES
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index c0f4ab83aaa8..c2337f80b2d9 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -1059,19 +1059,21 @@ struct vfio_devices {
 static int vfio_pci_get_devs(struct pci_dev *pdev, void *data)
 {
 	struct vfio_devices *devs = data;
-	struct pci_driver *pci_drv = ACCESS_ONCE(pdev->driver);
-
-	if (pci_drv != &vfio_pci_driver)
-		return -EBUSY;
+	struct vfio_device *device;
 
 	if (devs->cur_index == devs->max_index)
 		return -ENOSPC;
 
-	devs->devices[devs->cur_index] = vfio_device_get_from_dev(&pdev->dev);
-	if (!devs->devices[devs->cur_index])
+	device = vfio_device_get_from_dev(&pdev->dev);
+	if (!device)
 		return -EINVAL;
 
-	devs->cur_index++;
+	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
+		vfio_device_put(device);
+		return -EBUSY;
+	}
+
+	devs->devices[devs->cur_index++] = device;
 	return 0;
 }
 
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index e1278fe04b1e..2fb29dfeffbd 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -661,18 +661,29 @@ int vfio_add_group_dev(struct device *dev,
 EXPORT_SYMBOL_GPL(vfio_add_group_dev);
 
 /**
- * Get a reference to the vfio_device for a device that is known to
- * be bound to a vfio driver.  The driver implicitly holds a
- * vfio_device reference between vfio_add_group_dev and
- * vfio_del_group_dev.  We can therefore use drvdata to increment
- * that reference from the struct device.  This additional
- * reference must be released by calling vfio_device_put.
+ * Get a reference to the vfio_device for a device.  Even if the
+ * caller thinks they own the device, they could be racing with a
+ * release call path, so we can't trust drvdata for the shortcut.
+ * Go the long way around, from the iommu_group to the vfio_group
+ * to the vfio_device.
  */
 struct vfio_device *vfio_device_get_from_dev(struct device *dev)
 {
-	struct vfio_device *device = dev_get_drvdata(dev);
+	struct iommu_group *iommu_group;
+	struct vfio_group *group;
+	struct vfio_device *device;
+
+	iommu_group = iommu_group_get(dev);
+	if (!iommu_group)
+		return NULL;
 
-	vfio_device_get(device);
+	group = vfio_group_get_from_iommu(iommu_group);
+	iommu_group_put(iommu_group);
+	if (!group)
+		return NULL;
+
+	device = vfio_group_get_device(group, dev);
+	vfio_group_put(group);
 
 	return device;
 }
diff --git a/drivers/vfio/vfio_spapr_eeh.c b/drivers/vfio/vfio_spapr_eeh.c
index 5fa42db769ee..38edeb4729a9 100644
--- a/drivers/vfio/vfio_spapr_eeh.c
+++ b/drivers/vfio/vfio_spapr_eeh.c
@@ -85,6 +85,16 @@ long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
 		case VFIO_EEH_PE_CONFIGURE:
 			ret = eeh_pe_configure(pe);
 			break;
+		case VFIO_EEH_PE_INJECT_ERR:
+			minsz = offsetofend(struct vfio_eeh_pe_op, err.mask);
+			if (op.argsz < minsz)
+				return -EINVAL;
+			if (copy_from_user(&op, (void __user *)arg, minsz))
+				return -EFAULT;
+
+			ret = eeh_pe_inject_err(pe, op.err.type, op.err.func,
+						op.err.addr, op.err.mask);
+			break;
 		default:
 			ret = -EINVAL;
 		}