author     Tuukka Tikkanen <tuukka.tikkanen@linaro.org>  2013-08-01 11:34:39 +0300
committer  Vincent Guittot <vincent.guittot@linaro.org>  2013-10-01 10:52:12 +0200
commit     aac68ec55a43dbca8753dd7c71e88b746f4d9777 (patch)
tree       f51bc9e15b37047fea286ceb99cc36727b762b28
parent     ea2b9345119fbe2de30c737fe5fd4a22129a0b4a (diff)

Cpuidle: Add a new scheduled governor (sched-cpuidle-consolidation-v1)
This patch introduces a new cpuidle governor. The governor selects a state
based on a precomputed idle period length and records the chosen state for
use by the scheduler. Also included is the additional code in the kernel
scheduler that performs the precomputation.

This patch is work in progress and is made available for informational
purposes only. Its purpose is to provoke comments and discussion. The patch
is not intended to be committed to any tree as-is and certainly no-one
should attempt to upstream it anywhere. If this patch (in its current form)
is part of any such upstreaming proposal by mistake, please NAK it. :)

Signed-off-by: Tuukka Tikkanen <tuukka.tikkanen@linaro.org>
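---
For reviewers, a condensed userspace sketch of the data flow this patch sets
up may help before reading the diff: the scheduler fills a per-CPU struct
sched_pm with the time until the next timer, a predicted idle length and the
maximum tolerable wakeup latency; the governor reads those fields in its
select() callback and reports the measured idle time back through reflect().
The lookup rule below is only an illustrative stand-in for
cpuidle_cstate_lookup(), and the state table and numbers are invented.

/*
 * Illustration only (not part of the patch): userspace sketch of the
 * scheduler <-> governor data flow introduced below.
 */
#include <stdio.h>

struct idle_state {
	const char *name;
	unsigned int exit_latency;     /* us to wake back up */
	unsigned int target_residency; /* us needed to be worth entering */
};

/* Mirrors the per-CPU record the patch adds to struct sched_pm. */
struct sched_pm {
	unsigned int idle_time_until_timer; /* hard bound from next timer */
	unsigned int idle_length_estimate;  /* predictor output */
	int idle_max_latency;               /* PM QoS latency limit */
};

static const struct idle_state states[] = {
	{ "WFI",       1,    1 },
	{ "retention", 100,  300 },
	{ "power-off", 1200, 5000 },
};

/* Deepest state that respects both the latency limit and the estimate
 * (simplified stand-in for cpuidle_cstate_lookup()). */
static int pick_state(const struct sched_pm *pm)
{
	int i, chosen = 0;

	for (i = 1; i < 3; i++) {
		if (states[i].exit_latency > (unsigned int)pm->idle_max_latency)
			break;
		if (states[i].target_residency > pm->idle_length_estimate)
			break;
		chosen = i;
	}
	return chosen;
}

/* Mirrors scheduled_reflect(): strip the exit latency from the measured
 * residency and clamp to the known upper bound given by the next timer. */
static unsigned int reflect(const struct sched_pm *pm, int state,
			    unsigned int measured_us)
{
	if (measured_us > states[state].exit_latency)
		measured_us -= states[state].exit_latency;
	if (measured_us > pm->idle_time_until_timer)
		measured_us = pm->idle_time_until_timer;
	return measured_us; /* value handed to the wakeup predictor */
}

int main(void)
{
	/* What setup_scheduled_cpuidle() would have computed (made-up values) */
	struct sched_pm pm = {
		.idle_time_until_timer = 10000,
		.idle_length_estimate  = 4000,
		.idle_max_latency      = 2000,
	};
	int state = pick_state(&pm);

	printf("selected state: %s\n", states[state].name);
	printf("reported idle time: %u us\n", reflect(&pm, state, 3600));
	return 0;
}

Compiled with any C compiler, the example selects the "retention" state and
reports 3500 us back to the predictor.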
-rw-r--r--  drivers/cpuidle/Kconfig                     |  15
-rw-r--r--  drivers/cpuidle/governors/Makefile          |   1
-rw-r--r--  drivers/cpuidle/governors/scheduled.c       | 113
-rw-r--r--  drivers/cpuidle/governors/wakeup_predict.c  |   7
-rw-r--r--  include/linux/cpuidle_scheduled.h           |  21
-rw-r--r--  include/linux/sched.h                       |  12
-rw-r--r--  kernel/sched/Makefile                       |   1
-rw-r--r--  kernel/sched/idle_coop.c                    | 128
-rw-r--r--  kernel/sched/idle_task.c                    |   1
-rw-r--r--  kernel/sched/sched.h                        |   6
10 files changed, 300 insertions(+), 5 deletions(-)
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
index c711697f1343..3ca755653589 100644
--- a/drivers/cpuidle/Kconfig
+++ b/drivers/cpuidle/Kconfig
@@ -4,7 +4,7 @@ config CPU_IDLE
bool "CPU idle PM support"
default y if ACPI || PPC_PSERIES
select CPU_IDLE_GOV_LADDER if (!NO_HZ && !NO_HZ_IDLE)
- select CPU_IDLE_GOV_MENU if (NO_HZ || NO_HZ_IDLE)
+ select CPU_IDLE_GOV_MENU if ((NO_HZ || NO_HZ_IDLE) && !CPU_IDLE_GOV_SCHEDULED)
help
CPU idle is a generic framework for supporting software-controlled
idle processor power management. It includes modular cross-platform
@@ -32,6 +32,19 @@ config CPU_IDLE_GOV_MENU
select CPU_IDLE_GOV_SHARED_CSTATE_LOOKUP
default y
+if SMP
+config CPU_IDLE_GOV_SCHEDULED
+ bool "Scheduled governor"
+ select CPU_IDLE_GOV_SHARED_PREDICTOR
+ select CPU_IDLE_GOV_SHARED_CSTATE_LOOKUP
+ default n
+ help
+ A governor that selects an idle state based on timing constraints
+ set by another part of the kernel. The state selection is recorded
+ in a variable visible to other parts of the kernel so all cores
+ may be considered when making scheduling decisions.
+endif
+
config CPU_IDLE_GOV_SHARED_PREDICTOR
def_bool n
diff --git a/drivers/cpuidle/governors/Makefile b/drivers/cpuidle/governors/Makefile
index 0e02d5c16720..1b3e2d6fd900 100644
--- a/drivers/cpuidle/governors/Makefile
+++ b/drivers/cpuidle/governors/Makefile
@@ -4,5 +4,6 @@
obj-$(CONFIG_CPU_IDLE_GOV_LADDER) += ladder.o
obj-$(CONFIG_CPU_IDLE_GOV_MENU) += menu.o
+obj-$(CONFIG_CPU_IDLE_GOV_SCHEDULED) += scheduled.o
obj-$(CONFIG_CPU_IDLE_GOV_SHARED_PREDICTOR) += wakeup_predict.o
obj-$(CONFIG_CPU_IDLE_GOV_SHARED_CSTATE_LOOKUP) += cstate_lookup.o
diff --git a/drivers/cpuidle/governors/scheduled.c b/drivers/cpuidle/governors/scheduled.c
new file mode 100644
index 000000000000..b597bd267932
--- /dev/null
+++ b/drivers/cpuidle/governors/scheduled.c
@@ -0,0 +1,113 @@
+/*
+ * scheduled.c - A governor that selects idle state based on external input
+ *
+ * Copyright 2013 Linaro Limited
+ * Author:
+ * Tuukka Tikkanen <tuukka.tikkanen@linaro.org>
+ *
+ * This code is licenced under the GPL version 2 as described
+ * in the COPYING file that accompanies the Linux Kernel.
+ */
+
+#include <linux/kernel.h>
+#include <linux/cpuidle_scheduled.h>
+#include <linux/cpuidle.h>
+#include <linux/pm_qos.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+
+/**
+ * scheduled_select - selects the next idle state to enter
+ * @drv: cpuidle driver containing state data
+ * @dev: the CPU
+ */
+static int scheduled_select(struct cpuidle_driver *drv,
+ struct cpuidle_device *dev)
+{
+ int state;
+ struct sched_pm *pmdata = this_cpu_ptr(&sched_stat);
+
+ state = cpuidle_cstate_lookup(drv, dev,
+ pmdata->idle_time_until_timer,
+ pmdata->idle_length_estimate,
+ pmdata->idle_max_latency,
+ NULL);
+ pmdata->idle_current_state = &drv->states[state];
+
+ return state;
+}
+
+/**
+ * scheduled_reflect - records the actual idle period length
+ * @dev: the CPU
+ * @index: the index of actual entered state
+ */
+static void scheduled_reflect(struct cpuidle_device *dev, int index)
+{
+ unsigned int last_idle_us;
+ struct sched_pm *pmdata = this_cpu_ptr(&sched_stat);
+ unsigned int timer_limit = pmdata->idle_time_until_timer;
+ struct cpuidle_state *state = pmdata->idle_current_state;
+
+ if (unlikely(!(state->flags & CPUIDLE_FLAG_TIME_VALID))) {
+ last_idle_us = timer_limit;
+ } else {
+ last_idle_us = cpuidle_get_last_residency(dev);
+ if (last_idle_us > state->exit_latency)
+ last_idle_us -= state->exit_latency;
+ if (last_idle_us > timer_limit)
+ last_idle_us = timer_limit;
+ }
+
+ cpuidle_scheduled_result(state, last_idle_us);
+
+ pmdata->idle_current_state = NULL;
+}
+
+
+/**
+ * scheduled_enable_device - reset per cpu variables after hotplug
+ * @drv: cpuidle driver
+ * @dev: the CPU
+ */
+static int scheduled_enable_device(struct cpuidle_driver *drv,
+ struct cpuidle_device *dev)
+{
+ struct sched_pm *pmdata = &per_cpu(sched_stat, dev->cpu);
+
+ pmdata->idle_time_until_timer = UINT_MAX;
+ pmdata->idle_length_estimate = UINT_MAX;
+ pmdata->idle_max_latency = INT_MAX;
+ pmdata->idle_current_state = NULL;
+
+ return 0;
+}
+
+static struct cpuidle_governor scheduled_governor = {
+ .name = "scheduled",
+ .rating = 100,
+ .enable = scheduled_enable_device,
+ .select = scheduled_select,
+ .reflect = scheduled_reflect,
+ .owner = THIS_MODULE,
+};
+
+/**
+ * init_scheduled_idle_gov - initializes the governor
+ */
+static int __init init_scheduled_idle_gov(void)
+{
+ return cpuidle_register_governor(&scheduled_governor);
+}
+
+/**
+ * exit_scheduled_idle_gov - exits the governor
+ */
+static void __exit exit_scheduled_idle_gov(void)
+{
+ cpuidle_unregister_governor(&scheduled_governor);
+}
+
+MODULE_LICENSE("GPL");
+module_init(init_scheduled_idle_gov);
+module_exit(exit_scheduled_idle_gov);
diff --git a/drivers/cpuidle/governors/wakeup_predict.c b/drivers/cpuidle/governors/wakeup_predict.c
index 8b9872f9dc41..7f0417835c5c 100644
--- a/drivers/cpuidle/governors/wakeup_predict.c
+++ b/drivers/cpuidle/governors/wakeup_predict.c
@@ -256,6 +256,13 @@ void wakeup_predictor_update(struct wakeup_predictor *pred,
pred->correction_factor[bucket] = new_factor;
+/*
+ * Hack for testing: don't update pattern data if no I/O
+ * This needs a more proper fix if this turns out to be good.
+ */
+#ifdef CONFIG_CPU_IDLE_GOV_SCHEDULED
+ if (unlikely(io_pending))
+#endif
/* update the repeating-pattern data */
pred->intervals[pred->interval_index++] = actual_us;
if (pred->interval_index >= INTERVALS) {
diff --git a/include/linux/cpuidle_scheduled.h b/include/linux/cpuidle_scheduled.h
new file mode 100644
index 000000000000..a50ec6348d5d
--- /dev/null
+++ b/include/linux/cpuidle_scheduled.h
@@ -0,0 +1,21 @@
+/*
+ * cpuidle_scheduled.h - interface for the scheduled CPU idle governor
+ *
+ * Copyright 2013 Linaro Limited
+ *
+ * This code is licenced under the GPL version 2 as described
+ * in the COPYING file that accompanies the Linux Kernel.
+ */
+
+#ifndef _LINUX_CPUIDLE_SCHEDULED_H
+#define _LINUX_CPUIDLE_SCHEDULED_H
+
+#ifdef CONFIG_CPU_IDLE_GOV_SCHEDULED
+
+#include <linux/cpuidle.h>
+
+extern void cpuidle_scheduled_result(struct cpuidle_state *, unsigned int);
+
+#endif /* CONFIG_CPU_IDLE_GOV_SCHEDULED */
+
+#endif /* _LINUX_CPUIDLE_SCHEDULED_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d676aa297a9b..b7dca4e3d04f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -63,12 +63,16 @@ struct fs_struct;
struct perf_event_context;
struct blk_plug;
-/* This structure is used to share information and statistics with other
- * frameworks. It only shares wake up latency fro the moment but should be
- * extended with other usefull informations
+/*
+ * This structure is used to share information and statistics with other
+ * frameworks.
*/
struct sched_pm {
- atomic_t wake_latency; /* time to wake up the cpu */
+ atomic_t wake_latency; /* time to wake up the cpu */
+ int idle_max_latency;
+ unsigned int idle_time_until_timer;
+ unsigned int idle_length_estimate;
+ struct cpuidle_state *idle_current_state;
};
DECLARE_PER_CPU(struct sched_pm, sched_stat);
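(Reviewer illustration, not part of the patch.) The commit message says the
chosen state is recorded "for use by the scheduler": once every CPU publishes
its idle_current_state, scheduler code could compare wakeup costs across
cores. The patch adds no such consumer yet, so the sketch below is purely
hypothetical userspace code; cheapest_idle_cpu() and the state table are
invented names.

/* Hypothetical illustration only: models how a scheduler-side consumer
 * could use the published idle_current_state to prefer waking the CPU
 * that is cheapest to bring back up. */
#include <limits.h>
#include <stdio.h>

struct idle_state {
	const char *name;
	unsigned int exit_latency; /* us */
};

struct sched_pm {
	const struct idle_state *idle_current_state; /* NULL => not idle */
};

#define NR_CPUS 4

static const struct idle_state wfi  = { "WFI", 1 };
static const struct idle_state poff = { "power-off", 1200 };

/* Invented helper: pick the idle CPU with the smallest exit latency. */
static int cheapest_idle_cpu(const struct sched_pm stat[NR_CPUS])
{
	int cpu, best = -1;
	unsigned int best_latency = UINT_MAX;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		const struct idle_state *s = stat[cpu].idle_current_state;

		if (s && s->exit_latency < best_latency) {
			best_latency = s->exit_latency;
			best = cpu;
		}
	}
	return best; /* -1 if no CPU is idle */
}

int main(void)
{
	struct sched_pm stat[NR_CPUS] = {
		{ NULL }, { &poff }, { &wfi }, { &poff },
	};

	printf("wake CPU %d first\n", cheapest_idle_cpu(stat));
	return 0;
}

In the kernel the same information would presumably be read from
per_cpu(sched_stat, cpu).idle_current_state, as declared above.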
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 54adcf35f495..aca231fb9dfe 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -17,3 +17,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+obj-$(CONFIG_CPU_IDLE_GOV_SCHEDULED) += idle_coop.o
diff --git a/kernel/sched/idle_coop.c b/kernel/sched/idle_coop.c
new file mode 100644
index 000000000000..32c18520c122
--- /dev/null
+++ b/kernel/sched/idle_coop.c
@@ -0,0 +1,128 @@
+#include "sched.h"
+#include <linux/cpuidle_scheduled.h>
+#include <linux/pm_qos.h>
+#include <linux/time.h>
+#include <linux/ktime.h>
+/*
+ * Scheduler co-operation with cpuidle
+ *
+ */
+
+static DEFINE_PER_CPU(struct wakeup_predictor *, core_predictor);
+
+void setup_scheduled_cpuidle(struct rq *rq)
+{
+ /*
+ * This needs revisit, but the initial ideas are:
+ * 0) The basis of latency requirement is the current value set
+ * through the pm QoS framework. This includes sources such
+ * as user space processes that may have information about
+ * required system responsiveness.
+ * 1) Cores that don't handle interrupts don't care about QoS limits
+ * set for interrupt latency. (Not handled in this version!)
+ * Likewise for cores not involved in handling interactive
+ * processes that may have set a QoS request.
+ * 2) Cores that are involved in unthrottled I/O (e.g. file copy)
+ * do have improved performance with low latency. However,
+ * depending on exact hardware and user requirements, the
+ * improved performance might not be necessary or even productive.
+ * 3) Throttled I/O, e.g. MP3 playback, does not require low latency,
+ * unless the hardware audio buffer is really tiny and we can
+ * safely sleep in deep states.
+ *
+ * Latency requirement should reflect all of the above and nothing
+ * else. In particular, the expected sleep duration or energy
+ * break-even point of a state should not affect maximum latency
+ * determination.
+ *
+ * Idle duration when there is no I/O should be based on history
+ * of idle periods, where we try to correlate time to next timer
+ * expiry to expected sleep duration. We do not try to seek any
+ * sort of patterns in idle periods when there is no I/O.
+ *
+ * Idle duration when there is I/O is likely to be defined by
+ * the speed of mass media device. Since this may differ by
+ * several orders of magnitude, we do try to find a pattern and
+ * use that as well.
+ *
+ * The number of processes sleeping on block devices / filesystems
+ * I/O is quite accurately reported by rq->nr_iowait.
+ * Other kinds of I/O (e.g. audio) are not reflected there, as
+ * those drivers tend to use functions such as sleep_on(), which
+ * do not affect nr_iowait. Calls to io_schedule[_timeout]()
+ * happen from files in block/, drivers/block/, drivers/md,
+ * fs/ and mm/.
+ *
+ * NOTE: All of this really could still be done from menu.c, but
+ * the intent is to improve this beyond the limits of being
+ * outside of the scheduler.
+ */
+ int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
+
+ unsigned int next_timer_us;
+ unsigned int predicted_us;
+ unsigned int repeating_us;
+ struct timespec t;
+ int nr_iowait = atomic_read(&rq->nr_iowait);
+ struct sched_pm *pmdata = this_cpu_ptr(&sched_stat);
+ struct wakeup_predictor *pred = __this_cpu_read(core_predictor);
+
+ if (unlikely(pred == NULL)) {
+ pred = create_wakeup_predictor();
+ __this_cpu_write(core_predictor, pred);
+ }
+
+ /* Zero latency makes all other considerations obsolete */
+ if (unlikely(latency_req == 0)) {
+ pmdata->idle_time_until_timer = 0;
+ pmdata->idle_length_estimate = 0;
+ pmdata->idle_max_latency = 0;
+ return;
+ }
+
+ /*
+ * For now, assume all I/O is noncritical unless user space
+ * has set a QoS restriction. If some sort of heuristic is
+ * to be added, adjust the latency requirement here.
+ *
+ * (We should also be able to distinguish between drivers setting
+ * latency limits for interrupt handling (and honor those only
+ * on relevant core(s)) and user space requests. Currently that
+ * is not possible.)
+ */
+
+ /* Determine time to next timer expiry */
+ t = ktime_to_timespec(tick_nohz_get_sleep_length());
+ next_timer_us = t.tv_sec * USEC_PER_SEC + t.tv_nsec / NSEC_PER_USEC;
+
+ /* Always predict by timer scaling */
+ predicted_us = predict_scaled_wakeup(pred, next_timer_us, nr_iowait);
+
+ /*
+ * In addition, if block I/O is pending, try to predict based on
+ * recurring events (I/O completion).
+ */
+ if (nr_iowait) {
+ repeating_us = predict_repeating_wakeup(pred);
+ if (repeating_us < predicted_us)
+ predicted_us = repeating_us;
+ }
+
+ pmdata->idle_time_until_timer = next_timer_us;
+ pmdata->idle_length_estimate = predicted_us;
+ pmdata->idle_max_latency = latency_req;
+}
+
+void cpuidle_scheduled_result(struct cpuidle_state *state,
+ unsigned int duration)
+{
+ struct wakeup_predictor *pred = __this_cpu_read(core_predictor);
+ struct sched_pm *pmdata = this_cpu_ptr(&sched_stat);
+
+ /* Pred can be NULL after the initial sleep for non-boot cores */
+ if (unlikely(pred == NULL))
+ return;
+
+ wakeup_predictor_update(pred, pmdata->idle_time_until_timer,
+ 0, duration);
+}
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index d8da01008d39..e13cea79ebad 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -23,6 +23,7 @@ static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
static void post_schedule_idle(struct rq *rq)
{
idle_enter_fair(rq);
+ setup_scheduled_cpuidle(rq);
}
#endif /* CONFIG_SMP */
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d5a4ec0db08c..ad241bc13df7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1032,6 +1032,12 @@ static inline void update_packing_domain(int cpu) {};
extern void idle_enter_fair(struct rq *this_rq);
extern void idle_exit_fair(struct rq *this_rq);
+#ifdef CONFIG_CPU_IDLE_GOV_SCHEDULED
+extern void setup_scheduled_cpuidle(struct rq *rq);
+#else
+#define setup_scheduled_cpuidle(notused) do { } while (0)
+#endif
+
#else /* CONFIG_SMP */
static inline void idle_balance(int cpu, struct rq *rq)