-rw-r--r--  drivers/cpuidle/Kconfig                     |  15
-rw-r--r--  drivers/cpuidle/governors/Makefile          |   1
-rw-r--r--  drivers/cpuidle/governors/scheduled.c       | 113
-rw-r--r--  drivers/cpuidle/governors/wakeup_predict.c  |   7
-rw-r--r--  include/linux/cpuidle_scheduled.h           |  21
-rw-r--r--  include/linux/sched.h                       |  12
-rw-r--r--  kernel/sched/Makefile                       |   1
-rw-r--r--  kernel/sched/idle_coop.c                    | 128
-rw-r--r--  kernel/sched/idle_task.c                    |   1
-rw-r--r--  kernel/sched/sched.h                        |   6
10 files changed, 300 insertions, 5 deletions
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
index c711697f1343..3ca755653589 100644
--- a/drivers/cpuidle/Kconfig
+++ b/drivers/cpuidle/Kconfig
@@ -4,7 +4,7 @@ config CPU_IDLE
bool "CPU idle PM support"
default y if ACPI || PPC_PSERIES
select CPU_IDLE_GOV_LADDER if (!NO_HZ && !NO_HZ_IDLE)
- select CPU_IDLE_GOV_MENU if (NO_HZ || NO_HZ_IDLE)
+ select CPU_IDLE_GOV_MENU if ((NO_HZ || NO_HZ_IDLE) && !CPU_IDLE_GOV_SCHEDULED)
help
CPU idle is a generic framework for supporting software-controlled
idle processor power management. It includes modular cross-platform
@@ -32,6 +32,19 @@ config CPU_IDLE_GOV_MENU
select CPU_IDLE_GOV_SHARED_CSTATE_LOOKUP
default y
+if SMP
+config CPU_IDLE_GOV_SCHEDULED
+ bool "Scheduled governor"
+ select CPU_IDLE_GOV_SHARED_PREDICTOR
+ select CPU_IDLE_GOV_SHARED_CSTATE_LOOKUP
+ default n
+ help
+ A governor that selects an idle state based on timing constraints
+ set by another part of the kernel. The state selection is recorded
+ in a variable visible to other parts of the kernel so all cores
+ may be considered when making scheduling decisions.
+endif
+
config CPU_IDLE_GOV_SHARED_PREDICTOR
def_bool n
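
The help text above is the heart of the series: the selected state is published per-cpu so that other kernel code can take every core's idle depth into account. As a rough sketch of a possible consumer (not part of this patch; pick_least_idle_cpu() is an invented name, while sched_stat and idle_current_state are introduced below), a load-balancing path could prefer the core that is cheapest to wake:

/* Hypothetical consumer sketch; the unsynchronized read is advisory only. */
static int pick_least_idle_cpu(const struct cpumask *candidates)
{
	int cpu, best_cpu = -1;
	unsigned int best_latency = UINT_MAX;

	for_each_cpu(cpu, candidates) {
		struct cpuidle_state *s =
			per_cpu(sched_stat, cpu).idle_current_state;
		/* NULL means the core is not idle, so waking it costs nothing extra */
		unsigned int latency = s ? s->exit_latency : 0;

		if (latency < best_latency) {
			best_latency = latency;
			best_cpu = cpu;
		}
	}
	return best_cpu;
}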
diff --git a/drivers/cpuidle/governors/Makefile b/drivers/cpuidle/governors/Makefile
index 0e02d5c16720..1b3e2d6fd900 100644
--- a/drivers/cpuidle/governors/Makefile
+++ b/drivers/cpuidle/governors/Makefile
@@ -4,5 +4,6 @@
obj-$(CONFIG_CPU_IDLE_GOV_LADDER) += ladder.o
obj-$(CONFIG_CPU_IDLE_GOV_MENU) += menu.o
+obj-$(CONFIG_CPU_IDLE_GOV_SCHEDULED) += scheduled.o
obj-$(CONFIG_CPU_IDLE_GOV_SHARED_PREDICTOR) += wakeup_predict.o
obj-$(CONFIG_CPU_IDLE_GOV_SHARED_CSTATE_LOOKUP) += cstate_lookup.o
diff --git a/drivers/cpuidle/governors/scheduled.c b/drivers/cpuidle/governors/scheduled.c
new file mode 100644
index 000000000000..b597bd267932
--- /dev/null
+++ b/drivers/cpuidle/governors/scheduled.c
@@ -0,0 +1,113 @@
+/*
+ * scheduled.c - A governor that selects an idle state based on external input
+ *
+ * Copyright 2013 Linaro Limited
+ * Author:
+ * Tuukka Tikkanen <tuukka.tikkanen@linaro.org>
+ *
+ * This code is licensed under the GPL version 2 as described
+ * in the COPYING file that accompanies the Linux Kernel.
+ */
+
+#include <linux/kernel.h>
+#include <linux/cpuidle_scheduled.h>
+#include <linux/cpuidle.h>
+#include <linux/pm_qos.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+
+/**
+ * scheduled_select - selects the next idle state to enter
+ * @drv: cpuidle driver containing state data
+ * @dev: the CPU
+ */
+static int scheduled_select(struct cpuidle_driver *drv,
+ struct cpuidle_device *dev)
+{
+ int state;
+ struct sched_pm *pmdata = this_cpu_ptr(&sched_stat);
+
+ state = cpuidle_cstate_lookup(drv, dev,
+ pmdata->idle_time_until_timer,
+ pmdata->idle_length_estimate,
+ pmdata->idle_max_latency,
+ NULL);
+ pmdata->idle_current_state = &drv->states[state];
+
+ return state;
+}
+
+/**
+ * scheduled_reflect - records the actual idle period length
+ * @dev: the CPU
+ * @index: the index of actual entered state
+ */
+static void scheduled_reflect(struct cpuidle_device *dev, int index)
+{
+ unsigned int last_idle_us;
+ struct sched_pm *pmdata = this_cpu_ptr(&sched_stat);
+ unsigned int timer_limit = pmdata->idle_time_until_timer;
+ struct cpuidle_state *state = pmdata->idle_current_state;
+
+ if (unlikely(!(state->flags & CPUIDLE_FLAG_TIME_VALID))) {
+ last_idle_us = timer_limit;
+ } else {
+ last_idle_us = cpuidle_get_last_residency(dev);
+ if (last_idle_us > state->exit_latency)
+ last_idle_us -= state->exit_latency;
+ if (last_idle_us > timer_limit)
+ last_idle_us = timer_limit;
+ }
+
+ cpuidle_scheduled_result(state, last_idle_us);
+
+ pmdata->idle_current_state = NULL;
+}
+
+
+/**
+ * scheduled_enable_device - reset per cpu variables after hotplug
+ * @drv: cpuidle driver
+ * @dev: the CPU
+ */
+static int scheduled_enable_device(struct cpuidle_driver *drv,
+ struct cpuidle_device *dev)
+{
+ struct sched_pm *pmdata = &per_cpu(sched_stat, dev->cpu);
+
+ pmdata->idle_time_until_timer = UINT_MAX;
+ pmdata->idle_length_estimate = UINT_MAX;
+ pmdata->idle_max_latency = INT_MAX;
+ pmdata->idle_current_state = NULL;
+
+ return 0;
+}
+
+static struct cpuidle_governor scheduled_governor = {
+ .name = "scheduled",
+ .rating = 100,
+ .enable = scheduled_enable_device,
+ .select = scheduled_select,
+ .reflect = scheduled_reflect,
+ .owner = THIS_MODULE,
+};
+
+/**
+ * init_scheduled_idle_gov - initializes the governor
+ */
+static int __init init_scheduled_idle_gov(void)
+{
+ return cpuidle_register_governor(&scheduled_governor);
+}
+
+/**
+ * exit_scheduled_idle_gov - exits the governor
+ */
+static void __exit exit_scheduled_idle_gov(void)
+{
+ cpuidle_unregister_governor(&scheduled_governor);
+}
+
+MODULE_LICENSE("GPL");
+module_init(init_scheduled_idle_gov);
+module_exit(exit_scheduled_idle_gov);
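
The arithmetic in scheduled_reflect() above is worth reading in isolation: the measured residency includes the state's exit latency, so that is subtracted before the value is clamped to the known upper bound. A standalone model, with invented numbers in the comment:

/*
 * Model of the scheduled_reflect() correction. Example: measured
 * residency 980us, exit latency 100us, next timer in 800us
 * -> 880us after subtracting the exit latency -> clamped to 800us.
 */
static unsigned int corrected_idle_us(unsigned int measured_us,
				      unsigned int exit_latency_us,
				      unsigned int timer_limit_us)
{
	if (measured_us > exit_latency_us)
		measured_us -= exit_latency_us;
	if (measured_us > timer_limit_us)
		measured_us = timer_limit_us;
	return measured_us;
}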
diff --git a/drivers/cpuidle/governors/wakeup_predict.c b/drivers/cpuidle/governors/wakeup_predict.c
index 8b9872f9dc41..7f0417835c5c 100644
--- a/drivers/cpuidle/governors/wakeup_predict.c
+++ b/drivers/cpuidle/governors/wakeup_predict.c
@@ -256,6 +256,13 @@ void wakeup_predictor_update(struct wakeup_predictor *pred,
pred->correction_factor[bucket] = new_factor;
+/*
+ * Hack for testing: don't update pattern data when no I/O is pending.
+ * This needs a proper fix if the approach turns out to be good.
+ */
+#ifdef CONFIG_CPU_IDLE_GOV_SCHEDULED
+ if (unlikely(io_pending))
+#endif
/* update the repeating-pattern data */
pred->intervals[pred->interval_index++] = actual_us;
if (pred->interval_index >= INTERVALS) {
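
To make the effect of the hack concrete, here is a minimal model of the interval ring buffer it guards, assuming wakeup_predictor_update() gained an io_pending argument in this series (the 8-slot size is an assumption; the real INTERVALS constant lives in wakeup_predict.c):

#define INTERVALS 8	/* assumed size, for illustration only */

struct interval_ring {
	unsigned int intervals[INTERVALS];
	unsigned int index;
};

/* Record an interval only for I/O-bound wakeups; otherwise the pattern is frozen. */
static void record_interval(struct interval_ring *r, unsigned int actual_us,
			    bool io_pending)
{
	if (!io_pending)
		return;
	r->intervals[r->index++] = actual_us;
	if (r->index >= INTERVALS)
		r->index = 0;
}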
diff --git a/include/linux/cpuidle_scheduled.h b/include/linux/cpuidle_scheduled.h
new file mode 100644
index 000000000000..a50ec6348d5d
--- /dev/null
+++ b/include/linux/cpuidle_scheduled.h
@@ -0,0 +1,21 @@
+/*
+ * cpuidle_scheduled.h - interface for the scheduled CPU idle governor
+ *
+ * Copyright 2013 Linaro Limited
+ *
+ * This code is licensed under the GPL version 2 as described
+ * in the COPYING file that accompanies the Linux Kernel.
+ */
+
+#ifndef _LINUX_CPUIDLE_SCHEDULED_H
+#define _LINUX_CPUIDLE_SCHEDULED_H
+
+#ifdef CONFIG_CPU_IDLE_GOV_SCHEDULED
+
+#include <linux/cpuidle.h>
+
+extern void cpuidle_scheduled_result(struct cpuidle_state *, unsigned int);
+
+#endif /* CONFIG_CPU_IDLE_GOV_SCHEDULED */
+
+#endif /* _LINUX_CPUIDLE_SCHEDULED_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d676aa297a9b..b7dca4e3d04f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -63,12 +63,16 @@ struct fs_struct;
struct perf_event_context;
struct blk_plug;
-/* This structure is used to share information and statistics with other
- * frameworks. It only shares wake up latency fro the moment but should be
- * extended with other usefull informations
+/*
+ * This structure is used to share information and statistics with other
+ * frameworks.
*/
struct sched_pm {
- atomic_t wake_latency; /* time to wake up the cpu */
+ atomic_t wake_latency; /* time to wake up the cpu */
+ int idle_max_latency;
+ unsigned int idle_time_until_timer;
+ unsigned int idle_length_estimate;
+ struct cpuidle_state *idle_current_state;
};
DECLARE_PER_CPU(struct sched_pm, sched_stat);
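
The new fields use sentinels for "no constraint" (UINT_MAX for the durations, INT_MAX for the latency, as initialized by scheduled_enable_device() in this series). A hypothetical helper, shown only to illustrate the convention:

/* Hypothetical; not part of this patch. */
static inline bool sched_pm_idle_constrained(int cpu)
{
	struct sched_pm *pm = &per_cpu(sched_stat, cpu);

	return pm->idle_max_latency != INT_MAX ||
	       pm->idle_length_estimate != UINT_MAX ||
	       pm->idle_time_until_timer != UINT_MAX;
}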
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 54adcf35f495..aca231fb9dfe 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -17,3 +17,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+obj-$(CONFIG_CPU_IDLE_GOV_SCHEDULED) += idle_coop.o
diff --git a/kernel/sched/idle_coop.c b/kernel/sched/idle_coop.c
new file mode 100644
index 000000000000..32c18520c122
--- /dev/null
+++ b/kernel/sched/idle_coop.c
@@ -0,0 +1,128 @@
+#include "sched.h"
+#include <linux/cpuidle_scheduled.h>
+#include <linux/pm_qos.h>
+#include <linux/time.h>
+#include <linux/ktime.h>
+/*
+ * Scheduler co-operation with cpuidle
+ *
+ */
+
+static DEFINE_PER_CPU(struct wakeup_predictor *, core_predictor);
+
+void setup_scheduled_cpuidle(struct rq *rq)
+{
+ /*
+ * This needs revisit, but the initial ideas are:
+ * 0) The basis of latency requirement is the current value set
+ * through the pm QoS framework. This includes sources such
+ * as user space processes that may have information about
+ * required system responsiveness.
+ * 1) Cores that don't handle interrupts don't care about QoS limits
+ * set for interrupt latency. (Not handled in this version!)
+ * Likewise for cores not involved in handling interactive
+ * processes that may have set a QoS request.
+ * 2) Cores that are involved in unthrottled I/O (e.g. file copy)
+ * do have improved performance with low latency. However,
+ * depending on exact hardware and user requirements, the
+ * improved performance might not be necessary or even productive.
+ * 3) Throttled I/O, e.g. MP3 playback, does not require low
+ * latency (unless the hardware audio buffer is really tiny),
+ * so we can safely sleep in deep states.
+ *
+ * Latency requirement should reflect all of the above and nothing
+ * else. In particular, the expected sleep duration or energy
+ * break-even point of a state should not affect maximum latency
+ * determination.
+ *
+ * Idle duration when there is no I/O should be based on history
+ * of idle periods, where we try to correlate time to next timer
+ * expiry to expected sleep duration. We do not try to seek any
+ * sort of patterns in idle periods when there is no I/O.
+ *
+ * Idle duration when there is I/O is likely to be defined by
+ * the speed of the mass storage device. Since this may differ by
+ * several orders of magnitude, we do try to find a pattern and
+ * use that as well.
+ *
+ * The number of processes sleeping on block device / filesystem
+ * I/O is quite accurately reported by rq->nr_iowait.
+ * Other kinds of I/O (e.g. audio) are not reflected there, as
+ * those drivers tend to use functions such as sleep_on(), which
+ * do not affect nr_iowait. Calls to io_schedule[_timeout]()
+ * happen from files in block/, drivers/block/, drivers/md,
+ * fs/ and mm/.
+ *
+ * NOTE: All of this could still be done from menu.c, but the
+ * intent is to improve this beyond what is possible from
+ * outside of the scheduler.
+ */
+ int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
+
+ unsigned int next_timer_us;
+ unsigned int predicted_us;
+ unsigned int repeating_us;
+ struct timespec t;
+ int nr_iowait = atomic_read(&rq->nr_iowait);
+ struct sched_pm *pmdata = this_cpu_ptr(&sched_stat);
+ struct wakeup_predictor *pred = __this_cpu_read(core_predictor);
+
+ if (unlikely(pred == NULL)) {
+ pred = create_wakeup_predictor();
+ __this_cpu_write(core_predictor, pred);
+ }
+
+ /* Zero latency makes all other considerations obsolete */
+ if (unlikely(latency_req == 0)) {
+ pmdata->idle_time_until_timer = 0;
+ pmdata->idle_length_estimate = 0;
+ pmdata->idle_max_latency = 0;
+ return;
+ }
+
+ /*
+ * For now, assume all I/O is noncritical unless user space
+ * has set a QoS restriction. If some sort of heuristics are
+ * to be added, adjust the latency requirement here.
+ *
+ * (We should also be able to distinguish drivers setting latency
+ * limits for interrupt handling, which should be honored only on
+ * the relevant core(s), from user space requests. Currently that
+ * is not possible.)
+ */
+
+ /* Determine time to next timer expiry */
+ t = ktime_to_timespec(tick_nohz_get_sleep_length());
+ next_timer_us = t.tv_sec * USEC_PER_SEC + t.tv_nsec / NSEC_PER_USEC;
+
+ /* Always predict by timer scaling */
+ predicted_us = predict_scaled_wakeup(pred, next_timer_us, nr_iowait);
+
+ /*
+ * In addition, if block I/O is pending, try to predict based on
+ * recurring events (I/O completion).
+ */
+ if (nr_iowait) {
+ repeating_us = predict_repeating_wakeup(pred);
+ if (repeating_us < predicted_us)
+ predicted_us = repeating_us;
+ }
+
+ pmdata->idle_time_until_timer = next_timer_us;
+ pmdata->idle_length_estimate = predicted_us;
+ pmdata->idle_max_latency = latency_req;
+}
+
+void cpuidle_scheduled_result(struct cpuidle_state *state,
+ unsigned int duration)
+{
+ struct wakeup_predictor *pred = __this_cpu_read(core_predictor);
+ struct sched_pm *pmdata = this_cpu_ptr(&sched_stat);
+
+ /* pred can be NULL after the initial sleep on non-boot cores */
+ if (unlikely(pred == NULL))
+ return;
+
+ wakeup_predictor_update(pred, pmdata->idle_time_until_timer,
+ 0, duration);
+}
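
Taken together, the pieces added by this series interact once per idle cycle, roughly as sketched below (the function names are the ones introduced above; the ordering is inferred from the patch):

/*
 *   post_schedule_idle(rq)
 *     -> setup_scheduled_cpuidle(rq)      publish constraints in sched_stat
 *   cpuidle idle loop
 *     -> scheduled_select(drv, dev)       read sched_stat, pick a state
 *        ... CPU sleeps, then wakes ...
 *     -> scheduled_reflect(dev, index)    measure the actual residency
 *          -> cpuidle_scheduled_result()  feed the wakeup predictor
 */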
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index d8da01008d39..e13cea79ebad 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -23,6 +23,7 @@ static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
static void post_schedule_idle(struct rq *rq)
{
idle_enter_fair(rq);
+ setup_scheduled_cpuidle(rq);
}
#endif /* CONFIG_SMP */
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d5a4ec0db08c..ad241bc13df7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1032,6 +1032,12 @@ static inline void update_packing_domain(int cpu) {};
extern void idle_enter_fair(struct rq *this_rq);
extern void idle_exit_fair(struct rq *this_rq);
+#ifdef CONFIG_CPU_IDLE_GOV_SCHEDULED
+extern void setup_scheduled_cpuidle(struct rq *rq);
+#else
+#define setup_scheduled_cpuidle(notused) do { } while (0)
+#endif
+
#else /* CONFIG_SMP */
static inline void idle_balance(int cpu, struct rq *rq)