-rw-r--r--  drivers/cpuidle/Kconfig                     |  15
-rw-r--r--  drivers/cpuidle/governors/Makefile          |   1
-rw-r--r--  drivers/cpuidle/governors/scheduled.c       | 113
-rw-r--r--  drivers/cpuidle/governors/wakeup_predict.c  |   7
-rw-r--r--  include/linux/cpuidle_scheduled.h           |  21
-rw-r--r--  include/linux/sched.h                       |  12
-rw-r--r--  kernel/sched/Makefile                       |   1
-rw-r--r--  kernel/sched/idle_coop.c                    | 128
-rw-r--r--  kernel/sched/idle_task.c                    |   1
-rw-r--r--  kernel/sched/sched.h                        |   6
10 files changed, 300 insertions, 5 deletions
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
index c711697f1343..3ca755653589 100644
--- a/drivers/cpuidle/Kconfig
+++ b/drivers/cpuidle/Kconfig
@@ -4,7 +4,7 @@ config CPU_IDLE
bool "CPU idle PM support"
default y if ACPI || PPC_PSERIES
select CPU_IDLE_GOV_LADDER if (!NO_HZ && !NO_HZ_IDLE)
- select CPU_IDLE_GOV_MENU if (NO_HZ || NO_HZ_IDLE)
+ select CPU_IDLE_GOV_MENU if ((NO_HZ || NO_HZ_IDLE) && !CPU_IDLE_GOV_SCHEDULED)
help
CPU idle is a generic framework for supporting software-controlled
idle processor power management. It includes modular cross-platform
@@ -32,6 +32,19 @@ config CPU_IDLE_GOV_MENU
select CPU_IDLE_GOV_SHARED_CSTATE_LOOKUP
default y
+if SMP
+config CPU_IDLE_GOV_SCHEDULED
+ bool "Scheduled governor"
+ select CPU_IDLE_GOV_SHARED_PREDICTOR
+ select CPU_IDLE_GOV_SHARED_CSTATE_LOOKUP
+ default n
+ help
+ A governor that selects an idle state based on timing constraints
+ set by another part of the kernel. The state selection is recorded
+ in a variable visible to other parts of the kernel so all cores
+ may be considered when making scheduling decisions.
+endif
+
config CPU_IDLE_GOV_SHARED_PREDICTOR
def_bool n
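
The help text above is the heart of the series: the selected state is published per-cpu so that other kernel code can take every core's idle depth into account. As a rough sketch of a possible consumer (not part of this patch; pick_least_idle_cpu() is an invented name, while sched_stat and idle_current_state are introduced below), a load-balancing path could prefer the core that is cheapest to wake:

/* Hypothetical consumer sketch; the unsynchronized read is advisory only. */
static int pick_least_idle_cpu(const struct cpumask *candidates)
{
	int cpu, best_cpu = -1;
	unsigned int best_latency = UINT_MAX;

	for_each_cpu(cpu, candidates) {
		struct cpuidle_state *s =
			per_cpu(sched_stat, cpu).idle_current_state;
		/* NULL means the core is not idle, so waking it costs nothing extra */
		unsigned int latency = s ? s->exit_latency : 0;

		if (latency < best_latency) {
			best_latency = latency;
			best_cpu = cpu;
		}
	}
	return best_cpu;
}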
diff --git a/drivers/cpuidle/governors/Makefile b/drivers/cpuidle/governors/Makefile
index 0e02d5c16720..1b3e2d6fd900 100644
--- a/drivers/cpuidle/governors/Makefile
+++ b/drivers/cpuidle/governors/Makefile
@@ -4,5 +4,6 @@
obj-$(CONFIG_CPU_IDLE_GOV_LADDER) += ladder.o
obj-$(CONFIG_CPU_IDLE_GOV_MENU) += menu.o
+obj-$(CONFIG_CPU_IDLE_GOV_SCHEDULED) += scheduled.o
obj-$(CONFIG_CPU_IDLE_GOV_SHARED_PREDICTOR) += wakeup_predict.o
obj-$(CONFIG_CPU_IDLE_GOV_SHARED_CSTATE_LOOKUP) += cstate_lookup.o
diff --git a/drivers/cpuidle/governors/scheduled.c b/drivers/cpuidle/governors/scheduled.c
new file mode 100644
index 000000000000..b597bd267932
--- /dev/null
+++ b/drivers/cpuidle/governors/scheduled.c
@@ -0,0 +1,113 @@
+/*
+ * scheduled.c - A governor that selects an idle state based on external input
+ *
+ * Copyright 2013 Linaro Limited
+ * Author:
+ * Tuukka Tikkanen <tuukka.tikkanen@linaro.org>
+ *
+ * This code is licensed under the GPL version 2 as described
+ * in the COPYING file that accompanies the Linux Kernel.
+ */
+
+#include <linux/kernel.h>
+#include <linux/cpuidle_scheduled.h>
+#include <linux/cpuidle.h>
+#include <linux/pm_qos.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+
+/**
+ * scheduled_select - selects the next idle state to enter
+ * @drv: cpuidle driver containing state data
+ * @dev: the CPU
+ */
+static int scheduled_select(struct cpuidle_driver *drv,
+ struct cpuidle_device *dev)
+{
+ int state;
+ struct sched_pm *pmdata = this_cpu_ptr(&sched_stat);
+
+ state = cpuidle_cstate_lookup(drv, dev,
+ pmdata->idle_time_until_timer,
+ pmdata->idle_length_estimate,
+ pmdata->idle_max_latency,
+ NULL);
+ pmdata->idle_current_state = &drv->states[state];
+
+ return state;
+}
+
+/**
+ * scheduled_reflect - records the actual idle period length
+ * @dev: the CPU
+ * @index: the index of actual entered state
+ */
+static void scheduled_reflect(struct cpuidle_device *dev, int index)
+{
+ unsigned int last_idle_us;
+ struct sched_pm *pmdata = this_cpu_ptr(&sched_stat);
+ unsigned int timer_limit = pmdata->idle_time_until_timer;
+ struct cpuidle_state *state = pmdata->idle_current_state;
+
+ if (unlikely(!(state->flags & CPUIDLE_FLAG_TIME_VALID))) {
+ last_idle_us = timer_limit;
+ } else {
+ last_idle_us = cpuidle_get_last_residency(dev);
+ if (last_idle_us > state->exit_latency)
+ last_idle_us -= state->exit_latency;
+ if (last_idle_us > timer_limit)
+ last_idle_us = timer_limit;
+ }
+
+ cpuidle_scheduled_result(state, last_idle_us);
+
+ pmdata->idle_current_state = NULL;
+}
+
+
+/**
+ * scheduled_enable_device - reset per cpu variables after hotplug
+ * @drv: cpuidle driver
+ * @dev: the CPU
+ */
+static int scheduled_enable_device(struct cpuidle_driver *drv,
+ struct cpuidle_device *dev)
+{
+ struct sched_pm *pmdata = &per_cpu(sched_stat, dev->cpu);
+
+ pmdata->idle_time_until_timer = UINT_MAX;
+ pmdata->idle_length_estimate = UINT_MAX;
+ pmdata->idle_max_latency = INT_MAX;
+ pmdata->idle_current_state = NULL;
+
+ return 0;
+}
+
+static struct cpuidle_governor scheduled_governor = {
+ .name = "scheduled",
+ .rating = 100,
+ .enable = scheduled_enable_device,
+ .select = scheduled_select,
+ .reflect = scheduled_reflect,
+ .owner = THIS_MODULE,
+};
+
+/**
+ * init_scheduled_idle_gov - initializes the governor
+ */
+static int __init init_scheduled_idle_gov(void)
+{
+ return cpuidle_register_governor(&scheduled_governor);
+}
+
+/**
+ * exit_scheduled_idle_gov - exits the governor
+ */
+static void __exit exit_scheduled_idle_gov(void)
+{
+ cpuidle_unregister_governor(&scheduled_governor);
+}
+
+MODULE_LICENSE("GPL");
+module_init(init_scheduled_idle_gov);
+module_exit(exit_scheduled_idle_gov);
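
The arithmetic in scheduled_reflect() above is worth reading in isolation: the measured residency includes the state's exit latency, so that is subtracted before the value is clamped to the known upper bound. A standalone model, with invented numbers in the comment:

/*
 * Model of the scheduled_reflect() correction. Example: measured
 * residency 980us, exit latency 100us, next timer in 800us
 * -> 880us after subtracting the exit latency -> clamped to 800us.
 */
static unsigned int corrected_idle_us(unsigned int measured_us,
				      unsigned int exit_latency_us,
				      unsigned int timer_limit_us)
{
	if (measured_us > exit_latency_us)
		measured_us -= exit_latency_us;
	if (measured_us > timer_limit_us)
		measured_us = timer_limit_us;
	return measured_us;
}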
diff --git a/drivers/cpuidle/governors/wakeup_predict.c b/drivers/cpuidle/governors/wakeup_predict.c
index 8b9872f9dc41..7f0417835c5c 100644
--- a/drivers/cpuidle/governors/wakeup_predict.c
+++ b/drivers/cpuidle/governors/wakeup_predict.c
@@ -256,6 +256,13 @@ void wakeup_predictor_update(struct wakeup_predictor *pred,
pred->correction_factor[bucket] = new_factor;
+/*
+ * Hack for testing: don't update pattern data when no I/O is pending.
+ * This needs a proper fix if the approach turns out to be good.
+ */
+#ifdef CONFIG_CPU_IDLE_GOV_SCHEDULED
+ if (unlikely(io_pending))
+#endif
/* update the repeating-pattern data */
pred->intervals[pred->interval_index++] = actual_us;
if (pred->interval_index >= INTERVALS) {
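
To make the effect of the hack concrete, here is a minimal model of the interval ring buffer it guards, assuming wakeup_predictor_update() gained an io_pending argument in this series (the 8-slot size is an assumption; the real INTERVALS constant lives in wakeup_predict.c):

#define INTERVALS 8	/* assumed size, for illustration only */

struct interval_ring {
	unsigned int intervals[INTERVALS];
	unsigned int index;
};

/* Record an interval only for I/O-bound wakeups; otherwise the pattern is frozen. */
static void record_interval(struct interval_ring *r, unsigned int actual_us,
			    bool io_pending)
{
	if (!io_pending)
		return;
	r->intervals[r->index++] = actual_us;
	if (r->index >= INTERVALS)
		r->index = 0;
}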
diff --git a/include/linux/cpuidle_scheduled.h b/include/linux/cpuidle_scheduled.h
new file mode 100644
index 000000000000..a50ec6348d5d
--- /dev/null
+++ b/include/linux/cpuidle_scheduled.h
@@ -0,0 +1,21 @@
+/*
+ * cpuidle_scheduled.h - interface for the scheduled CPU idle governor
+ *
+ * Copyright 2013 Linaro Limited
+ *
+ * This code is licensed under the GPL version 2 as described
+ * in the COPYING file that accompanies the Linux Kernel.
+ */
+
+#ifndef _LINUX_CPUIDLE_SCHEDULED_H
+#define _LINUX_CPUIDLE_SCHEDULED_H
+
+#ifdef CONFIG_CPU_IDLE_GOV_SCHEDULED
+
+#include <linux/cpuidle.h>
+
+extern void cpuidle_scheduled_result(struct cpuidle_state *, unsigned int);
+
+#endif /* CONFIG_CPU_IDLE_GOV_SCHEDULED */
+
+#endif /* _LINUX_CPUIDLE_SCHEDULED_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d676aa297a9b..b7dca4e3d04f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -63,12 +63,16 @@ struct fs_struct;
struct perf_event_context;
struct blk_plug;
-/* This structure is used to share information and statistics with other
- * frameworks. It only shares wake up latency fro the moment but should be
- * extended with other usefull informations
+/*
+ * This structure is used to share information and statistics with other
+ * frameworks.
*/
struct sched_pm {
- atomic_t wake_latency; /* time to wake up the cpu */
+ atomic_t wake_latency; /* time to wake up the cpu */
+ int idle_max_latency;
+ unsigned int idle_time_until_timer;
+ unsigned int idle_length_estimate;
+ struct cpuidle_state *idle_current_state;
};
DECLARE_PER_CPU(struct sched_pm, sched_stat);
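
The new fields use sentinels for "no constraint" (UINT_MAX for the durations, INT_MAX for the latency, as initialized by scheduled_enable_device() in this series). A hypothetical helper, shown only to illustrate the convention:

/* Hypothetical; not part of this patch. */
static inline bool sched_pm_idle_constrained(int cpu)
{
	struct sched_pm *pm = &per_cpu(sched_stat, cpu);

	return pm->idle_max_latency != INT_MAX ||
	       pm->idle_length_estimate != UINT_MAX ||
	       pm->idle_time_until_timer != UINT_MAX;
}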
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 54adcf35f495..aca231fb9dfe 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -17,3 +17,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+obj-$(CONFIG_CPU_IDLE_GOV_SCHEDULED) += idle_coop.o
diff --git a/kernel/sched/idle_coop.c b/kernel/sched/idle_coop.c
new file mode 100644
index 000000000000..32c18520c122
--- /dev/null
+++ b/kernel/sched/idle_coop.c
@@ -0,0 +1,128 @@
+#include "sched.h"
+#include <linux/cpuidle_scheduled.h>
+#include <linux/pm_qos.h>
+#include <linux/time.h>
+#include <linux/ktime.h>
+/*
+ * Scheduler co-operation with cpuidle
+ *
+ */
+
+static DEFINE_PER_CPU(struct wakeup_predictor *, core_predictor);
+
+void setup_scheduled_cpuidle(struct rq *rq)
+{
+ /*
+ * This needs revisit, but the initial ideas are:
+ * 0) The basis of latency requirement is the current value set
+ * through the pm QoS framework. This includes sources such
+ * as user space processes that may have information about
+ * required system responsiveness.
+ * 1) Cores that don't handle interrupts don't care about QoS limits
+ * set for interrupt latency. (Not handled in this version!)
+ * Likewise for cores not involved in handling interactive
+ * processes that may have set a QoS request.
+ * 2) Cores that are involved in unthrottled I/O (e.g. file copy)
+ * do have improved performance with low latency. However,
+ * depending on exact hardware and user requirements, the
+ * improved performance might not be necessary or even productive.
+ * 3) Throttled I/O, e.g. MP3 playback, does not require low
+ * latency (unless the hardware audio buffer is really tiny),
+ * so we can safely sleep in deep states.
+ *
+ * Latency requirement should reflect all of the above and nothing
+ * else. In particular, the expected sleep duration or energy
+ * break-even point of a state should not affect maximum latency
+ * determination.
+ *
+ * Idle duration when there is no I/O should be based on history
+ * of idle periods, where we try to correlate time to next timer
+ * expiry to expected sleep duration. We do not try to seek any
+ * sort of patterns in idle periods when there is no I/O.
+ *
+ * Idle duration when there is I/O is likely to be defined by
+ * the speed of the mass storage device. Since this may differ by
+ * several orders of magnitude, we do try to find a pattern and
+ * use that as well.
+ *
+ * The number of processes sleeping on block device / filesystem
+ * I/O is quite accurately reported by rq->nr_iowait.
+ * Other kinds of I/O (e.g. audio) are not reflected there, as
+ * those drivers tend to use functions such as sleep_on(), which
+ * do not affect nr_iowait. Calls to io_schedule[_timeout]()
+ * happen from files in block/, drivers/block/, drivers/md,
+ * fs/ and mm/.
+ *
+ * NOTE: All of this could still be done from menu.c, but the
+ * intent is to improve this beyond what is possible from
+ * outside of the scheduler.
+ */
+ int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
+
+ unsigned int next_timer_us;
+ unsigned int predicted_us;
+ unsigned int repeating_us;
+ struct timespec t;
+ int nr_iowait = atomic_read(&rq->nr_iowait);
+ struct sched_pm *pmdata = this_cpu_ptr(&sched_stat);
+ struct wakeup_predictor *pred = __this_cpu_read(core_predictor);
+
+ if (unlikely(pred == NULL)) {
+ pred = create_wakeup_predictor();
+ __this_cpu_write(core_predictor, pred);
+ }
+
+ /* Zero latency makes all other considerations obsolete */
+ if (unlikely(latency_req == 0)) {
+ pmdata->idle_time_until_timer = 0;
+ pmdata->idle_length_estimate = 0;
+ pmdata->idle_max_latency = 0;
+ return;
+ }
+
+ /*
+ * For now, assume all I/O is noncritical unless user space
+ * has set a QoS restriction. If some sort of heuristics are
+ * to be added, adjust the latency requirement here.
+ *
+ * (We should also be able to distinguish drivers setting latency
+ * limits for interrupt handling, which should be honored only on
+ * the relevant core(s), from user space requests. Currently that
+ * is not possible.)
+ */
+
+ /* Determine time to next timer expiry */
+ t = ktime_to_timespec(tick_nohz_get_sleep_length());
+ next_timer_us = t.tv_sec * USEC_PER_SEC + t.tv_nsec / NSEC_PER_USEC;
+
+ /* Always predict by timer scaling */
+ predicted_us = predict_scaled_wakeup(pred, next_timer_us, nr_iowait);
+
+ /*
+ * In addition, if block I/O is pending, try to predict based on
+ * recurring events (I/O completion).
+ */
+ if (nr_iowait) {
+ repeating_us = predict_repeating_wakeup(pred);
+ if (repeating_us < predicted_us)
+ predicted_us = repeating_us;
+ }
+
+ pmdata->idle_time_until_timer = next_timer_us;
+ pmdata->idle_length_estimate = predicted_us;
+ pmdata->idle_max_latency = latency_req;
+}
+
+void cpuidle_scheduled_result(struct cpuidle_state *state,
+ unsigned int duration)
+{
+ struct wakeup_predictor *pred = __this_cpu_read(core_predictor);
+ struct sched_pm *pmdata = this_cpu_ptr(&sched_stat);
+
+ /* pred can be NULL after the initial sleep on non-boot cores */
+ if (unlikely(pred == NULL))
+ return;
+
+ wakeup_predictor_update(pred, pmdata->idle_time_until_timer,
+ 0, duration);
+}
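
Taken together, the pieces added by this series interact once per idle cycle, roughly as sketched below (the function names are the ones introduced above; the ordering is inferred from the patch):

/*
 *   post_schedule_idle(rq)
 *     -> setup_scheduled_cpuidle(rq)      publish constraints in sched_stat
 *   cpuidle idle loop
 *     -> scheduled_select(drv, dev)       read sched_stat, pick a state
 *        ... CPU sleeps, then wakes ...
 *     -> scheduled_reflect(dev, index)    measure the actual residency
 *          -> cpuidle_scheduled_result()  feed the wakeup predictor
 */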
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index d8da01008d39..e13cea79ebad 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -23,6 +23,7 @@ static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
static void post_schedule_idle(struct rq *rq)
{
idle_enter_fair(rq);
+ setup_scheduled_cpuidle(rq);
}
#endif /* CONFIG_SMP */
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d5a4ec0db08c..ad241bc13df7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1032,6 +1032,12 @@ static inline void update_packing_domain(int cpu) {};
extern void idle_enter_fair(struct rq *this_rq);
extern void idle_exit_fair(struct rq *this_rq);
+#ifdef CONFIG_CPU_IDLE_GOV_SCHEDULED
+extern void setup_scheduled_cpuidle(struct rq *rq);
+#else
+#define setup_scheduled_cpuidle(notused) do { } while (0)
+#endif
+
#else /* CONFIG_SMP */
static inline void idle_balance(int cpu, struct rq *rq)