diff options
-rw-r--r-- | drivers/cpuidle/Kconfig | 15 | ||||
-rw-r--r-- | drivers/cpuidle/governors/Makefile | 1 | ||||
-rw-r--r-- | drivers/cpuidle/governors/scheduled.c | 113 | ||||
-rw-r--r-- | drivers/cpuidle/governors/wakeup_predict.c | 7 | ||||
-rw-r--r-- | include/linux/cpuidle_scheduled.h | 21 | ||||
-rw-r--r-- | include/linux/sched.h | 12 | ||||
-rw-r--r-- | kernel/sched/Makefile | 1 | ||||
-rw-r--r-- | kernel/sched/idle_coop.c | 128 | ||||
-rw-r--r-- | kernel/sched/idle_task.c | 1 | ||||
-rw-r--r-- | kernel/sched/sched.h | 6 |
10 files changed, 300 insertions, 5 deletions
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig index c711697f1343..3ca755653589 100644 --- a/drivers/cpuidle/Kconfig +++ b/drivers/cpuidle/Kconfig @@ -4,7 +4,7 @@ config CPU_IDLE bool "CPU idle PM support" default y if ACPI || PPC_PSERIES select CPU_IDLE_GOV_LADDER if (!NO_HZ && !NO_HZ_IDLE) - select CPU_IDLE_GOV_MENU if (NO_HZ || NO_HZ_IDLE) + select CPU_IDLE_GOV_MENU if ((NO_HZ || NO_HZ_IDLE) && !CPU_IDLE_GOV_SCHEDULED) help CPU idle is a generic framework for supporting software-controlled idle processor power management. It includes modular cross-platform @@ -32,6 +32,19 @@ config CPU_IDLE_GOV_MENU select CPU_IDLE_GOV_SHARED_CSTATE_LOOKUP default y +if SMP +config CPU_IDLE_GOV_SCHEDULED + bool "Scheduled governor" + select CPU_IDLE_GOV_SHARED_PREDICTOR + select CPU_IDLE_GOV_SHARED_CSTATE_LOOKUP + default n + help + A governor that selects an idle state based on timing constraints + set by another part of the kernel. The state selection is recorded + in a variable visible to other parts of the kernel so all cores + may be considered when making scheduling decisions. 
+endif + config CPU_IDLE_GOV_SHARED_PREDICTOR def_bool n diff --git a/drivers/cpuidle/governors/Makefile b/drivers/cpuidle/governors/Makefile index 0e02d5c16720..1b3e2d6fd900 100644 --- a/drivers/cpuidle/governors/Makefile +++ b/drivers/cpuidle/governors/Makefile @@ -4,5 +4,6 @@ obj-$(CONFIG_CPU_IDLE_GOV_LADDER) += ladder.o obj-$(CONFIG_CPU_IDLE_GOV_MENU) += menu.o +obj-$(CONFIG_CPU_IDLE_GOV_SCHEDULED) += scheduled.o obj-$(CONFIG_CPU_IDLE_GOV_SHARED_PREDICTOR) += wakeup_predict.o obj-$(CONFIG_CPU_IDLE_GOV_SHARED_CSTATE_LOOKUP) += cstate_lookup.o diff --git a/drivers/cpuidle/governors/scheduled.c b/drivers/cpuidle/governors/scheduled.c new file mode 100644 index 000000000000..b597bd267932 --- /dev/null +++ b/drivers/cpuidle/governors/scheduled.c @@ -0,0 +1,113 @@ +/* + * scheduled.c - A governor that selects idle state based on external input + * + * Copyright 2013 Linaro Limited + * Author: + * Tuukka Tikkanen <tuukka.tikkanen@linaro.org> + * + * This code is licenced under the GPL version 2 as described + * in the COPYING file that accompanies the Linux Kernel. 
+ */ + +#include <linux/kernel.h> +#include <linux/cpuidle_scheduled.h> +#include <linux/cpuidle.h> +#include <linux/pm_qos.h> +#include <linux/module.h> +#include <linux/sched.h> + +/** + * scheduled_select - selects the next idle state to enter + * @drv: cpuidle driver containing state data + * @dev: the CPU + */ +static int scheduled_select(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{ + int state; + struct sched_pm *pmdata = this_cpu_ptr(&sched_stat); + + state = cpuidle_cstate_lookup(drv, dev, + pmdata->idle_time_until_timer, + pmdata->idle_length_estimate, + pmdata->idle_max_latency, + NULL); + pmdata->idle_current_state = &drv->states[state]; + + return state; +} + +/** + * scheduled_reflect - records the actual idle period length + * @dev: the CPU + * @index: the index of actual entered state + */ +static void scheduled_reflect(struct cpuidle_device *dev, int index) +{ + unsigned int last_idle_us; + struct sched_pm *pmdata = this_cpu_ptr(&sched_stat); + unsigned int timer_limit = pmdata->idle_time_until_timer; + struct cpuidle_state *state = pmdata->idle_current_state; + + if (unlikely(!(state->flags & CPUIDLE_FLAG_TIME_VALID))) { + last_idle_us = timer_limit; + } else { + last_idle_us = cpuidle_get_last_residency(dev); + if (last_idle_us > state->exit_latency) + last_idle_us -= state->exit_latency; + if (last_idle_us > timer_limit) + last_idle_us = timer_limit; + } + + cpuidle_scheduled_result(state, last_idle_us); + + pmdata->idle_current_state = NULL; +} + + +/** + * scheduled_enable_device - reset per cpu variables after hotplug + * @drv: cpuidle driver + * @dev: the CPU + */ +static int scheduled_enable_device(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{ + struct sched_pm *pmdata = &per_cpu(sched_stat, dev->cpu); + + pmdata->idle_time_until_timer = UINT_MAX; + pmdata->idle_length_estimate = UINT_MAX; + pmdata->idle_max_latency = INT_MAX; + pmdata->idle_current_state = NULL; + + return 0; +} + +static struct 
cpuidle_governor scheduled_governor = { + .name = "scheduled", + .rating = 100, + .enable = scheduled_enable_device, + .select = scheduled_select, + .reflect = scheduled_reflect, + .owner = THIS_MODULE, +}; + +/** + * init_scheduled_idle_gov - initializes the governor + */ +static int __init init_scheduled_idle_gov(void) +{ + return cpuidle_register_governor(&scheduled_governor); +} + +/** + * exit_scheduled_idle_gov - exits the governor + */ +static void __exit exit_scheduled_idle_gov(void) +{ + cpuidle_unregister_governor(&scheduled_governor); +} + +MODULE_LICENSE("GPL"); +module_init(init_scheduled_idle_gov); +module_exit(exit_scheduled_idle_gov); diff --git a/drivers/cpuidle/governors/wakeup_predict.c b/drivers/cpuidle/governors/wakeup_predict.c index 8b9872f9dc41..7f0417835c5c 100644 --- a/drivers/cpuidle/governors/wakeup_predict.c +++ b/drivers/cpuidle/governors/wakeup_predict.c @@ -256,6 +256,13 @@ void wakeup_predictor_update(struct wakeup_predictor *pred, pred->correction_factor[bucket] = new_factor; +/* + * Hack for testing: don't update pattern data if no I/O + * This needs a more proper fix if this turns out to be good. + */ +#ifdef CONFIG_CPU_IDLE_GOV_SCHEDULED + if (unlikely(io_pending)) +#endif /* update the repeating-pattern data */ pred->intervals[pred->interval_index++] = actual_us; if (pred->interval_index >= INTERVALS) { diff --git a/include/linux/cpuidle_scheduled.h b/include/linux/cpuidle_scheduled.h new file mode 100644 index 000000000000..a50ec6348d5d --- /dev/null +++ b/include/linux/cpuidle_scheduled.h @@ -0,0 +1,21 @@ +/* + * cpuidle_scheduled.h - interface for the scheduled CPU idle governor + * + * Copyright 2013 Linaro Limited + * + * This code is licenced under the GPL version 2 as described + * in the COPYING file that accompanies the Linux Kernel. 
+ */ + +#ifndef _LINUX_CPUIDLE_SCHEDULED_H +#define _LINUX_CPUIDLE_SCHEDULED_H + +#ifdef CONFIG_CPU_IDLE_GOV_SCHEDULED + +#include <linux/cpuidle.h> + +extern void cpuidle_scheduled_result(struct cpuidle_state *, unsigned int); + +#endif /* CONFIG_CPU_IDLE_GOV_SCHEDULED */ + +#endif /* _LINUX_CPUIDLE_SCHEDULED_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index d676aa297a9b..b7dca4e3d04f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -63,12 +63,16 @@ struct fs_struct; struct perf_event_context; struct blk_plug; -/* This structure is used to share information and statistics with other - * frameworks. It only shares wake up latency fro the moment but should be - * extended with other usefull informations +/* + * This structure is used to share information and statistics with other + * frameworks. */ struct sched_pm { - atomic_t wake_latency; /* time to wake up the cpu */ + atomic_t wake_latency; /* time to wake up the cpu */ + int idle_max_latency; + unsigned int idle_time_until_timer; + unsigned int idle_length_estimate; + struct cpuidle_state *idle_current_state; }; DECLARE_PER_CPU(struct sched_pm, sched_stat); diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 54adcf35f495..aca231fb9dfe 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -17,3 +17,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_IDLE_GOV_SCHEDULED) += idle_coop.o diff --git a/kernel/sched/idle_coop.c b/kernel/sched/idle_coop.c new file mode 100644 index 000000000000..32c18520c122 --- /dev/null +++ b/kernel/sched/idle_coop.c @@ -0,0 +1,128 @@ +#include "sched.h" +#include <linux/cpuidle_scheduled.h> +#include <linux/pm_qos.h> +#include <linux/time.h> +#include <linux/ktime.h> +/* + * Scheduler co-operation with cpuidle + * + */ + +static DEFINE_PER_CPU(struct wakeup_predictor *, core_predictor); + 
+void setup_scheduled_cpuidle(struct rq *rq) +{ + /* + * This needs revisit, but the initial ideas are: + * 0) The basis of latency requirement is the current value set + * through the pm QoS framework. This includes sources such + * as user space processes that may have information about + * required system responsiveness. + * 1) Cores that don't handle interrupts don't care about QoS limits + * set for interrupt latency. (Not handled in this version!) + * Likewise for cores not involved in handling interactive + * processes that may have set QoS request. + * 2) Cores that are involved in unthrottled I/O (e.g. file copy) + * do have improved performance with low latency. However, + * depending on exact hardware and user requirements, the + * improved performance might not be necessary or even productive. + * 3) Throttled I/O, e.g. MP3 playback, does not require low latency, + * unless the hardware audio buffer is really tiny and we can + * safely sleep in deep states. + * + * Latency requirement should reflect all of the above and nothing + * else. In particular, the expected sleep duration or energy + * break-even point of a state should not affect maximum latency + * determination. + * + * Idle duration when there is no I/O should be based on history + * of idle periods, where we try to correlate time to next timer + * expiry to expected sleep duration. We do not try to seek any + * sort of patterns in idle periods when there is no I/O. + * + * Idle duration when there is I/O is likely to be defined by + * the speed of mass media device. Since this may differ by + * several orders of magnitude, we do try to find a pattern and + * use that as well. + * + * The number of processes sleeping on block devices / filesystems + * I/O is quite accurately reported by rq->nr_iowait. + * Other kind of I/O (e.g. audio) is not reflected there, as + * those drivers tend to use functions such as sleep_on(), which + * do not affect nr_iowait. 
Calls to io_schedule[_timeout]() + * happen from files in block/, drivers/block/, drivers/md, + * fs/ and mm/. + * + * NOTE: All of this really could still be done from menu.c, but + * the intent is to improve this beyond the limits of being + * outside of the scheduler. + */ + int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY); + + unsigned int next_timer_us; + unsigned int predicted_us; + unsigned int repeating_us; + struct timespec t; + int nr_iowait = atomic_read(&rq->nr_iowait); + struct sched_pm *pmdata = this_cpu_ptr(&sched_stat); + struct wakeup_predictor *pred = __this_cpu_read(core_predictor); + + if (unlikely(pred == NULL)) { + pred = create_wakeup_predictor(); + __this_cpu_write(core_predictor, pred); + } + + /* Zero latency makes all other considerations obsolete */ + if (unlikely(latency_req == 0)) { + pmdata->idle_time_until_timer = 0; + pmdata->idle_length_estimate = 0; + pmdata->idle_max_latency = 0; + return; + } + + /* + * For now assume all I/O is noncritical unless user space + * did set a QoS restriction. If some sort of heuristics are + * to be added, adjust the latency requirement here. + * + * (We should also be able to distinguish between drivers setting + * latency limits for interrupt handling (and honor those only + * on relevant core(s)) and user space requests. Currently that + * is not possible.) + */ + + /* Determine time to next timer expiry */ + t = ktime_to_timespec(tick_nohz_get_sleep_length()); + next_timer_us = t.tv_sec * USEC_PER_SEC + t.tv_nsec / NSEC_PER_USEC; + + /* Always predict by timer scaling */ + predicted_us = predict_scaled_wakeup(pred, next_timer_us, nr_iowait); + + /* + * In addition, if block I/O is pending, try to predict based on + * recurring events (I/O completion). 
+ */ + if (nr_iowait) { + repeating_us = predict_repeating_wakeup(pred); + if (repeating_us < predicted_us) + predicted_us = repeating_us; + } + + pmdata->idle_time_until_timer = next_timer_us; + pmdata->idle_length_estimate = predicted_us; + pmdata->idle_max_latency = latency_req; +} + +void cpuidle_scheduled_result(struct cpuidle_state *state, + unsigned int duration) +{ + struct wakeup_predictor *pred = __this_cpu_read(core_predictor); + struct sched_pm *pmdata = this_cpu_ptr(&sched_stat); + + /* Pred can be NULL after the initial sleep for non-boot cores */ + if (unlikely(pred == NULL)) + return; + + wakeup_predictor_update(pred, pmdata->idle_time_until_timer, + 0, duration); +} diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index d8da01008d39..e13cea79ebad 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -23,6 +23,7 @@ static void pre_schedule_idle(struct rq *rq, struct task_struct *prev) static void post_schedule_idle(struct rq *rq) { idle_enter_fair(rq); + setup_scheduled_cpuidle(rq); } #endif /* CONFIG_SMP */ /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d5a4ec0db08c..ad241bc13df7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1032,6 +1032,12 @@ static inline void update_packing_domain(int cpu) {}; extern void idle_enter_fair(struct rq *this_rq); extern void idle_exit_fair(struct rq *this_rq); +#ifdef CONFIG_CPU_IDLE_GOV_SCHEDULED +extern void setup_scheduled_cpuidle(struct rq *rq); +#else +#define setup_scheduled_cpuidle(notused) do { } while (0) +#endif + #else /* CONFIG_SMP */ static inline void idle_balance(int cpu, struct rq *rq) |