Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 2016 |
1 file changed, 1960 insertions(+), 56 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d821f4985de5..87f7b29bb2e0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -32,6 +32,17 @@ #include <linux/task_work.h> #include <trace/events/sched.h> +#include <linux/sysfs.h> +#include <linux/vmalloc.h> +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE +/* Include cpufreq header to add a notifier so that cpu frequency + * scaling can track the current CPU frequency + */ +#include <linux/cpufreq.h> +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ +#ifdef CONFIG_SCHED_HMP +#include <linux/cpuidle.h> +#endif #include "sched.h" @@ -2289,6 +2300,93 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } +#ifdef CONFIG_SCHED_HMP +#define HMP_VARIABLE_SCALE_SHIFT 16ULL +struct hmp_global_attr { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, + struct attribute *attr, char *buf); + ssize_t (*store)(struct kobject *a, struct attribute *b, + const char *c, size_t count); + int *value; + int (*to_sysfs)(int); + int (*from_sysfs)(int); + ssize_t (*to_sysfs_text)(char *buf, int buf_size); +}; + +#define HMP_DATA_SYSFS_MAX 8 + +struct hmp_data_struct { +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + int freqinvar_load_scale_enabled; +#endif + int multiplier; /* used to scale the time delta */ + struct attribute_group attr_group; + struct attribute *attributes[HMP_DATA_SYSFS_MAX + 1]; + struct hmp_global_attr attr[HMP_DATA_SYSFS_MAX]; +} hmp_data; + +static u64 hmp_variable_scale_convert(u64 delta); + +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE +/* Frequency-Invariant Load Modification: + * Loads are calculated as in PJT's patch however we also scale the current + * contribution in line with the frequency of the CPU that the task was + * executed on. + * In this version, we use a simple linear scale derived from the maximum + * frequency reported by CPUFreq. As an example: + * + * Consider that we ran a task for 100% of the previous interval. + * + * Our CPU was under asynchronous frequency control through one of the + * CPUFreq governors. + * + * The CPUFreq governor reports that it is able to scale the CPU between + * 500MHz and 1GHz. + * + * During the period, the CPU was running at 1GHz. + * + * In this case, our load contribution for that period is calculated as + * 1 * (number_of_active_microseconds) + * + * This results in our task being able to accumulate maximum load as normal. + * + * + * Consider now that our CPU was executing at 500MHz. + * + * We now scale the load contribution such that it is calculated as + * 0.5 * (number_of_active_microseconds) + * + * Our task can only record 50% maximum load during this period. + * + * This represents the task consuming 50% of the CPU's *possible* compute + * capacity. However the task did consume 100% of the CPU's *available* + * compute capacity which is the value seen by the CPUFreq governor and + * user-side CPU Utilization tools. + * + * Restricting tracked load to be scaled by the CPU's frequency accurately + * represents the consumption of possible compute capacity and allows the + * HMP migration's simple threshold migration strategy to interact more + * predictably with CPUFreq's asynchronous compute capacity changes. + */ +#define SCHED_FREQSCALE_SHIFT 10 +struct cpufreq_extents { + u32 curr_scale; + u32 min; + u32 max; + u32 flags; +}; + +/* Flag set when the governor in use only allows one frequency. + * Disables scaling. 
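+ * The 'performance' and 'powersave' governors are recognised as
+ * single-frequency by cpufreq_policy_callback() below.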
+ */ +#define SCHED_LOAD_FREQINVAR_SINGLEFREQ 0x01 + +static struct cpufreq_extents freq_scale[CONFIG_NR_CPUS]; + +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ +#endif /* CONFIG_SCHED_HMP */ + /* * We can represent the historical contribution to runnable average as the * coefficients of a geometric series. To do this we sub-divide our runnable @@ -2319,13 +2417,24 @@ static u32 __compute_runnable_contrib(u64 n) */ static __always_inline int __update_entity_runnable_avg(u64 now, struct sched_avg *sa, - int runnable) + int runnable, + int running, + int cpu) { u64 delta, periods; u32 runnable_contrib; int delta_w, decayed = 0; +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + u64 scaled_delta; + u32 scaled_runnable_contrib; + int scaled_delta_w; + u32 curr_scale = 1024; +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ delta = now - sa->last_runnable_update; +#ifdef CONFIG_SCHED_HMP + delta = hmp_variable_scale_convert(delta); +#endif /* * This should only happen when time goes backwards, which it * unfortunately does during sched clock init when we swap over to TSC. @@ -2344,6 +2453,12 @@ static __always_inline int __update_entity_runnable_avg(u64 now, return 0; sa->last_runnable_update = now; +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + /* retrieve scale factor for load */ + if (hmp_data.freqinvar_load_scale_enabled) + curr_scale = freq_scale[cpu].curr_scale; +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ + /* delta_w is the amount already accumulated against our next period */ delta_w = sa->runnable_avg_period % 1024; if (delta + delta_w >= 1024) { @@ -2356,8 +2471,20 @@ static __always_inline int __update_entity_runnable_avg(u64 now, * period and accrue it. */ delta_w = 1024 - delta_w; + /* scale runnable time if necessary */ +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + scaled_delta_w = (delta_w * curr_scale) + >> SCHED_FREQSCALE_SHIFT; + if (runnable) + sa->runnable_avg_sum += scaled_delta_w; + if (running) + sa->usage_avg_sum += scaled_delta_w; +#else if (runnable) sa->runnable_avg_sum += delta_w; + if (running) + sa->usage_avg_sum += delta_w; +#endif /* #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ sa->runnable_avg_period += delta_w; delta -= delta_w; @@ -2365,22 +2492,51 @@ static __always_inline int __update_entity_runnable_avg(u64 now, /* Figure out how many additional periods this update spans */ periods = delta / 1024; delta %= 1024; + /* decay the load we have accumulated so far */ sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, periods + 1); sa->runnable_avg_period = decay_load(sa->runnable_avg_period, periods + 1); + sa->usage_avg_sum = decay_load(sa->usage_avg_sum, periods + 1); + /* add the contribution from this period */ /* Efficiently calculate \sum (1..n_period) 1024*y^i */ runnable_contrib = __compute_runnable_contrib(periods); + /* Apply load scaling if necessary. 
+ * Note that multiplying the whole series is same as + * multiplying all terms + */ +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + scaled_runnable_contrib = (runnable_contrib * curr_scale) + >> SCHED_FREQSCALE_SHIFT; + if (runnable) + sa->runnable_avg_sum += scaled_runnable_contrib; + if (running) + sa->usage_avg_sum += scaled_runnable_contrib; +#else if (runnable) sa->runnable_avg_sum += runnable_contrib; + if (running) + sa->usage_avg_sum += runnable_contrib; +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ sa->runnable_avg_period += runnable_contrib; } /* Remainder of delta accrued against u_0` */ + /* scale if necessary */ +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + scaled_delta = ((delta * curr_scale) >> SCHED_FREQSCALE_SHIFT); + if (runnable) + sa->runnable_avg_sum += scaled_delta; + if (running) + sa->usage_avg_sum += scaled_delta; +#else if (runnable) sa->runnable_avg_sum += delta; + if (running) + sa->usage_avg_sum += delta; +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ sa->runnable_avg_period += delta; return decayed; @@ -2393,10 +2549,8 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se) u64 decays = atomic64_read(&cfs_rq->decay_counter); decays -= se->avg.decay_count; - if (!decays) - return 0; - - se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); + if (decays) + se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); se->avg.decay_count = 0; return decays; @@ -2429,16 +2583,28 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa, struct cfs_rq *cfs_rq) { struct task_group *tg = cfs_rq->tg; - long contrib; + long contrib, usage_contrib; /* The fraction of a cpu used by this cfs_rq */ contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, sa->runnable_avg_period + 1); contrib -= cfs_rq->tg_runnable_contrib; - if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { + usage_contrib = div_u64(sa->usage_avg_sum << NICE_0_SHIFT, + sa->runnable_avg_period + 1); + usage_contrib -= cfs_rq->tg_usage_contrib; + + /* + * contrib/usage at this point represent deltas, only update if they + * are substantive. 
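+ * A delta is ignored while both stay within 1/64th (~1.5%) of their
+ * accumulated contributions, which avoids needless updates of the
+ * shared task-group atomics.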
+ */ + if ((abs(contrib) > cfs_rq->tg_runnable_contrib / 64) || + (abs(usage_contrib) > cfs_rq->tg_usage_contrib / 64)) { atomic_add(contrib, &tg->runnable_avg); cfs_rq->tg_runnable_contrib += contrib; + + atomic_add(usage_contrib, &tg->usage_avg); + cfs_rq->tg_usage_contrib += usage_contrib; } } @@ -2486,8 +2652,17 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) static inline void update_rq_runnable_avg(struct rq *rq, int runnable) { - __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); + int cpu = -1; /* not used in normal case */ + +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + cpu = rq->cpu; +#endif + __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable, + runnable, cpu); __update_tg_runnable_avg(&rq->avg, &rq->cfs); + trace_sched_rq_runnable_ratio(cpu_of(rq), rq->avg.load_avg_ratio); + trace_sched_rq_runnable_load(cpu_of(rq), rq->cfs.runnable_load_avg); + trace_sched_rq_nr_running(cpu_of(rq), rq->nr_running, rq->nr_iowait.counter); } #else /* CONFIG_FAIR_GROUP_SCHED */ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, @@ -2506,12 +2681,18 @@ static inline void __update_task_entity_contrib(struct sched_entity *se) contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); contrib /= (se->avg.runnable_avg_period + 1); se->avg.load_avg_contrib = scale_load(contrib); + trace_sched_task_load_contrib(task_of(se), se->avg.load_avg_contrib); + contrib = se->avg.runnable_avg_sum * scale_load_down(NICE_0_LOAD); + contrib /= (se->avg.runnable_avg_period + 1); + se->avg.load_avg_ratio = scale_load(contrib); + trace_sched_task_runnable_ratio(task_of(se), se->avg.load_avg_ratio); } /* Compute the current contribution to load_avg by se, return any delta */ -static long __update_entity_load_avg_contrib(struct sched_entity *se) +static long __update_entity_load_avg_contrib(struct sched_entity *se, long *ratio) { long old_contrib = se->avg.load_avg_contrib; + long old_ratio = se->avg.load_avg_ratio; if (entity_is_task(se)) { __update_task_entity_contrib(se); @@ -2520,6 +2701,8 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se) __update_group_entity_contrib(se); } + if (ratio) + *ratio = se->avg.load_avg_ratio - old_ratio; return se->avg.load_avg_contrib - old_contrib; } @@ -2539,9 +2722,13 @@ static inline void update_entity_load_avg(struct sched_entity *se, int update_cfs_rq) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - long contrib_delta; + long contrib_delta, ratio_delta; u64 now; + int cpu = -1; /* not used in normal case */ +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + cpu = cfs_rq->rq->cpu; +#endif /* * For a group entity we need to use their owned cfs_rq_clock_task() in * case they are the parent of a throttled hierarchy. 
@@ -2551,18 +2738,23 @@ static inline void update_entity_load_avg(struct sched_entity *se,
 	else
 		now = cfs_rq_clock_task(group_cfs_rq(se));
 
-	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
+	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq,
+			cfs_rq->curr == se, cpu))
 		return;
 
-	contrib_delta = __update_entity_load_avg_contrib(se);
+	contrib_delta = __update_entity_load_avg_contrib(se, &ratio_delta);
 
 	if (!update_cfs_rq)
 		return;
 
-	if (se->on_rq)
+	if (se->on_rq) {
 		cfs_rq->runnable_load_avg += contrib_delta;
-	else
+#ifdef CONFIG_SCHED_HMP
+		rq_of(cfs_rq)->avg.load_avg_ratio += ratio_delta;
+#endif /* CONFIG_SCHED_HMP */
+	} else {
 		subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+	}
 }
 
 /*
@@ -2637,6 +2829,10 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
 	}
 
 	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+#ifdef CONFIG_SCHED_HMP
+	rq_of(cfs_rq)->avg.load_avg_ratio += se->avg.load_avg_ratio;
+#endif /* CONFIG_SCHED_HMP */
+
 	/* we force update consideration on load-balancer moves */
 	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
 }
@@ -2655,6 +2851,10 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
 	update_cfs_rq_blocked_load(cfs_rq, !sleep);
 
 	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
+#ifdef CONFIG_SCHED_HMP
+	rq_of(cfs_rq)->avg.load_avg_ratio -= se->avg.load_avg_ratio;
+#endif /* CONFIG_SCHED_HMP */
+
 	if (sleep) {
 		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
 		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
@@ -2993,6 +3193,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		 */
 		update_stats_wait_end(cfs_rq, se);
 		__dequeue_entity(cfs_rq, se);
+		update_entity_load_avg(se, 1);
 	}
 
 	update_stats_curr_start(cfs_rq, se);
@@ -4529,6 +4730,844 @@ done:
 	return target;
 }
 
+#ifdef CONFIG_SCHED_HMP
+/*
+ * Heterogeneous multiprocessor (HMP) optimizations
+ *
+ * The cpu types are distinguished using a list of hmp_domains
+ * which each represent one cpu type using a cpumask.
+ * The list is assumed ordered by compute capacity with the
+ * fastest domain first.
+ */
+DEFINE_PER_CPU(struct hmp_domain *, hmp_cpu_domain);
+static const int hmp_max_tasks = 5;
+
+extern void __init arch_get_hmp_domains(struct list_head *hmp_domains_list);
+
+#ifdef CONFIG_CPU_IDLE
+/*
+ * hmp_idle_pull:
+ *
+ * In this version we have stopped using forced up migrations when we
+ * detect that a task running on a little CPU should be moved to a bigger
+ * CPU. In most cases, the bigger CPU is in a deep sleep state and a forced
+ * migration means we stop the task immediately but need to wait for the
+ * target CPU to wake up before we can restart the task which is being
+ * moved. Instead, we now wake a big CPU with an IPI and ask it to pull
+ * a task when ready. This allows the task to continue executing on its
+ * current CPU, reducing the amount of time that the task is stalled for.
+ *
+ * keepalive timers:
+ *
+ * The keepalive timer is used as a way to keep a CPU engaged in an
+ * idle pull operation out of idle while waiting for the source
+ * CPU to stop and move the task. Ideally this would not be necessary
+ * and we could impose a temporary zero-latency requirement on the
+ * current CPU, but in the current QoS framework this will result in
+ * all CPUs in the system being unable to enter idle states which is
+ * not desirable. The timer does not perform any work when it expires.
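+ * Arming it is enough: the expiry is set just below the target
+ * residency of the shallowest idle state whose exit latency we cannot
+ * tolerate, so the idle governor will not select that state or deeper.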
+ */ +struct hmp_keepalive { + bool init; + ktime_t delay; /* if zero, no need for timer */ + struct hrtimer timer; +}; +DEFINE_PER_CPU(struct hmp_keepalive, hmp_cpu_keepalive); + +/* setup per-cpu keepalive timers */ +static enum hrtimer_restart hmp_cpu_keepalive_notify(struct hrtimer *hrtimer) +{ + return HRTIMER_NORESTART; +} + +/* + * Work out if any of the idle states have an exit latency too high for us. + * ns_delay is passed in containing the max we are willing to tolerate. + * If there are none, set ns_delay to zero. + * If there are any, set ns_delay to + * ('target_residency of state with shortest too-big latency' - 1) * 1000. + */ +static void hmp_keepalive_delay(int cpu, unsigned int *ns_delay) +{ + struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu); + struct cpuidle_driver *drv; + + drv = cpuidle_get_cpu_driver(dev); + if (drv) { + unsigned int us_delay = UINT_MAX; + unsigned int us_max_delay = *ns_delay / 1000; + int idx; + /* if cpuidle states are guaranteed to be sorted we + * could stop at the first match. + */ + for (idx = 0; idx < drv->state_count; idx++) { + if (drv->states[idx].exit_latency > us_max_delay && + drv->states[idx].target_residency < us_delay) { + us_delay = drv->states[idx].target_residency; + } + } + if (us_delay == UINT_MAX) + *ns_delay = 0; /* no timer required */ + else + *ns_delay = 1000 * (us_delay - 1); + } +} + +static void hmp_cpu_keepalive_trigger(void) +{ + int cpu = smp_processor_id(); + struct hmp_keepalive *keepalive = &per_cpu(hmp_cpu_keepalive, cpu); + if (!keepalive->init) { + unsigned int ns_delay = 100000; /* tolerate 100usec delay */ + + hrtimer_init(&keepalive->timer, + CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); + keepalive->timer.function = hmp_cpu_keepalive_notify; + + hmp_keepalive_delay(cpu, &ns_delay); + keepalive->delay = ns_to_ktime(ns_delay); + keepalive->init = true; + } + if (ktime_to_ns(keepalive->delay)) + __hrtimer_start_range_ns(&keepalive->timer, + keepalive->delay, 0, HRTIMER_MODE_REL_PINNED, 0); +} + +static void hmp_cpu_keepalive_cancel(int cpu) +{ + struct hmp_keepalive *keepalive = &per_cpu(hmp_cpu_keepalive, cpu); + if (keepalive->init) + hrtimer_cancel(&keepalive->timer); +} +#else /* !CONFIG_CPU_IDLE */ +static void hmp_cpu_keepalive_trigger(void) +{ +} + +static void hmp_cpu_keepalive_cancel(int cpu) +{ +} +#endif + +/* Setup hmp_domains */ +static int __init hmp_cpu_mask_setup(void) +{ + char buf[64]; + struct hmp_domain *domain; + struct list_head *pos; + int dc, cpu; + + pr_debug("Initializing HMP scheduler:\n"); + + /* Initialize hmp_domains using platform code */ + arch_get_hmp_domains(&hmp_domains); + if (list_empty(&hmp_domains)) { + pr_debug("HMP domain list is empty!\n"); + return 0; + } + + /* Print hmp_domains */ + dc = 0; + list_for_each(pos, &hmp_domains) { + domain = list_entry(pos, struct hmp_domain, hmp_domains); + cpulist_scnprintf(buf, 64, &domain->possible_cpus); + pr_debug(" HMP domain %d: %s\n", dc, buf); + + for_each_cpu_mask(cpu, domain->possible_cpus) { + per_cpu(hmp_cpu_domain, cpu) = domain; + } + dc++; + } + + return 1; +} + +static struct hmp_domain *hmp_get_hmp_domain_for_cpu(int cpu) +{ + struct hmp_domain *domain; + struct list_head *pos; + + list_for_each(pos, &hmp_domains) { + domain = list_entry(pos, struct hmp_domain, hmp_domains); + if(cpumask_test_cpu(cpu, &domain->possible_cpus)) + return domain; + } + return NULL; +} + +static void hmp_online_cpu(int cpu) +{ + struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu); + + if(domain) + cpumask_set_cpu(cpu, 
&domain->cpus); +} + +static void hmp_offline_cpu(int cpu) +{ + struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu); + + if(domain) + cpumask_clear_cpu(cpu, &domain->cpus); + + hmp_cpu_keepalive_cancel(cpu); +} + +/* + * Needed to determine heaviest tasks etc. + */ +static inline unsigned int hmp_cpu_is_fastest(int cpu); +static inline unsigned int hmp_cpu_is_slowest(int cpu); +static inline struct hmp_domain *hmp_slower_domain(int cpu); +static inline struct hmp_domain *hmp_faster_domain(int cpu); + +/* must hold runqueue lock for queue se is currently on */ +static struct sched_entity *hmp_get_heaviest_task( + struct sched_entity *se, int target_cpu) +{ + int num_tasks = hmp_max_tasks; + struct sched_entity *max_se = se; + unsigned long int max_ratio = se->avg.load_avg_ratio; + const struct cpumask *hmp_target_mask = NULL; + struct hmp_domain *hmp; + + if (hmp_cpu_is_fastest(cpu_of(se->cfs_rq->rq))) + return max_se; + + hmp = hmp_faster_domain(cpu_of(se->cfs_rq->rq)); + hmp_target_mask = &hmp->cpus; + if (target_cpu >= 0) { + /* idle_balance gets run on a CPU while + * it is in the middle of being hotplugged + * out. Bail early in that case. + */ + if(!cpumask_test_cpu(target_cpu, hmp_target_mask)) + return NULL; + hmp_target_mask = cpumask_of(target_cpu); + } + /* The currently running task is not on the runqueue */ + se = __pick_first_entity(cfs_rq_of(se)); + + while (num_tasks && se) { + if (entity_is_task(se) && + se->avg.load_avg_ratio > max_ratio && + cpumask_intersects(hmp_target_mask, + tsk_cpus_allowed(task_of(se)))) { + max_se = se; + max_ratio = se->avg.load_avg_ratio; + } + se = __pick_next_entity(se); + num_tasks--; + } + return max_se; +} + +static struct sched_entity *hmp_get_lightest_task( + struct sched_entity *se, int migrate_down) +{ + int num_tasks = hmp_max_tasks; + struct sched_entity *min_se = se; + unsigned long int min_ratio = se->avg.load_avg_ratio; + const struct cpumask *hmp_target_mask = NULL; + + if (migrate_down) { + struct hmp_domain *hmp; + if (hmp_cpu_is_slowest(cpu_of(se->cfs_rq->rq))) + return min_se; + hmp = hmp_slower_domain(cpu_of(se->cfs_rq->rq)); + hmp_target_mask = &hmp->cpus; + } + /* The currently running task is not on the runqueue */ + se = __pick_first_entity(cfs_rq_of(se)); + + while (num_tasks && se) { + if (entity_is_task(se) && + (se->avg.load_avg_ratio < min_ratio && + hmp_target_mask && + cpumask_intersects(hmp_target_mask, + tsk_cpus_allowed(task_of(se))))) { + min_se = se; + min_ratio = se->avg.load_avg_ratio; + } + se = __pick_next_entity(se); + num_tasks--; + } + return min_se; +} + +/* + * Migration thresholds should be in the range [0..1023] + * hmp_up_threshold: min. load required for migrating tasks to a faster cpu + * hmp_down_threshold: max. load allowed for tasks migrating to a slower cpu + * + * hmp_up_prio: Only up migrate task with high priority (<hmp_up_prio) + * hmp_next_up_threshold: Delay before next up migration (1024 ~= 1 ms) + * hmp_next_down_threshold: Delay before next down migration (1024 ~= 1 ms) + * + * Small Task Packing: + * We can choose to fill the littlest CPUs in an HMP system rather than + * the typical spreading mechanic. This behavior is controllable using + * two variables. 
+ * hmp_packing_enabled: runtime control over pack/spread + * hmp_full_threshold: Consider a CPU with this much unweighted load full + */ +unsigned int hmp_up_threshold = 700; +unsigned int hmp_down_threshold = 512; +#ifdef CONFIG_SCHED_HMP_PRIO_FILTER +unsigned int hmp_up_prio = NICE_TO_PRIO(CONFIG_SCHED_HMP_PRIO_FILTER_VAL); +#endif +unsigned int hmp_next_up_threshold = 4096; +unsigned int hmp_next_down_threshold = 4096; + +#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING +/* + * Set the default packing threshold to try to keep little + * CPUs at no more than 80% of their maximum frequency if only + * packing a small number of small tasks. Bigger tasks will + * raise frequency as normal. + * In order to pack a task onto a CPU, the sum of the + * unweighted runnable_avg load of existing tasks plus the + * load of the new task must be less than hmp_full_threshold. + * + * This works in conjunction with frequency-invariant load + * and DVFS governors. Since most DVFS governors aim for 80% + * utilisation, we arrive at (0.8*0.8*(max_load=1024))=655 + * and use a value slightly lower to give a little headroom + * in the decision. + * Note that the most efficient frequency is different for + * each system so /sys/kernel/hmp/packing_limit should be + * configured at runtime for any given platform to achieve + * optimal energy usage. Some systems may not benefit from + * packing, so this feature can also be disabled at runtime + * with /sys/kernel/hmp/packing_enable + */ +unsigned int hmp_packing_enabled = 1; +unsigned int hmp_full_threshold = 650; +#endif + +static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se); +static unsigned int hmp_down_migration(int cpu, struct sched_entity *se); +static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd, + int *min_cpu, struct cpumask *affinity); + +static inline struct hmp_domain *hmp_smallest_domain(void) +{ + return list_entry(hmp_domains.prev, struct hmp_domain, hmp_domains); +} + +/* Check if cpu is in fastest hmp_domain */ +static inline unsigned int hmp_cpu_is_fastest(int cpu) +{ + struct list_head *pos; + + pos = &hmp_cpu_domain(cpu)->hmp_domains; + return pos == hmp_domains.next; +} + +/* Check if cpu is in slowest hmp_domain */ +static inline unsigned int hmp_cpu_is_slowest(int cpu) +{ + struct list_head *pos; + + pos = &hmp_cpu_domain(cpu)->hmp_domains; + return list_is_last(pos, &hmp_domains); +} + +/* Next (slower) hmp_domain relative to cpu */ +static inline struct hmp_domain *hmp_slower_domain(int cpu) +{ + struct list_head *pos; + + pos = &hmp_cpu_domain(cpu)->hmp_domains; + return list_entry(pos->next, struct hmp_domain, hmp_domains); +} + +/* Previous (faster) hmp_domain relative to cpu */ +static inline struct hmp_domain *hmp_faster_domain(int cpu) +{ + struct list_head *pos; + + pos = &hmp_cpu_domain(cpu)->hmp_domains; + return list_entry(pos->prev, struct hmp_domain, hmp_domains); +} + +/* + * Selects a cpu in previous (faster) hmp_domain + */ +static inline unsigned int hmp_select_faster_cpu(struct task_struct *tsk, + int cpu) +{ + int lowest_cpu=NR_CPUS; + __always_unused int lowest_ratio; + struct hmp_domain *hmp; + + if (hmp_cpu_is_fastest(cpu)) + hmp = hmp_cpu_domain(cpu); + else + hmp = hmp_faster_domain(cpu); + + lowest_ratio = hmp_domain_min_load(hmp, &lowest_cpu, + tsk_cpus_allowed(tsk)); + + return lowest_cpu; +} + +/* + * Selects a cpu in next (slower) hmp_domain + * Note that cpumask_any_and() returns the first cpu in the cpumask + */ +static inline unsigned int 
hmp_select_slower_cpu(struct task_struct *tsk,
+		int cpu)
+{
+	int lowest_cpu = NR_CPUS;
+	struct hmp_domain *hmp;
+	__always_unused int lowest_ratio;
+
+	if (hmp_cpu_is_slowest(cpu))
+		hmp = hmp_cpu_domain(cpu);
+	else
+		hmp = hmp_slower_domain(cpu);
+
+	lowest_ratio = hmp_domain_min_load(hmp, &lowest_cpu,
+			tsk_cpus_allowed(tsk));
+
+	return lowest_cpu;
+}
+
+#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
+/*
+ * Select the 'best' candidate little CPU to wake up on.
+ * Implements a packing strategy which examines CPUs in
+ * logical CPU order, and selects the first which will
+ * be loaded less than hmp_full_threshold according to
+ * the sum of the tracked load of the runqueue and the task.
+ */
+static inline unsigned int hmp_best_little_cpu(struct task_struct *tsk,
+		int cpu) {
+	int tmp_cpu;
+	unsigned long estimated_load;
+	struct hmp_domain *hmp;
+	struct sched_avg *avg;
+	struct cpumask allowed_hmp_cpus;
+
+	if (!hmp_packing_enabled ||
+			tsk->se.avg.load_avg_ratio > ((NICE_0_LOAD * 90)/100))
+		return hmp_select_slower_cpu(tsk, cpu);
+
+	if (hmp_cpu_is_slowest(cpu))
+		hmp = hmp_cpu_domain(cpu);
+	else
+		hmp = hmp_slower_domain(cpu);
+
+	/* respect affinity */
+	cpumask_and(&allowed_hmp_cpus, &hmp->cpus,
+			tsk_cpus_allowed(tsk));
+
+	for_each_cpu_mask(tmp_cpu, allowed_hmp_cpus) {
+		avg = &cpu_rq(tmp_cpu)->avg;
+		/* estimate new rq load if we add this task */
+		estimated_load = avg->load_avg_ratio +
+				tsk->se.avg.load_avg_ratio;
+		if (estimated_load <= hmp_full_threshold) {
+			cpu = tmp_cpu;
+			break;
+		}
+	}
+	/* if no match was found, the task uses the initial value */
+	return cpu;
+}
+#endif
+
+static inline void hmp_next_up_delay(struct sched_entity *se, int cpu)
+{
+	/* hack - always use clock from first online CPU */
+	u64 now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task;
+	se->avg.hmp_last_up_migration = now;
+	se->avg.hmp_last_down_migration = 0;
+	cpu_rq(cpu)->avg.hmp_last_up_migration = now;
+	cpu_rq(cpu)->avg.hmp_last_down_migration = 0;
+}
+
+static inline void hmp_next_down_delay(struct sched_entity *se, int cpu)
+{
+	/* hack - always use clock from first online CPU */
+	u64 now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task;
+	se->avg.hmp_last_down_migration = now;
+	se->avg.hmp_last_up_migration = 0;
+	cpu_rq(cpu)->avg.hmp_last_down_migration = now;
+	cpu_rq(cpu)->avg.hmp_last_up_migration = 0;
+}
+
+/*
+ * Heterogeneous multiprocessor (HMP) optimizations
+ *
+ * These functions allow changing the growth rate of the load_avg_ratio;
+ * by default it goes from 0 to 0.5 in LOAD_AVG_PERIOD = 32ms.
+ * This can now be changed with /sys/kernel/hmp/load_avg_period_ms.
+ *
+ * These functions also allow changing the up and down thresholds of HMP
+ * using /sys/kernel/hmp/{up,down}_threshold.
+ * Both must be between 0 and 1023. The threshold that is compared
+ * to the load_avg_ratio is up_threshold/1024 and down_threshold/1024.
+ *
+ * For instance, if load_avg_period = 64 and up_threshold = 512, an idle
+ * task with a load of 0 will reach the threshold after 64ms of busy looping.
+ *
+ * Changing load_avg_period_ms has the same effect as changing the
+ * default scaling factor Y=1002/1024 in the load_avg_ratio computation to
+ * (1002/1024.0)^(LOAD_AVG_PERIOD/load_avg_period_ms), but the latter
+ * could trigger overflows.
+ * For instance, with Y = 1023/1024 in __update_task_entity_contrib()
+ * "contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);"
+ * could overflow for a weight > 2^12 even if the load_avg_contrib
+ * should still be a 32-bit result. This would not happen by multiplying
+ * delta time by 1/22 and setting load_avg_period_ms = 706.
+ */
+
+/*
+ * Scaling the delta time ends up increasing or decreasing the
+ * growth rate of the per-entity load_avg_ratio.
+ * The scale factor hmp_data.multiplier is a fixed-point
+ * number: (32-HMP_VARIABLE_SCALE_SHIFT).HMP_VARIABLE_SCALE_SHIFT
+ */
+static inline u64 hmp_variable_scale_convert(u64 delta)
+{
+#ifdef CONFIG_HMP_VARIABLE_SCALE
+	u64 high = delta >> 32ULL;
+	u64 low = delta & 0xffffffffULL;
+	low *= hmp_data.multiplier;
+	high *= hmp_data.multiplier;
+	return (low >> HMP_VARIABLE_SCALE_SHIFT)
+			+ (high << (32ULL - HMP_VARIABLE_SCALE_SHIFT));
+#else
+	return delta;
+#endif
+}
+
+static ssize_t hmp_show(struct kobject *kobj,
+		struct attribute *attr, char *buf)
+{
+	struct hmp_global_attr *hmp_attr =
+		container_of(attr, struct hmp_global_attr, attr);
+	int temp;
+
+	if (hmp_attr->to_sysfs_text != NULL)
+		return hmp_attr->to_sysfs_text(buf, PAGE_SIZE);
+
+	temp = *(hmp_attr->value);
+	if (hmp_attr->to_sysfs != NULL)
+		temp = hmp_attr->to_sysfs(temp);
+
+	return (ssize_t)sprintf(buf, "%d\n", temp);
+}
+
+static ssize_t hmp_store(struct kobject *a, struct attribute *attr,
+		const char *buf, size_t count)
+{
+	int temp;
+	ssize_t ret = count;
+	struct hmp_global_attr *hmp_attr =
+		container_of(attr, struct hmp_global_attr, attr);
+	char *str = vmalloc(count + 1);
+	if (str == NULL)
+		return -ENOMEM;
+	memcpy(str, buf, count);
+	str[count] = 0;
+	if (sscanf(str, "%d", &temp) < 1)
+		ret = -EINVAL;
+	else {
+		if (hmp_attr->from_sysfs != NULL)
+			temp = hmp_attr->from_sysfs(temp);
+		if (temp < 0)
+			ret = -EINVAL;
+		else
+			*(hmp_attr->value) = temp;
+	}
+	vfree(str);
+	return ret;
+}
+
+static ssize_t hmp_print_domains(char *outbuf, int outbufsize)
+{
+	char buf[64];
+	const char nospace[] = "%s", space[] = " %s";
+	const char *fmt = nospace;
+	struct hmp_domain *domain;
+	struct list_head *pos;
+	int outpos = 0;
+	list_for_each(pos, &hmp_domains) {
+		domain = list_entry(pos, struct hmp_domain, hmp_domains);
+		if (cpumask_scnprintf(buf, 64, &domain->possible_cpus)) {
+			outpos += sprintf(outbuf+outpos, fmt, buf);
+			fmt = space;
+		}
+	}
+	strcat(outbuf, "\n");
+	return outpos+1;
+}
+
+#ifdef CONFIG_HMP_VARIABLE_SCALE
+static int hmp_period_tofrom_sysfs(int value)
+{
+	return (LOAD_AVG_PERIOD << HMP_VARIABLE_SCALE_SHIFT) / value;
+}
+#endif
+
+/* max value for threshold is 1024 */
+static int hmp_theshold_from_sysfs(int value)
+{
+	if (value > 1024)
+		return -1;
+	return value;
+}
+
+#if defined(CONFIG_SCHED_HMP_LITTLE_PACKING) || \
+	defined(CONFIG_HMP_FREQUENCY_INVARIANT_SCALE)
+/* toggle control is only 0,1 off/on */
+static int hmp_toggle_from_sysfs(int value)
+{
+	if (value < 0 || value > 1)
+		return -1;
+	return value;
+}
+#endif
+
+#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
+/* packing value must be non-negative */
+static int hmp_packing_from_sysfs(int value)
+{
+	if (value < 0)
+		return -1;
+	return value;
+}
+#endif
+
+static void hmp_attr_add(
+	const char *name,
+	int *value,
+	int (*to_sysfs)(int),
+	int (*from_sysfs)(int),
+	ssize_t (*to_sysfs_text)(char *, int),
+	umode_t mode)
+{
+	int i = 0;
+	while (hmp_data.attributes[i] != NULL) {
+		i++;
+		if (i >= HMP_DATA_SYSFS_MAX)
+			return;
+	}
+	if (mode)
+
hmp_data.attr[i].attr.mode = mode; + else + hmp_data.attr[i].attr.mode = 0644; + hmp_data.attr[i].show = hmp_show; + hmp_data.attr[i].store = hmp_store; + hmp_data.attr[i].attr.name = name; + hmp_data.attr[i].value = value; + hmp_data.attr[i].to_sysfs = to_sysfs; + hmp_data.attr[i].from_sysfs = from_sysfs; + hmp_data.attr[i].to_sysfs_text = to_sysfs_text; + hmp_data.attributes[i] = &hmp_data.attr[i].attr; + hmp_data.attributes[i + 1] = NULL; +} + +static int hmp_attr_init(void) +{ + int ret; + memset(&hmp_data, 0, sizeof(hmp_data)); + hmp_attr_add("hmp_domains", + NULL, + NULL, + NULL, + hmp_print_domains, + 0444); + hmp_attr_add("up_threshold", + &hmp_up_threshold, + NULL, + hmp_theshold_from_sysfs, + NULL, + 0); + hmp_attr_add("down_threshold", + &hmp_down_threshold, + NULL, + hmp_theshold_from_sysfs, + NULL, + 0); +#ifdef CONFIG_HMP_VARIABLE_SCALE + /* by default load_avg_period_ms == LOAD_AVG_PERIOD + * meaning no change + */ + hmp_data.multiplier = hmp_period_tofrom_sysfs(LOAD_AVG_PERIOD); + hmp_attr_add("load_avg_period_ms", + &hmp_data.multiplier, + hmp_period_tofrom_sysfs, + hmp_period_tofrom_sysfs, + NULL, + 0); +#endif +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + /* default frequency-invariant scaling ON */ + hmp_data.freqinvar_load_scale_enabled = 1; + hmp_attr_add("frequency_invariant_load_scale", + &hmp_data.freqinvar_load_scale_enabled, + NULL, + hmp_toggle_from_sysfs, + NULL, + 0); +#endif +#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING + hmp_attr_add("packing_enable", + &hmp_packing_enabled, + NULL, + hmp_toggle_from_sysfs, + NULL, + 0); + hmp_attr_add("packing_limit", + &hmp_full_threshold, + NULL, + hmp_packing_from_sysfs, + NULL, + 0); +#endif + hmp_data.attr_group.name = "hmp"; + hmp_data.attr_group.attrs = hmp_data.attributes; + ret = sysfs_create_group(kernel_kobj, + &hmp_data.attr_group); + return 0; +} +late_initcall(hmp_attr_init); + +/* + * return the load of the lowest-loaded CPU in a given HMP domain + * min_cpu optionally points to an int to receive the CPU. + * affinity optionally points to a cpumask containing the + * CPUs to be considered. note: + * + min_cpu = NR_CPUS only if no CPUs are in the set of + * affinity && hmp_domain cpus + * + min_cpu will always otherwise equal one of the CPUs in + * the hmp domain + * + when more than one CPU has the same load, the one which + * is least-recently-disturbed by an HMP migration will be + * selected + * + if all CPUs are equally loaded or idle and the times are + * all the same, the first in the set will be used + * + if affinity is not set, cpu_online_mask is used + */ +static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd, + int *min_cpu, struct cpumask *affinity) +{ + int cpu; + int min_cpu_runnable_temp = NR_CPUS; + u64 min_target_last_migration = ULLONG_MAX; + u64 curr_last_migration; + unsigned long min_runnable_load = INT_MAX; + unsigned long contrib; + struct sched_avg *avg; + struct cpumask temp_cpumask; + /* + * only look at CPUs allowed if specified, + * otherwise look at all online CPUs in the + * right HMP domain + */ + cpumask_and(&temp_cpumask, &hmpd->cpus, affinity ? affinity : cpu_online_mask); + + for_each_cpu_mask(cpu, temp_cpumask) { + avg = &cpu_rq(cpu)->avg; + /* used for both up and down migration */ + curr_last_migration = avg->hmp_last_up_migration ? + avg->hmp_last_up_migration : avg->hmp_last_down_migration; + + contrib = avg->load_avg_ratio; + /* + * Consider a runqueue completely busy if there is any load + * on it. 
Definitely not the best for overall fairness, but + * does well in typical Android use cases. + */ + if (contrib) + contrib = 1023; + + if ((contrib < min_runnable_load) || + (contrib == min_runnable_load && + curr_last_migration < min_target_last_migration)) { + /* + * if the load is the same target the CPU with + * the longest time since a migration. + * This is to spread migration load between + * members of a domain more evenly when the + * domain is fully loaded + */ + min_runnable_load = contrib; + min_cpu_runnable_temp = cpu; + min_target_last_migration = curr_last_migration; + } + } + + if (min_cpu) + *min_cpu = min_cpu_runnable_temp; + + return min_runnable_load; +} + +/* + * Calculate the task starvation + * This is the ratio of actually running time vs. runnable time. + * If the two are equal the task is getting the cpu time it needs or + * it is alone on the cpu and the cpu is fully utilized. + */ +static inline unsigned int hmp_task_starvation(struct sched_entity *se) +{ + u32 starvation; + + starvation = se->avg.usage_avg_sum * scale_load_down(NICE_0_LOAD); + starvation /= (se->avg.runnable_avg_sum + 1); + + return scale_load(starvation); +} + +static inline unsigned int hmp_offload_down(int cpu, struct sched_entity *se) +{ + int min_usage; + int dest_cpu = NR_CPUS; + + if (hmp_cpu_is_slowest(cpu)) + return NR_CPUS; + + /* Is there an idle CPU in the current domain */ + min_usage = hmp_domain_min_load(hmp_cpu_domain(cpu), NULL, NULL); + if (min_usage == 0) { + trace_sched_hmp_offload_abort(cpu, min_usage, "load"); + return NR_CPUS; + } + + /* Is the task alone on the cpu? */ + if (cpu_rq(cpu)->cfs.h_nr_running < 2) { + trace_sched_hmp_offload_abort(cpu, + cpu_rq(cpu)->cfs.h_nr_running, "nr_running"); + return NR_CPUS; + } + + /* Is the task actually starving? */ + /* >=25% ratio running/runnable = starving */ + if (hmp_task_starvation(se) > 768) { + trace_sched_hmp_offload_abort(cpu, hmp_task_starvation(se), + "starvation"); + return NR_CPUS; + } + + /* Does the slower domain have any idle CPUs? */ + min_usage = hmp_domain_min_load(hmp_slower_domain(cpu), &dest_cpu, + tsk_cpus_allowed(task_of(se))); + + if (min_usage == 0) { + trace_sched_hmp_offload_succeed(cpu, dest_cpu); + return dest_cpu; + } else + trace_sched_hmp_offload_abort(cpu,min_usage,"slowdomain"); + + return NR_CPUS; +} +#endif /* CONFIG_SCHED_HMP */ + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. 
In practice, this is SD_BALANCE_WAKE, @@ -4553,6 +5592,19 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (p->nr_cpus_allowed == 1) return prev_cpu; +#ifdef CONFIG_SCHED_HMP + /* always put non-kernel forking tasks on a big domain */ + if (unlikely(sd_flag & SD_BALANCE_FORK) && hmp_task_should_forkboost(p)) { + new_cpu = hmp_select_faster_cpu(p, prev_cpu); + if (new_cpu != NR_CPUS) { + hmp_next_up_delay(&p->se, new_cpu); + return new_cpu; + } + /* failed to perform HMP fork balance, use normal balance */ + new_cpu = cpu; + } +#endif + if (sd_flag & SD_BALANCE_WAKE) want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); @@ -4620,9 +5672,49 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f unlock: rcu_read_unlock(); +#ifdef CONFIG_SCHED_HMP + prev_cpu = task_cpu(p); + + if (hmp_up_migration(prev_cpu, &new_cpu, &p->se)) { + hmp_next_up_delay(&p->se, new_cpu); + trace_sched_hmp_migrate(p, new_cpu, HMP_MIGRATE_WAKEUP); + return new_cpu; + } + if (hmp_down_migration(prev_cpu, &p->se)) { +#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING + new_cpu = hmp_best_little_cpu(p, prev_cpu); +#else + new_cpu = hmp_select_slower_cpu(p, prev_cpu); +#endif + /* + * we might have no suitable CPU + * in which case new_cpu == NR_CPUS + */ + if (new_cpu < NR_CPUS && new_cpu != prev_cpu) { + hmp_next_down_delay(&p->se, new_cpu); + trace_sched_hmp_migrate(p, new_cpu, HMP_MIGRATE_WAKEUP); + return new_cpu; + } + } + /* Make sure that the task stays in its previous hmp domain */ + if (!cpumask_test_cpu(new_cpu, &hmp_cpu_domain(prev_cpu)->cpus)) + return prev_cpu; +#endif + return new_cpu; } +#ifdef CONFIG_SCHED_HMP +#ifdef CONFIG_NO_HZ_COMMON +static int nohz_test_cpu(int cpu); +#else +static inline int nohz_test_cpu(int cpu) +{ + return 0; +} +#endif +#endif + /* * Called immediately before a task is migrated to a new cpu; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the @@ -4642,6 +5734,27 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu) * be negative here since on-rq tasks have decay-count == 0. */ if (se->avg.decay_count) { +#ifdef CONFIG_SCHED_HMP + /* + * If we migrate a sleeping task away from a CPU + * which has the tick stopped, then both the clock_task + * and decay_counter will be out of date for that CPU + * and we will not decay load correctly. + */ + if (!se->on_rq && nohz_test_cpu(task_cpu(p))) { + struct rq *rq = cpu_rq(task_cpu(p)); + unsigned long flags; + /* + * Current CPU cannot be holding rq->lock in this + * circumstance, but another might be. We must hold + * rq->lock before we go poking around in its clocks + */ + raw_spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); + update_cfs_rq_blocked_load(cfs_rq, 0); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } +#endif /* CONFIG_SCHED_HMP */ se->avg.decay_count = -__synchronize_entity_decay(se); atomic_long_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); @@ -6924,6 +8037,10 @@ update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_b *next_balance = next; } +#ifdef CONFIG_SCHED_HMP +static unsigned int hmp_idle_pull(int this_cpu); +#endif + /* * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. 
@@ -6998,7 +8115,10 @@ static int idle_balance(struct rq *this_rq) break; } rcu_read_unlock(); - +#ifdef CONFIG_SCHED_HMP + if (!pulled_task) + pulled_task = hmp_idle_pull(this_cpu); +#endif raw_spin_lock(&this_rq->lock); if (curr_cost > this_rq->max_idle_balance_cost) @@ -7029,24 +8149,128 @@ out: return pulled_task; } -/* - * active_load_balance_cpu_stop is run by cpu stopper. It pushes - * running tasks off the busiest CPU onto idle CPUs. It requires at - * least 1 task to be running on each physical CPU where possible, and - * avoids physical / logical imbalances. - */ -static int active_load_balance_cpu_stop(void *data) +static int active_load_balance_fair(void *data) +{ + struct rq *busiest_rq = data; + int busiest_cpu = cpu_of(busiest_rq); + int target_cpu = busiest_rq->push_cpu; + struct rq *target_rq = cpu_rq(target_cpu); + struct task_struct *tsk = NULL; + struct lb_env env = { + .dst_cpu = target_cpu, + .dst_rq = target_rq, + .src_cpu = busiest_rq->cpu, + .src_rq = busiest_rq, + .idle = CPU_IDLE, + }; + + /* Search for an sd spanning us and the target CPU. */ + rcu_read_lock(); + for_each_domain(target_cpu, env.sd) + if ((env.sd->flags & SD_LOAD_BALANCE) && + cpumask_test_cpu(busiest_cpu, sched_domain_span(env.sd))) + break; + + /* Task could not be migrated between these two CPUs */ + if (unlikely(env.sd == NULL)) + goto out_done; + + /* FAIR active balance */ + schedstat_inc(env.sd, alb_count); + + tsk = detach_one_task(&env); + if (tsk) + schedstat_inc(env.sd, alb_pushed); + else + schedstat_inc(env.sd, alb_failed); + +out_done: + + rcu_read_unlock(); + + busiest_rq->active_balance = 0; + raw_spin_unlock(&busiest_rq->lock); + + if (tsk) + attach_one_task(target_rq, tsk); + + local_irq_enable(); + + return 0; +} + +#ifdef CONFIG_SCHED_HMP +static int move_specific_task(struct lb_env *env, struct task_struct *pm); +static int active_load_balance_hmp(void *data) +{ + struct rq *busiest_rq = data; + int busiest_cpu = cpu_of(busiest_rq); + int target_cpu = busiest_rq->push_cpu; + struct rq *target_rq = cpu_rq(target_cpu); + struct task_struct *tsk = busiest_rq->migrate_task; + struct lb_env env = { + .dst_cpu = target_cpu, + .dst_rq = target_rq, + .src_cpu = busiest_rq->cpu, + .src_rq = busiest_rq, + .idle = CPU_IDLE, + }; + + /* move a task from busiest_rq to target_rq */ + double_lock_balance(busiest_rq, target_rq); + + /* Search for an sd spanning us and the target CPU. */ + rcu_read_lock(); + for_each_domain(target_cpu, env.sd) + if (cpumask_test_cpu(busiest_cpu, sched_domain_span(env.sd))) + break; + + /* Task could not be migrated between these two CPUs */ + if (unlikely(env.sd == NULL)) + goto out_done; + + schedstat_inc(env.sd, alb_count); + + if (move_specific_task(&env, tsk)) + schedstat_inc(env.sd, alb_pushed); + else + schedstat_inc(env.sd, alb_failed); + +out_done: + + rcu_read_unlock(); + double_unlock_balance(busiest_rq, target_rq); + + busiest_rq->active_balance = 0; + raw_spin_unlock(&busiest_rq->lock); + + /* + * Reset active balance request and release the task + * + * NOTE: The put_task_struct() is done while the runqueue spinlock is + * held, but put_task_struct() can also cause a reschedule causing the + * runqueue lock to be acquired recursively. + * To avoid this race condition, we keep the put_task_struct() outside + * the runqueue spinlock. 
+ */ + put_task_struct(tsk); + + local_irq_enable(); + + return 0; +} +#endif + +static int __do_active_load_balance_cpu_stop(void *data, bool hmp_active_balance) { struct rq *busiest_rq = data; int busiest_cpu = cpu_of(busiest_rq); int target_cpu = busiest_rq->push_cpu; struct rq *target_rq = cpu_rq(target_cpu); - struct sched_domain *sd; - struct task_struct *p = NULL; raw_spin_lock_irq(&busiest_rq->lock); - /* make sure the requested cpu hasn't gone down in the meantime */ + /* Make sure the requested cpu hasn't gone down in the meantime */ if (unlikely(busiest_cpu != smp_processor_id() || !busiest_rq->active_balance)) goto out_unlock; @@ -7062,43 +8286,41 @@ static int active_load_balance_cpu_stop(void *data) */ BUG_ON(busiest_rq == target_rq); - /* Search for an sd spanning us and the target CPU. */ - rcu_read_lock(); - for_each_domain(target_cpu, sd) { - if ((sd->flags & SD_LOAD_BALANCE) && - cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) - break; - } - - if (likely(sd)) { - struct lb_env env = { - .sd = sd, - .dst_cpu = target_cpu, - .dst_rq = target_rq, - .src_cpu = busiest_rq->cpu, - .src_rq = busiest_rq, - .idle = CPU_IDLE, - }; +#ifdef CONFIG_SCHED_HMP + /* HMP active balance (using double locking) */ + if (hmp_active_balance) + return active_load_balance_hmp(data); +#endif - schedstat_inc(sd, alb_count); + /* FAIR active balance (using attach/detach) */ + return active_load_balance_fair(data); - p = detach_one_task(&env); - if (p) - schedstat_inc(sd, alb_pushed); - else - schedstat_inc(sd, alb_failed); - } - rcu_read_unlock(); out_unlock: + busiest_rq->active_balance = 0; + raw_spin_unlock(&busiest_rq->lock); - if (p) - attach_one_task(target_rq, p); +#ifdef CONFIG_SCHED_HMP + if (hmp_active_balance) + put_task_struct(busiest_rq->migrate_task); +#endif local_irq_enable(); return 0; + +} + +/* + * active_load_balance_cpu_stop is run by cpu stopper. It pushes + * running tasks off the busiest CPU onto idle CPUs. It requires at + * least 1 task to be running on each physical CPU where possible, and + * avoids physical / logical imbalances. + */ +static int active_load_balance_cpu_stop(void *data) +{ + return __do_active_load_balance_cpu_stop(data, false); } static inline int on_null_domain(struct rq *rq) @@ -7119,12 +8341,79 @@ static struct { unsigned long next_balance; /* in jiffy units */ } nohz ____cacheline_aligned; -static inline int find_new_ilb(void) +#ifdef CONFIG_SCHED_HMP +static int nohz_test_cpu(int cpu) +{ + return cpumask_test_cpu(cpu, nohz.idle_cpus_mask); +} +#endif + +#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING +/* + * Decide if the tasks on the busy CPUs in the littlest domain would benefit + * from an idle balance + * + * When packing is enabled, only enforce this behaviour when we are not in the + * smallest domain - there we idle balance whenever a CPU is over the + * up_threshold regardless of tasks in case one needs to be moved. 
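+ * Here 'benefit' means that at least one CPU in the domain is carrying
+ * more than hmp_full_threshold of tracked load.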
+ */ +static int hmp_packing_ilb_needed(int cpu, int ilb_needed) +{ + struct hmp_domain *hmp; + /* allow previous decision on non-slowest domain */ + if (!hmp_cpu_is_slowest(cpu)) + return ilb_needed; + + /* if disabled, use normal ILB behaviour */ + if (!hmp_packing_enabled) + return ilb_needed; + + hmp = hmp_cpu_domain(cpu); + for_each_cpu_and(cpu, &hmp->cpus, nohz.idle_cpus_mask) { + /* only idle balance if a CPU is loaded over threshold */ + if (cpu_rq(cpu)->avg.load_avg_ratio > hmp_full_threshold) + return 1; + } + return 0; +} +#endif + +DEFINE_PER_CPU(cpumask_var_t, ilb_tmpmask); + +static inline int find_new_ilb(int call_cpu) { int ilb = cpumask_first(nohz.idle_cpus_mask); +#ifdef CONFIG_SCHED_HMP + int ilb_needed = 0; + int cpu; + struct cpumask* tmp = per_cpu(ilb_tmpmask, smp_processor_id()); + + /* restrict nohz balancing to occur in the same hmp domain */ + ilb = cpumask_first_and(nohz.idle_cpus_mask, + &((struct hmp_domain *)hmp_cpu_domain(call_cpu))->cpus); + + /* check to see if it's necessary within this domain */ + cpumask_andnot(tmp, + &((struct hmp_domain *)hmp_cpu_domain(call_cpu))->cpus, + nohz.idle_cpus_mask); + for_each_cpu(cpu, tmp) { + if (cpu_rq(cpu)->nr_running > 1) { + ilb_needed = 1; + break; + } + } +#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING + if (ilb < nr_cpu_ids) + ilb_needed = hmp_packing_ilb_needed(ilb, ilb_needed); +#endif + + if (ilb_needed && ilb < nr_cpu_ids && idle_cpu(ilb)) + return ilb; +#else if (ilb < nr_cpu_ids && idle_cpu(ilb)) return ilb; +#endif return nr_cpu_ids; } @@ -7134,13 +8423,13 @@ static inline int find_new_ilb(void) * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle * CPU (if there is one). */ -static void nohz_balancer_kick(void) +static void nohz_balancer_kick(int cpu) { int ilb_cpu; nohz.next_balance++; - ilb_cpu = find_new_ilb(); + ilb_cpu = find_new_ilb(cpu); if (ilb_cpu >= nr_cpu_ids) return; @@ -7435,6 +8724,18 @@ static inline int nohz_kick_needed(struct rq *rq) if (time_before(now, nohz.next_balance)) return 0; +#ifdef CONFIG_SCHED_HMP + /* + * Bail out if there are no nohz CPUs in our + * HMP domain, since we will move tasks between + * domains through wakeup and force balancing + * as necessary based upon task load. 
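+ * (find_new_ilb() above already restricts nohz balancing to CPUs of
+ * a single hmp_domain.)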
+ */ + if (cpumask_first_and(nohz.idle_cpus_mask, + &((struct hmp_domain *)hmp_cpu_domain(cpu))->cpus) >= nr_cpu_ids) + return 0; +#endif + if (rq->nr_running >= 2) goto need_kick; @@ -7467,6 +8768,456 @@ need_kick: static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } #endif +#ifdef CONFIG_SCHED_HMP +static unsigned int hmp_task_eligible_for_up_migration(struct sched_entity *se) +{ + /* below hmp_up_threshold, never eligible */ + if (se->avg.load_avg_ratio < hmp_up_threshold) + return 0; + return 1; +} + +/* Check if task should migrate to a faster cpu */ +static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + int temp_target_cpu; + u64 now; + + if (hmp_cpu_is_fastest(cpu)) + return 0; + +#ifdef CONFIG_SCHED_HMP_PRIO_FILTER + /* Filter by task priority */ + if (p->prio >= hmp_up_prio) + return 0; +#endif + if (!hmp_task_eligible_for_up_migration(se)) + return 0; + + /* Let the task load settle before doing another up migration */ + /* hack - always use clock from first online CPU */ + now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task; + if (((now - se->avg.hmp_last_up_migration) >> 10) + < hmp_next_up_threshold) + return 0; + + /* hmp_domain_min_load only returns 0 for an + * idle CPU or 1023 for any partly-busy one. + * Be explicit about requirement for an idle CPU. + */ + if (hmp_domain_min_load(hmp_faster_domain(cpu), &temp_target_cpu, + tsk_cpus_allowed(p)) == 0 && temp_target_cpu != NR_CPUS) { + if(target_cpu) + *target_cpu = temp_target_cpu; + return 1; + } + return 0; +} + +/* Check if task should migrate to a slower cpu */ +static unsigned int hmp_down_migration(int cpu, struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + u64 now; + + if (hmp_cpu_is_slowest(cpu)) { +#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING + if(hmp_packing_enabled) + return 1; + else +#endif + return 0; + } + +#ifdef CONFIG_SCHED_HMP_PRIO_FILTER + /* Filter by task priority */ + if ((p->prio >= hmp_up_prio) && + cpumask_intersects(&hmp_slower_domain(cpu)->cpus, + tsk_cpus_allowed(p))) { + return 1; + } +#endif + + /* Let the task load settle before doing another down migration */ + /* hack - always use clock from first online CPU */ + now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task; + if (((now - se->avg.hmp_last_down_migration) >> 10) + < hmp_next_down_threshold) + return 0; + + if (cpumask_intersects(&hmp_slower_domain(cpu)->cpus, + tsk_cpus_allowed(p)) + && se->avg.load_avg_ratio < hmp_down_threshold) { + return 1; + } + return 0; +} + +/* + * hmp_can_migrate_task - may task p from runqueue rq be migrated to this_cpu? + * Ideally this function should be merged with can_migrate_task() to avoid + * redundant code. + */ +static int hmp_can_migrate_task(struct task_struct *p, struct lb_env *env) +{ + int tsk_cache_hot = 0; + + /* + * We do not migrate tasks that are: + * 1) running (obviously), or + * 2) cannot be migrated to this CPU due to cpus_allowed + */ + if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { + schedstat_inc(p, se.statistics.nr_failed_migrations_affine); + return 0; + } + env->flags &= ~LBF_ALL_PINNED; + + if (task_running(env->src_rq, p)) { + schedstat_inc(p, se.statistics.nr_failed_migrations_running); + return 0; + } + + /* + * Aggressive migration if: + * 1) task is cache cold, or + * 2) too many balance attempts have failed. 
+ */ + + tsk_cache_hot = task_hot(p, env); + if (!tsk_cache_hot || + env->sd->nr_balance_failed > env->sd->cache_nice_tries) { +#ifdef CONFIG_SCHEDSTATS + if (tsk_cache_hot) { + schedstat_inc(env->sd, lb_hot_gained[env->idle]); + schedstat_inc(p, se.statistics.nr_forced_migrations); + } +#endif + return 1; + } + + return 1; +} + +/* + * move_task - move a task from one runqueue to another runqueue. + * Both runqueues must be locked. + */ +static void move_task(struct task_struct *p, struct lb_env *env) +{ + deactivate_task(env->src_rq, p, 0); + set_task_cpu(p, env->dst_cpu); + activate_task(env->dst_rq, p, 0); + check_preempt_curr(env->dst_rq, p, 0); +} + +/* + * move_specific_task tries to move a specific task. + * Returns 1 if successful and 0 otherwise. + * Called with both runqueues locked. + */ +static int move_specific_task(struct lb_env *env, struct task_struct *pm) +{ + struct task_struct *p, *n; + + list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { + if (throttled_lb_pair(task_group(p), + env->src_rq->cpu, env->dst_cpu)) + continue; + + if (!hmp_can_migrate_task(p, env)) + continue; + /* Check if we found the right task */ + if (p != pm) + continue; + + move_task(p, env); + /* + * Right now, this is only the third place move_task() + * is called, so we can safely collect move_task() + * stats here rather than inside move_task(). + */ + schedstat_inc(env->sd, lb_gained[env->idle]); + return 1; + } + return 0; +} + +/* + * hmp_active_task_migration_cpu_stop is run by cpu stopper and used to + * migrate a specific task from one runqueue to another. + * hmp_force_up_migration uses this to push a currently running task + * off a runqueue. hmp_idle_pull uses this to pull a currently + * running task to an idle runqueue. + * Reuses __do_active_load_balance_cpu_stop to actually do the work. + */ +static int hmp_active_task_migration_cpu_stop(void *data) +{ + return __do_active_load_balance_cpu_stop(data, true); +} + +/* + * Move task in a runnable state to another CPU. + * + * Tailored on 'active_load_balance_cpu_stop' with slight + * modification to locking and pre-transfer checks. Note + * rq->lock must be held before calling. + */ +static void hmp_migrate_runnable_task(struct rq *rq) +{ + struct sched_domain *sd; + int src_cpu = cpu_of(rq); + struct rq *src_rq = rq; + int dst_cpu = rq->push_cpu; + struct rq *dst_rq = cpu_rq(dst_cpu); + struct task_struct *p = rq->migrate_task; + /* + * One last check to make sure nobody else is playing + * with the source rq. + */ + if (src_rq->active_balance) + goto out; + + if (src_rq->nr_running <= 1) + goto out; + + if (task_rq(p) != src_rq) + goto out; + /* + * Not sure if this applies here but one can never + * be too cautious + */ + BUG_ON(src_rq == dst_rq); + + double_lock_balance(src_rq, dst_rq); + + rcu_read_lock(); + for_each_domain(dst_cpu, sd) { + if (cpumask_test_cpu(src_cpu, sched_domain_span(sd))) + break; + } + + if (likely(sd)) { + struct lb_env env = { + .sd = sd, + .dst_cpu = dst_cpu, + .dst_rq = dst_rq, + .src_cpu = src_cpu, + .src_rq = src_rq, + .idle = CPU_IDLE, + }; + + schedstat_inc(sd, alb_count); + + if (move_specific_task(&env, p)) + schedstat_inc(sd, alb_pushed); + else + schedstat_inc(sd, alb_failed); + } + + rcu_read_unlock(); + double_unlock_balance(src_rq, dst_rq); +out: + put_task_struct(p); +} + +static DEFINE_SPINLOCK(hmp_force_migration); + +/* + * hmp_force_up_migration checks runqueues for tasks that need to + * be actively migrated to a faster cpu. 
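+ * The hmp_force_migration lock serialises this with hmp_idle_pull().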
+ */ +static void hmp_force_up_migration(int this_cpu) +{ + int cpu, target_cpu; + struct sched_entity *curr, *orig; + struct rq *target; + unsigned long flags; + unsigned int force, got_target; + struct task_struct *p; + + if (!spin_trylock(&hmp_force_migration)) + return; + for_each_online_cpu(cpu) { + force = 0; + got_target = 0; + target = cpu_rq(cpu); + raw_spin_lock_irqsave(&target->lock, flags); + curr = target->cfs.curr; + if (!curr || target->active_balance) { + raw_spin_unlock_irqrestore(&target->lock, flags); + continue; + } + if (!entity_is_task(curr)) { + struct cfs_rq *cfs_rq; + + cfs_rq = group_cfs_rq(curr); + while (cfs_rq) { + curr = cfs_rq->curr; + cfs_rq = group_cfs_rq(curr); + } + } + orig = curr; + curr = hmp_get_heaviest_task(curr, -1); + if (!curr) { + raw_spin_unlock_irqrestore(&target->lock, flags); + continue; + } + p = task_of(curr); + if (hmp_up_migration(cpu, &target_cpu, curr)) { + cpu_rq(target_cpu)->wake_for_idle_pull = 1; + raw_spin_unlock_irqrestore(&target->lock, flags); + spin_unlock(&hmp_force_migration); + smp_send_reschedule(target_cpu); + return; + } + if (!got_target) { + /* + * For now we just check the currently running task. + * Selecting the lightest task for offloading will + * require extensive book keeping. + */ + curr = hmp_get_lightest_task(orig, 1); + p = task_of(curr); + target->push_cpu = hmp_offload_down(cpu, curr); + if (target->push_cpu < NR_CPUS) { + get_task_struct(p); + target->migrate_task = p; + got_target = 1; + trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_OFFLOAD); + hmp_next_down_delay(&p->se, target->push_cpu); + } + } + /* + * We have a target with no active_balance. If the task + * is not currently running move it, otherwise let the + * CPU stopper take care of it. + */ + if (got_target) { + if (!task_running(target, p)) { + trace_sched_hmp_migrate_force_running(p, 0); + hmp_migrate_runnable_task(target); + } else { + target->active_balance = 1; + force = 1; + } + } + + raw_spin_unlock_irqrestore(&target->lock, flags); + + if (force) + stop_one_cpu_nowait(cpu_of(target), + hmp_active_task_migration_cpu_stop, + target, &target->active_balance_work); + } + spin_unlock(&hmp_force_migration); +} + +/* + * hmp_idle_pull looks at little domain runqueues to see + * if a task should be pulled. + * + * Reuses hmp_force_migration spinlock. 
+
+/*
+ * hmp_idle_pull looks at little domain runqueues to see
+ * if a task should be pulled.
+ *
+ * Reuses the hmp_force_migration spinlock.
+ */
+static unsigned int hmp_idle_pull(int this_cpu)
+{
+	int cpu;
+	struct sched_entity *curr, *orig;
+	struct hmp_domain *hmp_domain = NULL;
+	struct rq *target = NULL, *rq;
+	unsigned long flags, ratio = 0;
+	unsigned int force = 0;
+	struct task_struct *p = NULL;
+
+	if (!hmp_cpu_is_slowest(this_cpu))
+		hmp_domain = hmp_slower_domain(this_cpu);
+	if (!hmp_domain)
+		return 0;
+
+	if (!spin_trylock(&hmp_force_migration))
+		return 0;
+
+	/* first select a task */
+	for_each_cpu(cpu, &hmp_domain->cpus) {
+		rq = cpu_rq(cpu);
+		raw_spin_lock_irqsave(&rq->lock, flags);
+		curr = rq->cfs.curr;
+		if (!curr) {
+			raw_spin_unlock_irqrestore(&rq->lock, flags);
+			continue;
+		}
+		if (!entity_is_task(curr)) {
+			struct cfs_rq *cfs_rq;
+
+			cfs_rq = group_cfs_rq(curr);
+			while (cfs_rq) {
+				curr = cfs_rq->curr;
+				if (!entity_is_task(curr))
+					cfs_rq = group_cfs_rq(curr);
+				else
+					cfs_rq = NULL;
+			}
+		}
+		orig = curr;
+		curr = hmp_get_heaviest_task(curr, this_cpu);
+		/* check whether the heaviest eligible task on this
+		 * CPU is heavier than the best candidate found so far
+		 */
+		if (curr && hmp_task_eligible_for_up_migration(curr) &&
+			curr->avg.load_avg_ratio > ratio &&
+			cpumask_test_cpu(this_cpu,
+				tsk_cpus_allowed(task_of(curr)))) {
+			p = task_of(curr);
+			target = rq;
+			ratio = curr->avg.load_avg_ratio;
+		}
+		raw_spin_unlock_irqrestore(&rq->lock, flags);
+	}
+
+	if (!p)
+		goto done;
+
+	/* now we have a candidate */
+	raw_spin_lock_irqsave(&target->lock, flags);
+	if (!target->active_balance && task_rq(p) == target) {
+		get_task_struct(p);
+		target->push_cpu = this_cpu;
+		target->migrate_task = p;
+		trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_IDLE_PULL);
+		hmp_next_up_delay(&p->se, target->push_cpu);
+		/*
+		 * If the task isn't running, move it right away.
+		 * Otherwise set up the active_balance mechanism and
+		 * let the CPU stopper do its job.
+		 */
+		if (!task_running(target, p)) {
+			trace_sched_hmp_migrate_idle_running(p, 0);
+			hmp_migrate_runnable_task(target);
+		} else {
+			target->active_balance = 1;
+			force = 1;
+		}
+	}
+	raw_spin_unlock_irqrestore(&target->lock, flags);
+
+	if (force) {
+		/* start a timer to keep us awake */
+		hmp_cpu_keepalive_trigger();
+		stop_one_cpu_nowait(cpu_of(target),
+			hmp_active_task_migration_cpu_stop,
+			target, &target->active_balance_work);
+	}
+done:
+	spin_unlock(&hmp_force_migration);
+	return force;
+}
+
+#else
+static void hmp_force_up_migration(int this_cpu) { }
+#endif /* CONFIG_SCHED_HMP */
+
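The candidate selection in hmp_idle_pull() is a straightforward argmax over the little cluster: visit each CPU, find its heaviest eligible entity, and remember the one with the largest load_avg_ratio. Stripped of locking and the scheduler types, the selection logic reduces to the following standalone sketch (the arrays and values are illustrative):

    #include <stdio.h>

    #define NR_LITTLE_CPUS	4
    /* load_avg_ratio runs 0..1023, where 1023 means fully loaded */
    static unsigned long load_ratio[NR_LITTLE_CPUS] = { 200, 870, 430, 0 };
    static int eligible[NR_LITTLE_CPUS] = { 1, 1, 0, 1 };

    int main(void)
    {
        unsigned long best_ratio = 0;
        int cpu, best_cpu = -1;

        /* argmax over eligible CPUs, mirroring the selection loop */
        for (cpu = 0; cpu < NR_LITTLE_CPUS; cpu++) {
            if (eligible[cpu] && load_ratio[cpu] > best_ratio) {
                best_ratio = load_ratio[cpu];
                best_cpu = cpu;
            }
        }

        if (best_cpu >= 0)
            printf("pull candidate: cpu %d (ratio %lu)\n",
                   best_cpu, best_ratio);
        return 0;
    }

With these sample values the scan selects cpu 1 (ratio 870): cpu 2 is heavier than cpu 0 but ineligible, and cpu 3 is idle.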
 /*
 * run_rebalance_domains is triggered when needed from the scheduler tick.
 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
@@ -7477,6 +9228,20 @@ static void run_rebalance_domains(struct softirq_action *h)
 	enum cpu_idle_type idle = this_rq->idle_balance ?
 						CPU_IDLE : CPU_NOT_IDLE;
+#ifdef CONFIG_SCHED_HMP
+	/* shortcut for hmp idle pull wakeups */
+	if (unlikely(this_rq->wake_for_idle_pull)) {
+		this_rq->wake_for_idle_pull = 0;
+		if (hmp_idle_pull(cpu_of(this_rq))) {
+			/* break out unless we are also doing nohz idle balancing */
+			if (idle != CPU_IDLE)
+				return;
+		}
+	}
+#endif
+
+	hmp_force_up_migration(cpu_of(this_rq));
+
 	rebalance_domains(this_rq, idle);
 
 	/*
@@ -7500,12 +9265,15 @@ void trigger_load_balance(struct rq *rq)
 		raise_softirq(SCHED_SOFTIRQ);
 #ifdef CONFIG_NO_HZ_COMMON
 	if (nohz_kick_needed(rq))
-		nohz_balancer_kick();
+		nohz_balancer_kick(cpu_of(rq));
 #endif
 }
 
 static void rq_online_fair(struct rq *rq)
 {
+#ifdef CONFIG_SCHED_HMP
+	hmp_online_cpu(rq->cpu);
+#endif
 	update_sysctl();
 
 	update_runtime_enabled(rq);
@@ -7513,6 +9281,9 @@ static void rq_online_fair(struct rq *rq)
 
 static void rq_offline_fair(struct rq *rq)
 {
+#ifdef CONFIG_SCHED_HMP
+	hmp_offline_cpu(rq->cpu);
+#endif
 	update_sysctl();
 
 	/* Ensure any throttled groups are reachable by pick_next_task */
@@ -7996,6 +9767,139 @@ __init void init_sched_fair_class(void)
 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
 	cpu_notifier(sched_ilb_notifier, 0);
 #endif
+
+#ifdef CONFIG_SCHED_HMP
+	hmp_cpu_mask_setup();
+#endif
 #endif /* SMP */
 }
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/* Compute the current frequency scaling factor.
+ * Note that @max is stored pre-shifted right by SCHED_FREQSCALE_SHIFT
+ * while @curr is the raw frequency in kHz, so the division yields
+ * approximately (curr / unshifted_max) << SCHED_FREQSCALE_SHIFT,
+ * i.e. 1024 when running at maximum frequency. @min is currently
+ * unused.
+ */
+static u32 cpufreq_calc_scale(u32 min, u32 max, u32 curr)
+{
+	u32 result = curr / max;
+	return result;
+}
+
+/* Called whenever a CPU's frequency changes, once for each CPU. */
+static int cpufreq_callback(struct notifier_block *nb,
+					unsigned long val, void *data)
+{
+	struct cpufreq_freqs *freq = data;
+	int cpu = freq->cpu;
+	struct cpufreq_extents *extents;
+
+	if (freq->flags & CPUFREQ_CONST_LOOPS)
+		return NOTIFY_OK;
+
+	if (val != CPUFREQ_POSTCHANGE)
+		return NOTIFY_OK;
+
+	/* if dynamic load scale is disabled, set the load scale to 1.0 */
+	if (!hmp_data.freqinvar_load_scale_enabled) {
+		freq_scale[cpu].curr_scale = 1024;
+		return NOTIFY_OK;
+	}
+
+	extents = &freq_scale[cpu];
+	if (extents->flags & SCHED_LOAD_FREQINVAR_SINGLEFREQ) {
+		/* If our governor was recognised as a single-freq governor,
+		 * use 1.0
+		 */
+		extents->curr_scale = 1024;
+	} else {
+		extents->curr_scale = cpufreq_calc_scale(extents->min,
+				extents->max, freq->new);
+	}
+
+	return NOTIFY_OK;
+}
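To see the arithmetic in cpufreq_calc_scale(): because extents->max is stored pre-shifted, a CPU with a 1 GHz maximum running at 500 MHz gets max = 1000000 >> 10 = 976 and curr_scale = 500000 / 976 = 512, roughly half of the 1024 full-scale value. A standalone check of that calculation (names here are illustrative):

    #include <stdio.h>
    #include <stdint.h>

    #define SCHED_FREQSCALE_SHIFT 10

    /* Same arithmetic as cpufreq_calc_scale(): max is pre-shifted. */
    static uint32_t calc_scale(uint32_t max_khz, uint32_t curr_khz)
    {
        return curr_khz / (max_khz >> SCHED_FREQSCALE_SHIFT);
    }

    int main(void)
    {
        /* 1 GHz part running at 500 MHz -> scale ~512 (0.5 * 1024) */
        printf("500 MHz of 1 GHz: %u\n", calc_scale(1000000, 500000));
        /* running flat out -> scale ~1024 (1.0) */
        printf("1 GHz of 1 GHz:   %u\n", calc_scale(1000000, 1000000));
        return 0;
    }

Integer truncation in the pre-shift means the result can land a point or two off the ideal ratio, which is harmless at this resolution.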
+
+/* Called when the CPUFreq governor is changed.
+ * Only called for the CPUs that are actually changed from
+ * userspace.
+ */
+static int cpufreq_policy_callback(struct notifier_block *nb,
+				unsigned long event, void *data)
+{
+	struct cpufreq_policy *policy = data;
+	struct cpufreq_extents *extents;
+	int cpu, singleFreq = 0;
+	static const char performance_governor[] = "performance";
+	static const char powersave_governor[] = "powersave";
+
+	if (event == CPUFREQ_START)
+		return 0;
+
+	if (event != CPUFREQ_INCOMPATIBLE)
+		return 0;
+
+	/* CPUFreq governors do not accurately report the range of
+	 * CPU frequencies they will choose from.
+	 * We recognise the performance and powersave governors as
+	 * single-frequency only.
+	 */
+	if (!strncmp(policy->governor->name, performance_governor,
+			strlen(performance_governor)) ||
+		!strncmp(policy->governor->name, powersave_governor,
+			strlen(powersave_governor)))
+		singleFreq = 1;
+
+	/* Make sure that all CPUs impacted by this policy are
+	 * updated, since we will only get a notification when the
+	 * user explicitly changes the policy on a CPU.
+	 */
+	for_each_cpu(cpu, policy->cpus) {
+		extents = &freq_scale[cpu];
+		extents->max = policy->max >> SCHED_FREQSCALE_SHIFT;
+		extents->min = policy->min >> SCHED_FREQSCALE_SHIFT;
+		if (!hmp_data.freqinvar_load_scale_enabled) {
+			extents->curr_scale = 1024;
+		} else if (singleFreq) {
+			extents->flags |= SCHED_LOAD_FREQINVAR_SINGLEFREQ;
+			extents->curr_scale = 1024;
+		} else {
+			extents->flags &= ~SCHED_LOAD_FREQINVAR_SINGLEFREQ;
+			extents->curr_scale = cpufreq_calc_scale(extents->min,
+					extents->max, policy->cur);
+		}
+	}
+
+	return 0;
+}
+
+static struct notifier_block cpufreq_notifier = {
+	.notifier_call = cpufreq_callback,
+};
+static struct notifier_block cpufreq_policy_notifier = {
+	.notifier_call = cpufreq_policy_callback,
+};
+
+static int __init register_sched_cpufreq_notifier(void)
+{
+	int ret;
+	int cpu;
+
+	/* set safe defaults, since no policies exist at registration time */
+	for (cpu = 0; cpu < CONFIG_NR_CPUS; cpu++) {
+		freq_scale[cpu].max = 1024;
+		freq_scale[cpu].min = 1024;
+		freq_scale[cpu].curr_scale = 1024;
+	}
+
+	pr_info("sched: registering cpufreq notifiers for scale-invariant loads\n");
+	ret = cpufreq_register_notifier(&cpufreq_policy_notifier,
+			CPUFREQ_POLICY_NOTIFIER);
+
+	if (ret != -EINVAL)
+		ret = cpufreq_register_notifier(&cpufreq_notifier,
+			CPUFREQ_TRANSITION_NOTIFIER);
+
+	return ret;
+}
+
+core_initcall(register_sched_cpufreq_notifier);
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
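As the frequency-invariance comment earlier in the patch describes, the per-CPU curr_scale factor is meant to scale tracked runnable time so that load reflects the CPU's possible rather than available compute capacity. The hunk where the runnable averages consume this factor is not shown here, but the intended effect can be sketched schematically (everything below is illustrative, not the patch's code):

    #include <stdio.h>
    #include <stdint.h>

    #define SCHED_FREQSCALE_SHIFT 10

    /*
     * Apply a frequency scale factor to a runnable time delta:
     * delta is in microseconds, curr_scale is 1024 at max frequency.
     */
    static uint64_t scale_delta(uint64_t delta_us, uint32_t curr_scale)
    {
        return (delta_us * curr_scale) >> SCHED_FREQSCALE_SHIFT;
    }

    int main(void)
    {
        /* 10 ms of runtime at full speed counts in full ... */
        printf("at 1 GHz:   %llu us\n",
               (unsigned long long)scale_delta(10000, 1024));
        /* ... but only half as much when the CPU ran at 500 MHz */
        printf("at 500 MHz: %llu us\n",
               (unsigned long long)scale_delta(10000, 512));
        return 0;
    }

This matches the worked example in the patch's own comment block: a task fully occupying a 500 MHz CPU on a 1 GHz-capable part accrues only 50% load, keeping the HMP migration thresholds meaningful while CPUFreq changes the available capacity underneath them.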