diff options
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 1937 |
1 files changed, 1881 insertions, 56 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 03b73bea33d6..41d0cbda605d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -31,9 +31,21 @@ #include <linux/task_work.h> #include <trace/events/sched.h> +#include <linux/sysfs.h> +#include <linux/vmalloc.h> +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE +/* Include cpufreq header to add a notifier so that cpu frequency + * scaling can track the current CPU frequency + */ +#include <linux/cpufreq.h> +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ +#ifdef CONFIG_SCHED_HMP +#include <linux/cpuidle.h> +#endif #include "sched.h" + /* * Targeted preemption latency for CPU-bound tasks: * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) @@ -936,6 +948,13 @@ void task_numa_work(struct callback_head *work) if (vma->vm_end - vma->vm_start < HPAGE_SIZE) continue; + /* + * Skip inaccessible VMAs to avoid any confusion between + * PROT_NONE and NUMA hinting ptes + */ + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) + continue; + do { start = max(start, vma->vm_start); end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); @@ -1201,8 +1220,91 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } -/* - * We can represent the historical contribution to runnable average as the +#ifdef CONFIG_SCHED_HMP +#define HMP_VARIABLE_SCALE_SHIFT 16ULL +struct hmp_global_attr { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, + struct attribute *attr, char *buf); + ssize_t (*store)(struct kobject *a, struct attribute *b, + const char *c, size_t count); + int *value; + int (*to_sysfs)(int); + int (*from_sysfs)(int); + ssize_t (*to_sysfs_text)(char *buf, int buf_size); +}; + +#define HMP_DATA_SYSFS_MAX 8 + +struct hmp_data_struct { +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + int freqinvar_load_scale_enabled; +#endif + int multiplier; /* used to scale the time delta */ + struct attribute_group attr_group; + struct attribute *attributes[HMP_DATA_SYSFS_MAX + 1]; + struct hmp_global_attr attr[HMP_DATA_SYSFS_MAX]; +} hmp_data; + +static u64 hmp_variable_scale_convert(u64 delta); +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE +/* Frequency-Invariant Load Modification: + * Loads are calculated as in PJT's patch however we also scale the current + * contribution in line with the frequency of the CPU that the task was + * executed on. + * In this version, we use a simple linear scale derived from the maximum + * frequency reported by CPUFreq. As an example: + * + * Consider that we ran a task for 100% of the previous interval. + * + * Our CPU was under asynchronous frequency control through one of the + * CPUFreq governors. + * + * The CPUFreq governor reports that it is able to scale the CPU between + * 500MHz and 1GHz. + * + * During the period, the CPU was running at 1GHz. + * + * In this case, our load contribution for that period is calculated as + * 1 * (number_of_active_microseconds) + * + * This results in our task being able to accumulate maximum load as normal. + * + * + * Consider now that our CPU was executing at 500MHz. + * + * We now scale the load contribution such that it is calculated as + * 0.5 * (number_of_active_microseconds) + * + * Our task can only record 50% maximum load during this period. + * + * This represents the task consuming 50% of the CPU's *possible* compute + * capacity. However the task did consume 100% of the CPU's *available* + * compute capacity which is the value seen by the CPUFreq governor and + * user-side CPU Utilization tools. + * + * Restricting tracked load to be scaled by the CPU's frequency accurately + * represents the consumption of possible compute capacity and allows the + * HMP migration's simple threshold migration strategy to interact more + * predictably with CPUFreq's asynchronous compute capacity changes. + */ +#define SCHED_FREQSCALE_SHIFT 10 +struct cpufreq_extents { + u32 curr_scale; + u32 min; + u32 max; + u32 flags; +}; +/* Flag set when the governor in use only allows one frequency. + * Disables scaling. + */ +#define SCHED_LOAD_FREQINVAR_SINGLEFREQ 0x01 + +static struct cpufreq_extents freq_scale[CONFIG_NR_CPUS]; +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ +#endif /* CONFIG_SCHED_HMP */ + +/* We can represent the historical contribution to runnable average as the * coefficients of a geometric series. To do this we sub-divide our runnable * history into segments of approximately 1ms (1024us); label the segment that * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. @@ -1231,13 +1333,24 @@ static u32 __compute_runnable_contrib(u64 n) */ static __always_inline int __update_entity_runnable_avg(u64 now, struct sched_avg *sa, - int runnable) + int runnable, + int running, + int cpu) { u64 delta, periods; u32 runnable_contrib; int delta_w, decayed = 0; +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + u64 scaled_delta; + u32 scaled_runnable_contrib; + int scaled_delta_w; + u32 curr_scale = 1024; +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ delta = now - sa->last_runnable_update; +#ifdef CONFIG_SCHED_HMP + delta = hmp_variable_scale_convert(delta); +#endif /* * This should only happen when time goes backwards, which it * unfortunately does during sched clock init when we swap over to TSC. @@ -1256,6 +1369,12 @@ static __always_inline int __update_entity_runnable_avg(u64 now, return 0; sa->last_runnable_update = now; +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + /* retrieve scale factor for load */ + if (hmp_data.freqinvar_load_scale_enabled) + curr_scale = freq_scale[cpu].curr_scale; +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ + /* delta_w is the amount already accumulated against our next period */ delta_w = sa->runnable_avg_period % 1024; if (delta + delta_w >= 1024) { @@ -1268,8 +1387,20 @@ static __always_inline int __update_entity_runnable_avg(u64 now, * period and accrue it. */ delta_w = 1024 - delta_w; + /* scale runnable time if necessary */ +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + scaled_delta_w = (delta_w * curr_scale) + >> SCHED_FREQSCALE_SHIFT; + if (runnable) + sa->runnable_avg_sum += scaled_delta_w; + if (running) + sa->usage_avg_sum += scaled_delta_w; +#else if (runnable) sa->runnable_avg_sum += delta_w; + if (running) + sa->usage_avg_sum += delta_w; +#endif /* #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ sa->runnable_avg_period += delta_w; delta -= delta_w; @@ -1277,22 +1408,49 @@ static __always_inline int __update_entity_runnable_avg(u64 now, /* Figure out how many additional periods this update spans */ periods = delta / 1024; delta %= 1024; - + /* decay the load we have accumulated so far */ sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, periods + 1); sa->runnable_avg_period = decay_load(sa->runnable_avg_period, periods + 1); - + sa->usage_avg_sum = decay_load(sa->usage_avg_sum, periods + 1); + /* add the contribution from this period */ /* Efficiently calculate \sum (1..n_period) 1024*y^i */ runnable_contrib = __compute_runnable_contrib(periods); + /* Apply load scaling if necessary. + * Note that multiplying the whole series is same as + * multiplying all terms + */ +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + scaled_runnable_contrib = (runnable_contrib * curr_scale) + >> SCHED_FREQSCALE_SHIFT; + if (runnable) + sa->runnable_avg_sum += scaled_runnable_contrib; + if (running) + sa->usage_avg_sum += scaled_runnable_contrib; +#else if (runnable) sa->runnable_avg_sum += runnable_contrib; + if (running) + sa->usage_avg_sum += runnable_contrib; +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ sa->runnable_avg_period += runnable_contrib; } /* Remainder of delta accrued against u_0` */ + /* scale if necessary */ +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + scaled_delta = ((delta * curr_scale) >> SCHED_FREQSCALE_SHIFT); + if (runnable) + sa->runnable_avg_sum += scaled_delta; + if (running) + sa->usage_avg_sum += scaled_delta; +#else if (runnable) sa->runnable_avg_sum += delta; + if (running) + sa->usage_avg_sum += delta; +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ sa->runnable_avg_period += delta; return decayed; @@ -1305,12 +1463,9 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se) u64 decays = atomic64_read(&cfs_rq->decay_counter); decays -= se->avg.decay_count; - if (!decays) - return 0; - - se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); + if (decays) + se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); se->avg.decay_count = 0; - return decays; } @@ -1338,16 +1493,28 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa, struct cfs_rq *cfs_rq) { struct task_group *tg = cfs_rq->tg; - long contrib; + long contrib, usage_contrib; /* The fraction of a cpu used by this cfs_rq */ contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT, sa->runnable_avg_period + 1); contrib -= cfs_rq->tg_runnable_contrib; - if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { + usage_contrib = div_u64(sa->usage_avg_sum << NICE_0_SHIFT, + sa->runnable_avg_period + 1); + usage_contrib -= cfs_rq->tg_usage_contrib; + + /* + * contrib/usage at this point represent deltas, only update if they + * are substantive. + */ + if ((abs(contrib) > cfs_rq->tg_runnable_contrib / 64) || + (abs(usage_contrib) > cfs_rq->tg_usage_contrib / 64)) { atomic_add(contrib, &tg->runnable_avg); cfs_rq->tg_runnable_contrib += contrib; + + atomic_add(usage_contrib, &tg->usage_avg); + cfs_rq->tg_usage_contrib += usage_contrib; } } @@ -1408,12 +1575,18 @@ static inline void __update_task_entity_contrib(struct sched_entity *se) contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); contrib /= (se->avg.runnable_avg_period + 1); se->avg.load_avg_contrib = scale_load(contrib); + trace_sched_task_load_contrib(task_of(se), se->avg.load_avg_contrib); + contrib = se->avg.runnable_avg_sum * scale_load_down(NICE_0_LOAD); + contrib /= (se->avg.runnable_avg_period + 1); + se->avg.load_avg_ratio = scale_load(contrib); + trace_sched_task_runnable_ratio(task_of(se), se->avg.load_avg_ratio); } /* Compute the current contribution to load_avg by se, return any delta */ -static long __update_entity_load_avg_contrib(struct sched_entity *se) +static long __update_entity_load_avg_contrib(struct sched_entity *se, long *ratio) { long old_contrib = se->avg.load_avg_contrib; + long old_ratio = se->avg.load_avg_ratio; if (entity_is_task(se)) { __update_task_entity_contrib(se); @@ -1422,6 +1595,8 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se) __update_group_entity_contrib(se); } + if (ratio) + *ratio = se->avg.load_avg_ratio - old_ratio; return se->avg.load_avg_contrib - old_contrib; } @@ -1441,9 +1616,13 @@ static inline void update_entity_load_avg(struct sched_entity *se, int update_cfs_rq) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - long contrib_delta; + long contrib_delta, ratio_delta; u64 now; + int cpu = -1; /* not used in normal case */ +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + cpu = cfs_rq->rq->cpu; +#endif /* * For a group entity we need to use their owned cfs_rq_clock_task() in * case they are the parent of a throttled hierarchy. @@ -1453,18 +1632,21 @@ static inline void update_entity_load_avg(struct sched_entity *se, else now = cfs_rq_clock_task(group_cfs_rq(se)); - if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) + if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq, + cfs_rq->curr == se, cpu)) return; - contrib_delta = __update_entity_load_avg_contrib(se); + contrib_delta = __update_entity_load_avg_contrib(se, &ratio_delta); if (!update_cfs_rq) return; - if (se->on_rq) + if (se->on_rq) { cfs_rq->runnable_load_avg += contrib_delta; - else + rq_of(cfs_rq)->avg.load_avg_ratio += ratio_delta; + } else { subtract_blocked_load_contrib(cfs_rq, -contrib_delta); + } } /* @@ -1497,8 +1679,17 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) static inline void update_rq_runnable_avg(struct rq *rq, int runnable) { - __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); + int cpu = -1; /* not used in normal case */ + +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + cpu = rq->cpu; +#endif + __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable, + runnable, cpu); __update_tg_runnable_avg(&rq->avg, &rq->cfs); + trace_sched_rq_runnable_ratio(cpu_of(rq), rq->avg.load_avg_ratio); + trace_sched_rq_runnable_load(cpu_of(rq), rq->cfs.runnable_load_avg); + trace_sched_rq_nr_running(cpu_of(rq), rq->nr_running, rq->nr_iowait.counter); } /* Add the load generated by se into cfs_rq's child load-average */ @@ -1540,6 +1731,8 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, } cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; + rq_of(cfs_rq)->avg.load_avg_ratio += se->avg.load_avg_ratio; + /* we force update consideration on load-balancer moves */ update_cfs_rq_blocked_load(cfs_rq, !wakeup); } @@ -1558,6 +1751,8 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, update_cfs_rq_blocked_load(cfs_rq, !sleep); cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; + rq_of(cfs_rq)->avg.load_avg_ratio -= se->avg.load_avg_ratio; + if (sleep) { cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); @@ -1886,6 +2081,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); + update_entity_load_avg(se, 1); } update_stats_curr_start(cfs_rq, se); @@ -2022,13 +2218,14 @@ static inline bool cfs_bandwidth_used(void) return static_key_false(&__cfs_bandwidth_used); } -void account_cfs_bandwidth_used(int enabled, int was_enabled) +void cfs_bandwidth_usage_inc(void) { - /* only need to count groups transitioning between enabled/!enabled */ - if (enabled && !was_enabled) - static_key_slow_inc(&__cfs_bandwidth_used); - else if (!enabled && was_enabled) - static_key_slow_dec(&__cfs_bandwidth_used); + static_key_slow_inc(&__cfs_bandwidth_used); +} + +void cfs_bandwidth_usage_dec(void) +{ + static_key_slow_dec(&__cfs_bandwidth_used); } #else /* HAVE_JUMP_LABEL */ static bool cfs_bandwidth_used(void) @@ -2036,7 +2233,8 @@ static bool cfs_bandwidth_used(void) return true; } -void account_cfs_bandwidth_used(int enabled, int was_enabled) {} +void cfs_bandwidth_usage_inc(void) {} +void cfs_bandwidth_usage_dec(void) {} #endif /* HAVE_JUMP_LABEL */ /* @@ -2288,6 +2486,8 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled_clock = rq->clock; raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); + if (!cfs_b->timer_active) + __start_cfs_bandwidth(cfs_b); raw_spin_unlock(&cfs_b->lock); } @@ -2399,6 +2599,13 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) if (idle) goto out_unlock; + /* + * if we have relooped after returning idle once, we need to update our + * status as actually running, so that other cpus doing + * __start_cfs_bandwidth will stop trying to cancel us. + */ + cfs_b->timer_active = 1; + __refill_cfs_bandwidth_runtime(cfs_b); if (!throttled) { @@ -2459,7 +2666,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC; /* how long we wait to gather additional slack before distributing */ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; -/* are we near the end of the current quota period? */ +/* + * Are we near the end of the current quota period? + * + * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the + * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of + * migrate_hrtimers, base is never cleared, so we are fine. + */ static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) { struct hrtimer *refresh_timer = &cfs_b->period_timer; @@ -2535,10 +2748,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) u64 expires; /* confirm we're still not at a refresh boundary */ - if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) + raw_spin_lock(&cfs_b->lock); + if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { + raw_spin_unlock(&cfs_b->lock); return; + } - raw_spin_lock(&cfs_b->lock); if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { runtime = cfs_b->runtime; cfs_b->runtime = 0; @@ -2663,11 +2878,11 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) * (timer_active==0 becomes visible before the hrtimer call-back * terminates). In either case we ensure that it's re-programmed */ - while (unlikely(hrtimer_active(&cfs_b->period_timer))) { + while (unlikely(hrtimer_active(&cfs_b->period_timer)) && + hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) { + /* bounce the lock to allow do_sched_cfs_period_timer to run */ raw_spin_unlock(&cfs_b->lock); - /* ensure cfs_b->lock is available while we wait */ - hrtimer_cancel(&cfs_b->period_timer); - + cpu_relax(); raw_spin_lock(&cfs_b->lock); /* if someone else restarted the timer then we're done */ if (cfs_b->timer_active) @@ -3315,6 +3530,835 @@ done: return target; } +#ifdef CONFIG_SCHED_HMP +/* + * Heterogenous multiprocessor (HMP) optimizations + * + * The cpu types are distinguished using a list of hmp_domains + * which each represent one cpu type using a cpumask. + * The list is assumed ordered by compute capacity with the + * fastest domain first. + */ +DEFINE_PER_CPU(struct hmp_domain *, hmp_cpu_domain); +static const int hmp_max_tasks = 5; + +extern void __init arch_get_hmp_domains(struct list_head *hmp_domains_list); + +#ifdef CONFIG_CPU_IDLE +/* + * hmp_idle_pull: + * + * In this version we have stopped using forced up migrations when we + * detect that a task running on a little CPU should be moved to a bigger + * CPU. In most cases, the bigger CPU is in a deep sleep state and a forced + * migration means we stop the task immediately but need to wait for the + * target CPU to wake up before we can restart the task which is being + * moved. Instead, we now wake a big CPU with an IPI and ask it to pull + * a task when ready. This allows the task to continue executing on its + * current CPU, reducing the amount of time that the task is stalled for. + * + * keepalive timers: + * + * The keepalive timer is used as a way to keep a CPU engaged in an + * idle pull operation out of idle while waiting for the source + * CPU to stop and move the task. Ideally this would not be necessary + * and we could impose a temporary zero-latency requirement on the + * current CPU, but in the current QoS framework this will result in + * all CPUs in the system being unable to enter idle states which is + * not desirable. The timer does not perform any work when it expires. + */ +struct hmp_keepalive { + bool init; + ktime_t delay; /* if zero, no need for timer */ + struct hrtimer timer; +}; +DEFINE_PER_CPU(struct hmp_keepalive, hmp_cpu_keepalive); + +/* setup per-cpu keepalive timers */ +static enum hrtimer_restart hmp_cpu_keepalive_notify(struct hrtimer *hrtimer) +{ + return HRTIMER_NORESTART; +} + +/* + * Work out if any of the idle states have an exit latency too high for us. + * ns_delay is passed in containing the max we are willing to tolerate. + * If there are none, set ns_delay to zero. + * If there are any, set ns_delay to + * ('target_residency of state with shortest too-big latency' - 1) * 1000. + */ +static void hmp_keepalive_delay(int cpu, unsigned int *ns_delay) +{ + struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu); + struct cpuidle_driver *drv; + + drv = cpuidle_get_cpu_driver(dev); + if (drv) { + unsigned int us_delay = UINT_MAX; + unsigned int us_max_delay = *ns_delay / 1000; + int idx; + /* if cpuidle states are guaranteed to be sorted we + * could stop at the first match. + */ + for (idx = 0; idx < drv->state_count; idx++) { + if (drv->states[idx].exit_latency > us_max_delay && + drv->states[idx].target_residency < us_delay) { + us_delay = drv->states[idx].target_residency; + } + } + if (us_delay == UINT_MAX) + *ns_delay = 0; /* no timer required */ + else + *ns_delay = 1000 * (us_delay - 1); + } +} + +static void hmp_cpu_keepalive_trigger(void) +{ + int cpu = smp_processor_id(); + struct hmp_keepalive *keepalive = &per_cpu(hmp_cpu_keepalive, cpu); + if (!keepalive->init) { + unsigned int ns_delay = 100000; /* tolerate 100usec delay */ + + hrtimer_init(&keepalive->timer, + CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); + keepalive->timer.function = hmp_cpu_keepalive_notify; + + hmp_keepalive_delay(cpu, &ns_delay); + keepalive->delay = ns_to_ktime(ns_delay); + keepalive->init = true; + } + if (ktime_to_ns(keepalive->delay)) + hrtimer_start(&keepalive->timer, + keepalive->delay, HRTIMER_MODE_REL_PINNED); +} + +static void hmp_cpu_keepalive_cancel(int cpu) +{ + struct hmp_keepalive *keepalive = &per_cpu(hmp_cpu_keepalive, cpu); + if (keepalive->init) + hrtimer_cancel(&keepalive->timer); +} +#else /* !CONFIG_CPU_IDLE */ +static void hmp_cpu_keepalive_trigger(void) +{ +} + +static void hmp_cpu_keepalive_cancel(int cpu) +{ +} +#endif + +/* Setup hmp_domains */ +static int __init hmp_cpu_mask_setup(void) +{ + char buf[64]; + struct hmp_domain *domain; + struct list_head *pos; + int dc, cpu; + + pr_debug("Initializing HMP scheduler:\n"); + + /* Initialize hmp_domains using platform code */ + arch_get_hmp_domains(&hmp_domains); + if (list_empty(&hmp_domains)) { + pr_debug("HMP domain list is empty!\n"); + return 0; + } + + /* Print hmp_domains */ + dc = 0; + list_for_each(pos, &hmp_domains) { + domain = list_entry(pos, struct hmp_domain, hmp_domains); + cpulist_scnprintf(buf, 64, &domain->possible_cpus); + pr_debug(" HMP domain %d: %s\n", dc, buf); + + for_each_cpu_mask(cpu, domain->possible_cpus) { + per_cpu(hmp_cpu_domain, cpu) = domain; + } + dc++; + } + + return 1; +} + +static struct hmp_domain *hmp_get_hmp_domain_for_cpu(int cpu) +{ + struct hmp_domain *domain; + struct list_head *pos; + + list_for_each(pos, &hmp_domains) { + domain = list_entry(pos, struct hmp_domain, hmp_domains); + if(cpumask_test_cpu(cpu, &domain->possible_cpus)) + return domain; + } + return NULL; +} + +static void hmp_online_cpu(int cpu) +{ + struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu); + + if(domain) + cpumask_set_cpu(cpu, &domain->cpus); +} + +static void hmp_offline_cpu(int cpu) +{ + struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu); + + if(domain) + cpumask_clear_cpu(cpu, &domain->cpus); + + hmp_cpu_keepalive_cancel(cpu); +} +/* + * Needed to determine heaviest tasks etc. + */ +static inline unsigned int hmp_cpu_is_fastest(int cpu); +static inline unsigned int hmp_cpu_is_slowest(int cpu); +static inline struct hmp_domain *hmp_slower_domain(int cpu); +static inline struct hmp_domain *hmp_faster_domain(int cpu); + +/* must hold runqueue lock for queue se is currently on */ +static struct sched_entity *hmp_get_heaviest_task( + struct sched_entity *se, int target_cpu) +{ + int num_tasks = hmp_max_tasks; + struct sched_entity *max_se = se; + unsigned long int max_ratio = se->avg.load_avg_ratio; + const struct cpumask *hmp_target_mask = NULL; + struct hmp_domain *hmp; + + if (hmp_cpu_is_fastest(cpu_of(se->cfs_rq->rq))) + return max_se; + + hmp = hmp_faster_domain(cpu_of(se->cfs_rq->rq)); + hmp_target_mask = &hmp->cpus; + if (target_cpu >= 0) { + /* idle_balance gets run on a CPU while + * it is in the middle of being hotplugged + * out. Bail early in that case. + */ + if(!cpumask_test_cpu(target_cpu, hmp_target_mask)) + return NULL; + hmp_target_mask = cpumask_of(target_cpu); + } + /* The currently running task is not on the runqueue */ + se = __pick_first_entity(cfs_rq_of(se)); + + while (num_tasks && se) { + if (entity_is_task(se) && + se->avg.load_avg_ratio > max_ratio && + cpumask_intersects(hmp_target_mask, + tsk_cpus_allowed(task_of(se)))) { + max_se = se; + max_ratio = se->avg.load_avg_ratio; + } + se = __pick_next_entity(se); + num_tasks--; + } + return max_se; +} + +static struct sched_entity *hmp_get_lightest_task( + struct sched_entity *se, int migrate_down) +{ + int num_tasks = hmp_max_tasks; + struct sched_entity *min_se = se; + unsigned long int min_ratio = se->avg.load_avg_ratio; + const struct cpumask *hmp_target_mask = NULL; + + if (migrate_down) { + struct hmp_domain *hmp; + if (hmp_cpu_is_slowest(cpu_of(se->cfs_rq->rq))) + return min_se; + hmp = hmp_slower_domain(cpu_of(se->cfs_rq->rq)); + hmp_target_mask = &hmp->cpus; + } + /* The currently running task is not on the runqueue */ + se = __pick_first_entity(cfs_rq_of(se)); + + while (num_tasks && se) { + if (entity_is_task(se) && + (se->avg.load_avg_ratio < min_ratio && + hmp_target_mask && + cpumask_intersects(hmp_target_mask, + tsk_cpus_allowed(task_of(se))))) { + min_se = se; + min_ratio = se->avg.load_avg_ratio; + } + se = __pick_next_entity(se); + num_tasks--; + } + return min_se; +} + +/* + * Migration thresholds should be in the range [0..1023] + * hmp_up_threshold: min. load required for migrating tasks to a faster cpu + * hmp_down_threshold: max. load allowed for tasks migrating to a slower cpu + * + * hmp_up_prio: Only up migrate task with high priority (<hmp_up_prio) + * hmp_next_up_threshold: Delay before next up migration (1024 ~= 1 ms) + * hmp_next_down_threshold: Delay before next down migration (1024 ~= 1 ms) + * + * Small Task Packing: + * We can choose to fill the littlest CPUs in an HMP system rather than + * the typical spreading mechanic. This behavior is controllable using + * two variables. + * hmp_packing_enabled: runtime control over pack/spread + * hmp_full_threshold: Consider a CPU with this much unweighted load full + */ +unsigned int hmp_up_threshold = 700; +unsigned int hmp_down_threshold = 512; +#ifdef CONFIG_SCHED_HMP_PRIO_FILTER +unsigned int hmp_up_prio = NICE_TO_PRIO(CONFIG_SCHED_HMP_PRIO_FILTER_VAL); +#endif +unsigned int hmp_next_up_threshold = 4096; +unsigned int hmp_next_down_threshold = 4096; + +#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING +/* + * Set the default packing threshold to try to keep little + * CPUs at no more than 80% of their maximum frequency if only + * packing a small number of small tasks. Bigger tasks will + * raise frequency as normal. + * In order to pack a task onto a CPU, the sum of the + * unweighted runnable_avg load of existing tasks plus the + * load of the new task must be less than hmp_full_threshold. + * + * This works in conjunction with frequency-invariant load + * and DVFS governors. Since most DVFS governors aim for 80% + * utilisation, we arrive at (0.8*0.8*(max_load=1024))=655 + * and use a value slightly lower to give a little headroom + * in the decision. + * Note that the most efficient frequency is different for + * each system so /sys/kernel/hmp/packing_limit should be + * configured at runtime for any given platform to achieve + * optimal energy usage. Some systems may not benefit from + * packing, so this feature can also be disabled at runtime + * with /sys/kernel/hmp/packing_enable + */ +unsigned int hmp_packing_enabled = 1; +unsigned int hmp_full_threshold = 650; +#endif + +static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se); +static unsigned int hmp_down_migration(int cpu, struct sched_entity *se); +static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd, + int *min_cpu, struct cpumask *affinity); + +static inline struct hmp_domain *hmp_smallest_domain(void) +{ + return list_entry(hmp_domains.prev, struct hmp_domain, hmp_domains); +} + +/* Check if cpu is in fastest hmp_domain */ +static inline unsigned int hmp_cpu_is_fastest(int cpu) +{ + struct list_head *pos; + + pos = &hmp_cpu_domain(cpu)->hmp_domains; + return pos == hmp_domains.next; +} + +/* Check if cpu is in slowest hmp_domain */ +static inline unsigned int hmp_cpu_is_slowest(int cpu) +{ + struct list_head *pos; + + pos = &hmp_cpu_domain(cpu)->hmp_domains; + return list_is_last(pos, &hmp_domains); +} + +/* Next (slower) hmp_domain relative to cpu */ +static inline struct hmp_domain *hmp_slower_domain(int cpu) +{ + struct list_head *pos; + + pos = &hmp_cpu_domain(cpu)->hmp_domains; + return list_entry(pos->next, struct hmp_domain, hmp_domains); +} + +/* Previous (faster) hmp_domain relative to cpu */ +static inline struct hmp_domain *hmp_faster_domain(int cpu) +{ + struct list_head *pos; + + pos = &hmp_cpu_domain(cpu)->hmp_domains; + return list_entry(pos->prev, struct hmp_domain, hmp_domains); +} + +/* + * Selects a cpu in previous (faster) hmp_domain + */ +static inline unsigned int hmp_select_faster_cpu(struct task_struct *tsk, + int cpu) +{ + int lowest_cpu=NR_CPUS; + __always_unused int lowest_ratio; + struct hmp_domain *hmp; + + if (hmp_cpu_is_fastest(cpu)) + hmp = hmp_cpu_domain(cpu); + else + hmp = hmp_faster_domain(cpu); + + lowest_ratio = hmp_domain_min_load(hmp, &lowest_cpu, + tsk_cpus_allowed(tsk)); + + return lowest_cpu; +} + +/* + * Selects a cpu in next (slower) hmp_domain + * Note that cpumask_any_and() returns the first cpu in the cpumask + */ +static inline unsigned int hmp_select_slower_cpu(struct task_struct *tsk, + int cpu) +{ + int lowest_cpu=NR_CPUS; + struct hmp_domain *hmp; + __always_unused int lowest_ratio; + + if (hmp_cpu_is_slowest(cpu)) + hmp = hmp_cpu_domain(cpu); + else + hmp = hmp_slower_domain(cpu); + + lowest_ratio = hmp_domain_min_load(hmp, &lowest_cpu, + tsk_cpus_allowed(tsk)); + + return lowest_cpu; +} +#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING +/* + * Select the 'best' candidate little CPU to wake up on. + * Implements a packing strategy which examines CPU in + * logical CPU order, and selects the first which will + * be loaded less than hmp_full_threshold according to + * the sum of the tracked load of the runqueue and the task. + */ +static inline unsigned int hmp_best_little_cpu(struct task_struct *tsk, + int cpu) { + int tmp_cpu; + unsigned long estimated_load; + struct hmp_domain *hmp; + struct sched_avg *avg; + struct cpumask allowed_hmp_cpus; + + if(!hmp_packing_enabled || + tsk->se.avg.load_avg_ratio > ((NICE_0_LOAD * 90)/100)) + return hmp_select_slower_cpu(tsk, cpu); + + if (hmp_cpu_is_slowest(cpu)) + hmp = hmp_cpu_domain(cpu); + else + hmp = hmp_slower_domain(cpu); + + /* respect affinity */ + cpumask_and(&allowed_hmp_cpus, &hmp->cpus, + tsk_cpus_allowed(tsk)); + + for_each_cpu_mask(tmp_cpu, allowed_hmp_cpus) { + avg = &cpu_rq(tmp_cpu)->avg; + /* estimate new rq load if we add this task */ + estimated_load = avg->load_avg_ratio + + tsk->se.avg.load_avg_ratio; + if (estimated_load <= hmp_full_threshold) { + cpu = tmp_cpu; + break; + } + } + /* if no match was found, the task uses the initial value */ + return cpu; +} +#endif +static inline void hmp_next_up_delay(struct sched_entity *se, int cpu) +{ + /* hack - always use clock from first online CPU */ + u64 now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task; + se->avg.hmp_last_up_migration = now; + se->avg.hmp_last_down_migration = 0; + cpu_rq(cpu)->avg.hmp_last_up_migration = now; + cpu_rq(cpu)->avg.hmp_last_down_migration = 0; +} + +static inline void hmp_next_down_delay(struct sched_entity *se, int cpu) +{ + /* hack - always use clock from first online CPU */ + u64 now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task; + se->avg.hmp_last_down_migration = now; + se->avg.hmp_last_up_migration = 0; + cpu_rq(cpu)->avg.hmp_last_down_migration = now; + cpu_rq(cpu)->avg.hmp_last_up_migration = 0; +} + +/* + * Heterogenous multiprocessor (HMP) optimizations + * + * These functions allow to change the growing speed of the load_avg_ratio + * by default it goes from 0 to 0.5 in LOAD_AVG_PERIOD = 32ms + * This can now be changed with /sys/kernel/hmp/load_avg_period_ms. + * + * These functions also allow to change the up and down threshold of HMP + * using /sys/kernel/hmp/{up,down}_threshold. + * Both must be between 0 and 1023. The threshold that is compared + * to the load_avg_ratio is up_threshold/1024 and down_threshold/1024. + * + * For instance, if load_avg_period = 64 and up_threshold = 512, an idle + * task with a load of 0 will reach the threshold after 64ms of busy loop. + * + * Changing load_avg_periods_ms has the same effect than changing the + * default scaling factor Y=1002/1024 in the load_avg_ratio computation to + * (1002/1024.0)^(LOAD_AVG_PERIOD/load_avg_period_ms), but the last one + * could trigger overflows. + * For instance, with Y = 1023/1024 in __update_task_entity_contrib() + * "contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);" + * could be overflowed for a weight > 2^12 even is the load_avg_contrib + * should still be a 32bits result. This would not happen by multiplicating + * delta time by 1/22 and setting load_avg_period_ms = 706. + */ + +/* + * By scaling the delta time it end-up increasing or decrease the + * growing speed of the per entity load_avg_ratio + * The scale factor hmp_data.multiplier is a fixed point + * number: (32-HMP_VARIABLE_SCALE_SHIFT).HMP_VARIABLE_SCALE_SHIFT + */ +static inline u64 hmp_variable_scale_convert(u64 delta) +{ +#ifdef CONFIG_HMP_VARIABLE_SCALE + u64 high = delta >> 32ULL; + u64 low = delta & 0xffffffffULL; + low *= hmp_data.multiplier; + high *= hmp_data.multiplier; + return (low >> HMP_VARIABLE_SCALE_SHIFT) + + (high << (32ULL - HMP_VARIABLE_SCALE_SHIFT)); +#else + return delta; +#endif +} + +static ssize_t hmp_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct hmp_global_attr *hmp_attr = + container_of(attr, struct hmp_global_attr, attr); + int temp; + + if (hmp_attr->to_sysfs_text != NULL) + return hmp_attr->to_sysfs_text(buf, PAGE_SIZE); + + temp = *(hmp_attr->value); + if (hmp_attr->to_sysfs != NULL) + temp = hmp_attr->to_sysfs(temp); + + return (ssize_t)sprintf(buf, "%d\n", temp); +} + +static ssize_t hmp_store(struct kobject *a, struct attribute *attr, + const char *buf, size_t count) +{ + int temp; + ssize_t ret = count; + struct hmp_global_attr *hmp_attr = + container_of(attr, struct hmp_global_attr, attr); + char *str = vmalloc(count + 1); + if (str == NULL) + return -ENOMEM; + memcpy(str, buf, count); + str[count] = 0; + if (sscanf(str, "%d", &temp) < 1) + ret = -EINVAL; + else { + if (hmp_attr->from_sysfs != NULL) + temp = hmp_attr->from_sysfs(temp); + if (temp < 0) + ret = -EINVAL; + else + *(hmp_attr->value) = temp; + } + vfree(str); + return ret; +} + +static ssize_t hmp_print_domains(char *outbuf, int outbufsize) +{ + char buf[64]; + const char nospace[] = "%s", space[] = " %s"; + const char *fmt = nospace; + struct hmp_domain *domain; + struct list_head *pos; + int outpos = 0; + list_for_each(pos, &hmp_domains) { + domain = list_entry(pos, struct hmp_domain, hmp_domains); + if (cpumask_scnprintf(buf, 64, &domain->possible_cpus)) { + outpos += sprintf(outbuf+outpos, fmt, buf); + fmt = space; + } + } + strcat(outbuf, "\n"); + return outpos+1; +} + +#ifdef CONFIG_HMP_VARIABLE_SCALE +static int hmp_period_tofrom_sysfs(int value) +{ + return (LOAD_AVG_PERIOD << HMP_VARIABLE_SCALE_SHIFT) / value; +} +#endif +/* max value for threshold is 1024 */ +static int hmp_theshold_from_sysfs(int value) +{ + if (value > 1024) + return -1; + return value; +} +#if defined(CONFIG_SCHED_HMP_LITTLE_PACKING) || \ + defined(CONFIG_HMP_FREQUENCY_INVARIANT_SCALE) +/* toggle control is only 0,1 off/on */ +static int hmp_toggle_from_sysfs(int value) +{ + if (value < 0 || value > 1) + return -1; + return value; +} +#endif +#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING +/* packing value must be non-negative */ +static int hmp_packing_from_sysfs(int value) +{ + if (value < 0) + return -1; + return value; +} +#endif +static void hmp_attr_add( + const char *name, + int *value, + int (*to_sysfs)(int), + int (*from_sysfs)(int), + ssize_t (*to_sysfs_text)(char *, int), + umode_t mode) +{ + int i = 0; + while (hmp_data.attributes[i] != NULL) { + i++; + if (i >= HMP_DATA_SYSFS_MAX) + return; + } + if (mode) + hmp_data.attr[i].attr.mode = mode; + else + hmp_data.attr[i].attr.mode = 0644; + hmp_data.attr[i].show = hmp_show; + hmp_data.attr[i].store = hmp_store; + hmp_data.attr[i].attr.name = name; + hmp_data.attr[i].value = value; + hmp_data.attr[i].to_sysfs = to_sysfs; + hmp_data.attr[i].from_sysfs = from_sysfs; + hmp_data.attr[i].to_sysfs_text = to_sysfs_text; + hmp_data.attributes[i] = &hmp_data.attr[i].attr; + hmp_data.attributes[i + 1] = NULL; +} + +static int hmp_attr_init(void) +{ + int ret; + memset(&hmp_data, sizeof(hmp_data), 0); + hmp_attr_add("hmp_domains", + NULL, + NULL, + NULL, + hmp_print_domains, + 0444); + hmp_attr_add("up_threshold", + &hmp_up_threshold, + NULL, + hmp_theshold_from_sysfs, + NULL, + 0); + hmp_attr_add("down_threshold", + &hmp_down_threshold, + NULL, + hmp_theshold_from_sysfs, + NULL, + 0); +#ifdef CONFIG_HMP_VARIABLE_SCALE + /* by default load_avg_period_ms == LOAD_AVG_PERIOD + * meaning no change + */ + hmp_data.multiplier = hmp_period_tofrom_sysfs(LOAD_AVG_PERIOD); + hmp_attr_add("load_avg_period_ms", + &hmp_data.multiplier, + hmp_period_tofrom_sysfs, + hmp_period_tofrom_sysfs, + NULL, + 0); +#endif +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + /* default frequency-invariant scaling ON */ + hmp_data.freqinvar_load_scale_enabled = 1; + hmp_attr_add("frequency_invariant_load_scale", + &hmp_data.freqinvar_load_scale_enabled, + NULL, + hmp_toggle_from_sysfs, + NULL, + 0); +#endif +#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING + hmp_attr_add("packing_enable", + &hmp_packing_enabled, + NULL, + hmp_toggle_from_sysfs, + NULL, + 0); + hmp_attr_add("packing_limit", + &hmp_full_threshold, + NULL, + hmp_packing_from_sysfs, + NULL, + 0); +#endif + hmp_data.attr_group.name = "hmp"; + hmp_data.attr_group.attrs = hmp_data.attributes; + ret = sysfs_create_group(kernel_kobj, + &hmp_data.attr_group); + return 0; +} +late_initcall(hmp_attr_init); +/* + * return the load of the lowest-loaded CPU in a given HMP domain + * min_cpu optionally points to an int to receive the CPU. + * affinity optionally points to a cpumask containing the + * CPUs to be considered. note: + * + min_cpu = NR_CPUS only if no CPUs are in the set of + * affinity && hmp_domain cpus + * + min_cpu will always otherwise equal one of the CPUs in + * the hmp domain + * + when more than one CPU has the same load, the one which + * is least-recently-disturbed by an HMP migration will be + * selected + * + if all CPUs are equally loaded or idle and the times are + * all the same, the first in the set will be used + * + if affinity is not set, cpu_online_mask is used + */ +static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd, + int *min_cpu, struct cpumask *affinity) +{ + int cpu; + int min_cpu_runnable_temp = NR_CPUS; + u64 min_target_last_migration = ULLONG_MAX; + u64 curr_last_migration; + unsigned long min_runnable_load = INT_MAX; + unsigned long contrib; + struct sched_avg *avg; + struct cpumask temp_cpumask; + /* + * only look at CPUs allowed if specified, + * otherwise look at all online CPUs in the + * right HMP domain + */ + cpumask_and(&temp_cpumask, &hmpd->cpus, affinity ? affinity : cpu_online_mask); + + for_each_cpu_mask(cpu, temp_cpumask) { + avg = &cpu_rq(cpu)->avg; + /* used for both up and down migration */ + curr_last_migration = avg->hmp_last_up_migration ? + avg->hmp_last_up_migration : avg->hmp_last_down_migration; + + contrib = avg->load_avg_ratio; + /* + * Consider a runqueue completely busy if there is any load + * on it. Definitely not the best for overall fairness, but + * does well in typical Android use cases. + */ + if (contrib) + contrib = 1023; + + if ((contrib < min_runnable_load) || + (contrib == min_runnable_load && + curr_last_migration < min_target_last_migration)) { + /* + * if the load is the same target the CPU with + * the longest time since a migration. + * This is to spread migration load between + * members of a domain more evenly when the + * domain is fully loaded + */ + min_runnable_load = contrib; + min_cpu_runnable_temp = cpu; + min_target_last_migration = curr_last_migration; + } + } + + if (min_cpu) + *min_cpu = min_cpu_runnable_temp; + + return min_runnable_load; +} + +/* + * Calculate the task starvation + * This is the ratio of actually running time vs. runnable time. + * If the two are equal the task is getting the cpu time it needs or + * it is alone on the cpu and the cpu is fully utilized. + */ +static inline unsigned int hmp_task_starvation(struct sched_entity *se) +{ + u32 starvation; + + starvation = se->avg.usage_avg_sum * scale_load_down(NICE_0_LOAD); + starvation /= (se->avg.runnable_avg_sum + 1); + + return scale_load(starvation); +} + +static inline unsigned int hmp_offload_down(int cpu, struct sched_entity *se) +{ + int min_usage; + int dest_cpu = NR_CPUS; + + if (hmp_cpu_is_slowest(cpu)) + return NR_CPUS; + + /* Is there an idle CPU in the current domain */ + min_usage = hmp_domain_min_load(hmp_cpu_domain(cpu), NULL, NULL); + if (min_usage == 0) { + trace_sched_hmp_offload_abort(cpu, min_usage, "load"); + return NR_CPUS; + } + + /* Is the task alone on the cpu? */ + if (cpu_rq(cpu)->cfs.h_nr_running < 2) { + trace_sched_hmp_offload_abort(cpu, + cpu_rq(cpu)->cfs.h_nr_running, "nr_running"); + return NR_CPUS; + } + + /* Is the task actually starving? */ + /* >=25% ratio running/runnable = starving */ + if (hmp_task_starvation(se) > 768) { + trace_sched_hmp_offload_abort(cpu, hmp_task_starvation(se), + "starvation"); + return NR_CPUS; + } + + /* Does the slower domain have any idle CPUs? */ + min_usage = hmp_domain_min_load(hmp_slower_domain(cpu), &dest_cpu, + tsk_cpus_allowed(task_of(se))); + + if (min_usage == 0) { + trace_sched_hmp_offload_succeed(cpu, dest_cpu); + return dest_cpu; + } else + trace_sched_hmp_offload_abort(cpu,min_usage,"slowdomain"); + return NR_CPUS; +} +#endif /* CONFIG_SCHED_HMP */ + /* * sched_balance_self: balance the current task (running on cpu) in domains * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and @@ -3339,6 +4383,19 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) if (p->nr_cpus_allowed == 1) return prev_cpu; +#ifdef CONFIG_SCHED_HMP + /* always put non-kernel forking tasks on a big domain */ + if (unlikely(sd_flag & SD_BALANCE_FORK) && hmp_task_should_forkboost(p)) { + new_cpu = hmp_select_faster_cpu(p, prev_cpu); + if (new_cpu != NR_CPUS) { + hmp_next_up_delay(&p->se, new_cpu); + return new_cpu; + } + /* failed to perform HMP fork balance, use normal balance */ + new_cpu = cpu; + } +#endif + if (sd_flag & SD_BALANCE_WAKE) { if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) want_affine = 1; @@ -3413,6 +4470,35 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) unlock: rcu_read_unlock(); +#ifdef CONFIG_SCHED_HMP + prev_cpu = task_cpu(p); + + if (hmp_up_migration(prev_cpu, &new_cpu, &p->se)) { + hmp_next_up_delay(&p->se, new_cpu); + trace_sched_hmp_migrate(p, new_cpu, HMP_MIGRATE_WAKEUP); + return new_cpu; + } + if (hmp_down_migration(prev_cpu, &p->se)) { +#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING + new_cpu = hmp_best_little_cpu(p, prev_cpu); +#else + new_cpu = hmp_select_slower_cpu(p, prev_cpu); +#endif + /* + * we might have no suitable CPU + * in which case new_cpu == NR_CPUS + */ + if (new_cpu < NR_CPUS && new_cpu != prev_cpu) { + hmp_next_down_delay(&p->se, new_cpu); + trace_sched_hmp_migrate(p, new_cpu, HMP_MIGRATE_WAKEUP); + return new_cpu; + } + } + /* Make sure that the task stays in its previous hmp domain */ + if (!cpumask_test_cpu(new_cpu, &hmp_cpu_domain(prev_cpu)->cpus)) + return prev_cpu; +#endif + return new_cpu; } @@ -3422,6 +4508,16 @@ unlock: * load-balance). */ #ifdef CONFIG_FAIR_GROUP_SCHED + +#ifdef CONFIG_NO_HZ_COMMON +static int nohz_test_cpu(int cpu); +#else +static inline int nohz_test_cpu(int cpu) +{ + return 0; +} +#endif + /* * Called immediately before a task is migrated to a new cpu; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the @@ -3441,6 +4537,25 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu) * be negative here since on-rq tasks have decay-count == 0. */ if (se->avg.decay_count) { + /* + * If we migrate a sleeping task away from a CPU + * which has the tick stopped, then both the clock_task + * and decay_counter will be out of date for that CPU + * and we will not decay load correctly. + */ + if (!se->on_rq && nohz_test_cpu(task_cpu(p))) { + struct rq *rq = cpu_rq(task_cpu(p)); + unsigned long flags; + /* + * Current CPU cannot be holding rq->lock in this + * circumstance, but another might be. We must hold + * rq->lock before we go poking around in its clocks + */ + raw_spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); + update_cfs_rq_blocked_load(cfs_rq, 0); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } se->avg.decay_count = -__synchronize_entity_decay(se); atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); } @@ -3946,7 +5061,6 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 1) task is cache cold, or * 2) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); if (!tsk_cache_hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { @@ -5232,6 +6346,16 @@ out: return ld_moved; } +#ifdef CONFIG_SCHED_HMP +static unsigned int hmp_idle_pull(int this_cpu); +static int move_specific_task(struct lb_env *env, struct task_struct *pm); +#else +static int move_specific_task(struct lb_env *env, struct task_struct *pm) +{ + return 0; +} +#endif + /* * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. @@ -5276,7 +6400,10 @@ void idle_balance(int this_cpu, struct rq *this_rq) } } rcu_read_unlock(); - +#ifdef CONFIG_SCHED_HMP + if (!pulled_task) + pulled_task = hmp_idle_pull(this_cpu); +#endif raw_spin_lock(&this_rq->lock); if (pulled_task || time_after(jiffies, this_rq->next_balance)) { @@ -5288,22 +6415,19 @@ void idle_balance(int this_cpu, struct rq *this_rq) } } -/* - * active_load_balance_cpu_stop is run by cpu stopper. It pushes - * running tasks off the busiest CPU onto idle CPUs. It requires at - * least 1 task to be running on each physical CPU where possible, and - * avoids physical / logical imbalances. - */ -static int active_load_balance_cpu_stop(void *data) +static int __do_active_load_balance_cpu_stop(void *data, bool check_sd_lb_flag) { struct rq *busiest_rq = data; int busiest_cpu = cpu_of(busiest_rq); int target_cpu = busiest_rq->push_cpu; struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; + struct task_struct *p = NULL; raw_spin_lock_irq(&busiest_rq->lock); - +#ifdef CONFIG_SCHED_HMP + p = busiest_rq->migrate_task; +#endif /* make sure the requested cpu hasn't gone down in the meantime */ if (unlikely(busiest_cpu != smp_processor_id() || !busiest_rq->active_balance)) @@ -5313,6 +6437,11 @@ static int active_load_balance_cpu_stop(void *data) if (busiest_rq->nr_running <= 1) goto out_unlock; + if (!check_sd_lb_flag) { + /* Task has migrated meanwhile, abort forced migration */ + if (task_rq(p) != busiest_rq) + goto out_unlock; + } /* * This condition is "impossible", if it occurs * we need to fix it. Originally reported by @@ -5326,12 +6455,14 @@ static int active_load_balance_cpu_stop(void *data) /* Search for an sd spanning us and the target CPU. */ rcu_read_lock(); for_each_domain(target_cpu, sd) { - if ((sd->flags & SD_LOAD_BALANCE) && - cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) + if (((check_sd_lb_flag && sd->flags & SD_LOAD_BALANCE) || + !check_sd_lb_flag) && + cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) break; } if (likely(sd)) { + bool success = false; struct lb_env env = { .sd = sd, .dst_cpu = target_cpu, @@ -5343,7 +6474,14 @@ static int active_load_balance_cpu_stop(void *data) schedstat_inc(sd, alb_count); - if (move_one_task(&env)) + if (check_sd_lb_flag) { + if (move_one_task(&env)) + success = true; + } else { + if (move_specific_task(&env, p)) + success = true; + } + if (success) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); @@ -5351,11 +6489,24 @@ static int active_load_balance_cpu_stop(void *data) rcu_read_unlock(); double_unlock_balance(busiest_rq, target_rq); out_unlock: + if (!check_sd_lb_flag) + put_task_struct(p); busiest_rq->active_balance = 0; raw_spin_unlock_irq(&busiest_rq->lock); return 0; } +/* + * active_load_balance_cpu_stop is run by cpu stopper. It pushes + * running tasks off the busiest CPU onto idle CPUs. It requires at + * least 1 task to be running on each physical CPU where possible, and + * avoids physical / logical imbalances. + */ +static int active_load_balance_cpu_stop(void *data) +{ + return __do_active_load_balance_cpu_stop(data, true); +} + #ifdef CONFIG_NO_HZ_COMMON /* * idle load balancing details @@ -5369,12 +6520,80 @@ static struct { unsigned long next_balance; /* in jiffy units */ } nohz ____cacheline_aligned; +/* + * nohz_test_cpu used when load tracking is enabled. FAIR_GROUP_SCHED + * dependency below may be removed when load tracking guards are + * removed. + */ +#ifdef CONFIG_FAIR_GROUP_SCHED +static int nohz_test_cpu(int cpu) +{ + return cpumask_test_cpu(cpu, nohz.idle_cpus_mask); +} +#endif + +#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING +/* + * Decide if the tasks on the busy CPUs in the + * littlest domain would benefit from an idle balance + */ +static int hmp_packing_ilb_needed(int cpu, int ilb_needed) +{ + struct hmp_domain *hmp; + /* allow previous decision on non-slowest domain */ + if (!hmp_cpu_is_slowest(cpu)) + return ilb_needed; + + /* if disabled, use normal ILB behaviour */ + if (!hmp_packing_enabled) + return ilb_needed; + + hmp = hmp_cpu_domain(cpu); + for_each_cpu_and(cpu, &hmp->cpus, nohz.idle_cpus_mask) { + /* only idle balance if a CPU is loaded over threshold */ + if (cpu_rq(cpu)->avg.load_avg_ratio > hmp_full_threshold) + return 1; + } + return 0; +} +#endif + +DEFINE_PER_CPU(cpumask_var_t, ilb_tmpmask); + static inline int find_new_ilb(int call_cpu) { int ilb = cpumask_first(nohz.idle_cpus_mask); +#ifdef CONFIG_SCHED_HMP + int ilb_needed = 0; + int cpu; + struct cpumask* tmp = per_cpu(ilb_tmpmask, smp_processor_id()); + + /* restrict nohz balancing to occur in the same hmp domain */ + ilb = cpumask_first_and(nohz.idle_cpus_mask, + &((struct hmp_domain *)hmp_cpu_domain(call_cpu))->cpus); + + /* check to see if it's necessary within this domain */ + cpumask_andnot(tmp, + &((struct hmp_domain *)hmp_cpu_domain(call_cpu))->cpus, + nohz.idle_cpus_mask); + for_each_cpu(cpu, tmp) { + if (cpu_rq(cpu)->nr_running > 1) { + ilb_needed = 1; + break; + } + } + +#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING + if (ilb < nr_cpu_ids) + ilb_needed = hmp_packing_ilb_needed(ilb, ilb_needed); +#endif + if (ilb_needed && ilb < nr_cpu_ids && idle_cpu(ilb)) + return ilb; +#else if (ilb < nr_cpu_ids && idle_cpu(ilb)) return ilb; +#endif return nr_cpu_ids; } @@ -5651,6 +6870,18 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) if (time_before(now, nohz.next_balance)) return 0; +#ifdef CONFIG_SCHED_HMP + /* + * Bail out if there are no nohz CPUs in our + * HMP domain, since we will move tasks between + * domains through wakeup and force balancing + * as necessary based upon task load. + */ + if (cpumask_first_and(nohz.idle_cpus_mask, + &((struct hmp_domain *)hmp_cpu_domain(cpu))->cpus) >= nr_cpu_ids) + return 0; +#endif + if (rq->nr_running >= 2) goto need_kick; @@ -5683,6 +6914,442 @@ need_kick: static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } #endif +#ifdef CONFIG_SCHED_HMP +static unsigned int hmp_task_eligible_for_up_migration(struct sched_entity *se) +{ + /* below hmp_up_threshold, never eligible */ + if (se->avg.load_avg_ratio < hmp_up_threshold) + return 0; + return 1; +} + +/* Check if task should migrate to a faster cpu */ +static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + int temp_target_cpu; + u64 now; + + if (hmp_cpu_is_fastest(cpu)) + return 0; + +#ifdef CONFIG_SCHED_HMP_PRIO_FILTER + /* Filter by task priority */ + if (p->prio >= hmp_up_prio) + return 0; +#endif + if (!hmp_task_eligible_for_up_migration(se)) + return 0; + + /* Let the task load settle before doing another up migration */ + /* hack - always use clock from first online CPU */ + now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task; + if (((now - se->avg.hmp_last_up_migration) >> 10) + < hmp_next_up_threshold) + return 0; + + /* hmp_domain_min_load only returns 0 for an + * idle CPU or 1023 for any partly-busy one. + * Be explicit about requirement for an idle CPU. + */ + if (hmp_domain_min_load(hmp_faster_domain(cpu), &temp_target_cpu, + tsk_cpus_allowed(p)) == 0 && temp_target_cpu != NR_CPUS) { + if(target_cpu) + *target_cpu = temp_target_cpu; + return 1; + } + return 0; +} + +/* Check if task should migrate to a slower cpu */ +static unsigned int hmp_down_migration(int cpu, struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + u64 now; + + if (hmp_cpu_is_slowest(cpu)) { +#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING + if(hmp_packing_enabled) + return 1; + else +#endif + return 0; + } + +#ifdef CONFIG_SCHED_HMP_PRIO_FILTER + /* Filter by task priority */ + if ((p->prio >= hmp_up_prio) && + cpumask_intersects(&hmp_slower_domain(cpu)->cpus, + tsk_cpus_allowed(p))) { + return 1; + } +#endif + + /* Let the task load settle before doing another down migration */ + /* hack - always use clock from first online CPU */ + now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task; + if (((now - se->avg.hmp_last_down_migration) >> 10) + < hmp_next_down_threshold) + return 0; + + if (cpumask_intersects(&hmp_slower_domain(cpu)->cpus, + tsk_cpus_allowed(p)) + && se->avg.load_avg_ratio < hmp_down_threshold) { + return 1; + } + return 0; +} + +/* + * hmp_can_migrate_task - may task p from runqueue rq be migrated to this_cpu? + * Ideally this function should be merged with can_migrate_task() to avoid + * redundant code. + */ +static int hmp_can_migrate_task(struct task_struct *p, struct lb_env *env) +{ + int tsk_cache_hot = 0; + + /* + * We do not migrate tasks that are: + * 1) running (obviously), or + * 2) cannot be migrated to this CPU due to cpus_allowed + */ + if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { + schedstat_inc(p, se.statistics.nr_failed_migrations_affine); + return 0; + } + env->flags &= ~LBF_ALL_PINNED; + + if (task_running(env->src_rq, p)) { + schedstat_inc(p, se.statistics.nr_failed_migrations_running); + return 0; + } + + /* + * Aggressive migration if: + * 1) task is cache cold, or + * 2) too many balance attempts have failed. + */ + + tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); + if (!tsk_cache_hot || + env->sd->nr_balance_failed > env->sd->cache_nice_tries) { +#ifdef CONFIG_SCHEDSTATS + if (tsk_cache_hot) { + schedstat_inc(env->sd, lb_hot_gained[env->idle]); + schedstat_inc(p, se.statistics.nr_forced_migrations); + } +#endif + return 1; + } + + return 1; +} + +/* + * move_specific_task tries to move a specific task. + * Returns 1 if successful and 0 otherwise. + * Called with both runqueues locked. + */ +static int move_specific_task(struct lb_env *env, struct task_struct *pm) +{ + struct task_struct *p, *n; + + list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { + if (throttled_lb_pair(task_group(p), env->src_rq->cpu, + env->dst_cpu)) + continue; + + if (!hmp_can_migrate_task(p, env)) + continue; + /* Check if we found the right task */ + if (p != pm) + continue; + + move_task(p, env); + /* + * Right now, this is only the third place move_task() + * is called, so we can safely collect move_task() + * stats here rather than inside move_task(). + */ + schedstat_inc(env->sd, lb_gained[env->idle]); + return 1; + } + return 0; +} + +/* + * hmp_active_task_migration_cpu_stop is run by cpu stopper and used to + * migrate a specific task from one runqueue to another. + * hmp_force_up_migration uses this to push a currently running task + * off a runqueue. hmp_idle_pull uses this to pull a currently + * running task to an idle runqueue. + * Reuses __do_active_load_balance_cpu_stop to actually do the work. + */ +static int hmp_active_task_migration_cpu_stop(void *data) +{ + return __do_active_load_balance_cpu_stop(data, false); +} + +/* + * Move task in a runnable state to another CPU. + * + * Tailored on 'active_load_balance_cpu_stop' with slight + * modification to locking and pre-transfer checks. Note + * rq->lock must be held before calling. + */ +static void hmp_migrate_runnable_task(struct rq *rq) +{ + struct sched_domain *sd; + int src_cpu = cpu_of(rq); + struct rq *src_rq = rq; + int dst_cpu = rq->push_cpu; + struct rq *dst_rq = cpu_rq(dst_cpu); + struct task_struct *p = rq->migrate_task; + /* + * One last check to make sure nobody else is playing + * with the source rq. + */ + if (src_rq->active_balance) + goto out; + + if (src_rq->nr_running <= 1) + goto out; + + if (task_rq(p) != src_rq) + goto out; + /* + * Not sure if this applies here but one can never + * be too cautious + */ + BUG_ON(src_rq == dst_rq); + + double_lock_balance(src_rq, dst_rq); + + rcu_read_lock(); + for_each_domain(dst_cpu, sd) { + if (cpumask_test_cpu(src_cpu, sched_domain_span(sd))) + break; + } + + if (likely(sd)) { + struct lb_env env = { + .sd = sd, + .dst_cpu = dst_cpu, + .dst_rq = dst_rq, + .src_cpu = src_cpu, + .src_rq = src_rq, + .idle = CPU_IDLE, + }; + + schedstat_inc(sd, alb_count); + + if (move_specific_task(&env, p)) + schedstat_inc(sd, alb_pushed); + else + schedstat_inc(sd, alb_failed); + } + + rcu_read_unlock(); + double_unlock_balance(src_rq, dst_rq); +out: + put_task_struct(p); +} + +static DEFINE_SPINLOCK(hmp_force_migration); + +/* + * hmp_force_up_migration checks runqueues for tasks that need to + * be actively migrated to a faster cpu. + */ +static void hmp_force_up_migration(int this_cpu) +{ + int cpu, target_cpu; + struct sched_entity *curr, *orig; + struct rq *target; + unsigned long flags; + unsigned int force, got_target; + struct task_struct *p; + + if (!spin_trylock(&hmp_force_migration)) + return; + for_each_online_cpu(cpu) { + force = 0; + got_target = 0; + target = cpu_rq(cpu); + raw_spin_lock_irqsave(&target->lock, flags); + curr = target->cfs.curr; + if (!curr || target->active_balance) { + raw_spin_unlock_irqrestore(&target->lock, flags); + continue; + } + if (!entity_is_task(curr)) { + struct cfs_rq *cfs_rq; + + cfs_rq = group_cfs_rq(curr); + while (cfs_rq) { + curr = cfs_rq->curr; + cfs_rq = group_cfs_rq(curr); + } + } + orig = curr; + curr = hmp_get_heaviest_task(curr, -1); + if (!curr) { + raw_spin_unlock_irqrestore(&target->lock, flags); + continue; + } + p = task_of(curr); + if (hmp_up_migration(cpu, &target_cpu, curr)) { + cpu_rq(target_cpu)->wake_for_idle_pull = 1; + raw_spin_unlock_irqrestore(&target->lock, flags); + spin_unlock(&hmp_force_migration); + smp_send_reschedule(target_cpu); + return; + } + if (!got_target) { + /* + * For now we just check the currently running task. + * Selecting the lightest task for offloading will + * require extensive book keeping. + */ + curr = hmp_get_lightest_task(orig, 1); + p = task_of(curr); + target->push_cpu = hmp_offload_down(cpu, curr); + if (target->push_cpu < NR_CPUS) { + get_task_struct(p); + target->migrate_task = p; + got_target = 1; + trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_OFFLOAD); + hmp_next_down_delay(&p->se, target->push_cpu); + } + } + /* + * We have a target with no active_balance. If the task + * is not currently running move it, otherwise let the + * CPU stopper take care of it. + */ + if (got_target) { + if (!task_running(target, p)) { + trace_sched_hmp_migrate_force_running(p, 0); + hmp_migrate_runnable_task(target); + } else { + target->active_balance = 1; + force = 1; + } + } + + raw_spin_unlock_irqrestore(&target->lock, flags); + + if (force) + stop_one_cpu_nowait(cpu_of(target), + hmp_active_task_migration_cpu_stop, + target, &target->active_balance_work); + } + spin_unlock(&hmp_force_migration); +} +/* + * hmp_idle_pull looks at little domain runqueues to see + * if a task should be pulled. + * + * Reuses hmp_force_migration spinlock. + * + */ +static unsigned int hmp_idle_pull(int this_cpu) +{ + int cpu; + struct sched_entity *curr, *orig; + struct hmp_domain *hmp_domain = NULL; + struct rq *target = NULL, *rq; + unsigned long flags, ratio = 0; + unsigned int force = 0; + struct task_struct *p = NULL; + + if (!hmp_cpu_is_slowest(this_cpu)) + hmp_domain = hmp_slower_domain(this_cpu); + if (!hmp_domain) + return 0; + + if (!spin_trylock(&hmp_force_migration)) + return 0; + + /* first select a task */ + for_each_cpu(cpu, &hmp_domain->cpus) { + rq = cpu_rq(cpu); + raw_spin_lock_irqsave(&rq->lock, flags); + curr = rq->cfs.curr; + if (!curr) { + raw_spin_unlock_irqrestore(&rq->lock, flags); + continue; + } + if (!entity_is_task(curr)) { + struct cfs_rq *cfs_rq; + + cfs_rq = group_cfs_rq(curr); + while (cfs_rq) { + curr = cfs_rq->curr; + if (!entity_is_task(curr)) + cfs_rq = group_cfs_rq(curr); + else + cfs_rq = NULL; + } + } + orig = curr; + curr = hmp_get_heaviest_task(curr, this_cpu); + /* check if heaviest eligible task on this + * CPU is heavier than previous task + */ + if (curr && hmp_task_eligible_for_up_migration(curr) && + curr->avg.load_avg_ratio > ratio && + cpumask_test_cpu(this_cpu, + tsk_cpus_allowed(task_of(curr)))) { + p = task_of(curr); + target = rq; + ratio = curr->avg.load_avg_ratio; + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + + if (!p) + goto done; + + /* now we have a candidate */ + raw_spin_lock_irqsave(&target->lock, flags); + if (!target->active_balance && task_rq(p) == target) { + get_task_struct(p); + target->push_cpu = this_cpu; + target->migrate_task = p; + trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_IDLE_PULL); + hmp_next_up_delay(&p->se, target->push_cpu); + /* + * if the task isn't running move it right away. + * Otherwise setup the active_balance mechanic and let + * the CPU stopper do its job. + */ + if (!task_running(target, p)) { + trace_sched_hmp_migrate_idle_running(p, 0); + hmp_migrate_runnable_task(target); + } else { + target->active_balance = 1; + force = 1; + } + } + raw_spin_unlock_irqrestore(&target->lock, flags); + + if (force) { + /* start timer to keep us awake */ + hmp_cpu_keepalive_trigger(); + stop_one_cpu_nowait(cpu_of(target), + hmp_active_task_migration_cpu_stop, + target, &target->active_balance_work); + } +done: + spin_unlock(&hmp_force_migration); + return force; +} +#else +static void hmp_force_up_migration(int this_cpu) { } +#endif /* CONFIG_SCHED_HMP */ + /* * run_rebalance_domains is triggered when needed from the scheduler tick. * Also triggered for nohz idle balancing (with nohz_balancing_kick set). @@ -5694,6 +7361,20 @@ static void run_rebalance_domains(struct softirq_action *h) enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE; +#ifdef CONFIG_SCHED_HMP + /* shortcut for hmp idle pull wakeups */ + if (unlikely(this_rq->wake_for_idle_pull)) { + this_rq->wake_for_idle_pull = 0; + if (hmp_idle_pull(this_cpu)) { + /* break out unless running nohz idle as well */ + if (idle != CPU_IDLE) + return; + } + } +#endif + + hmp_force_up_migration(this_cpu); + rebalance_domains(this_cpu, idle); /* @@ -5726,11 +7407,17 @@ void trigger_load_balance(struct rq *rq, int cpu) static void rq_online_fair(struct rq *rq) { +#ifdef CONFIG_SCHED_HMP + hmp_online_cpu(rq->cpu); +#endif update_sysctl(); } static void rq_offline_fair(struct rq *rq) { +#ifdef CONFIG_SCHED_HMP + hmp_offline_cpu(rq->cpu); +#endif update_sysctl(); /* Ensure any throttled groups are reachable by pick_next_task */ @@ -5778,11 +7465,15 @@ static void task_fork_fair(struct task_struct *p) cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; - if (unlikely(task_cpu(p) != this_cpu)) { - rcu_read_lock(); - __set_task_cpu(p, this_cpu); - rcu_read_unlock(); - } + /* + * Not only the cpu but also the task_group of the parent might have + * been changed after parent->se.parent,cfs_rq were copied to + * child->se.parent,cfs_rq. So call __set_task_cpu() to make those + * of child point to valid ones. + */ + rcu_read_lock(); + __set_task_cpu(p, this_cpu); + rcu_read_unlock(); update_curr(cfs_rq); @@ -5832,15 +7523,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) struct cfs_rq *cfs_rq = cfs_rq_of(se); /* - * Ensure the task's vruntime is normalized, so that when its + * Ensure the task's vruntime is normalized, so that when it's * switched back to the fair class the enqueue_entity(.flags=0) will * do the right thing. * - * If it was on_rq, then the dequeue_entity(.flags=0) will already - * have normalized the vruntime, if it was !on_rq, then only when + * If it's on_rq, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it's !on_rq, then only when * the task is sleeping will it still have non-normalized vruntime. */ - if (!se->on_rq && p->state != TASK_RUNNING) { + if (!p->on_rq && p->state != TASK_RUNNING) { /* * Fix up our vruntime so that the current sleep doesn't * cause 'unlimited' sleep bonus. @@ -6061,7 +7752,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, se->cfs_rq = parent->my_q; se->my_q = cfs_rq; - update_load_set(&se->load, 0); + /* guarantee group entities always have weight */ + update_load_set(&se->load, NICE_0_LOAD); se->parent = parent; } @@ -6193,6 +7885,139 @@ __init void init_sched_fair_class(void) zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); cpu_notifier(sched_ilb_notifier, 0); #endif + +#ifdef CONFIG_SCHED_HMP + hmp_cpu_mask_setup(); +#endif #endif /* SMP */ } + +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE +static u32 cpufreq_calc_scale(u32 min, u32 max, u32 curr) +{ + u32 result = curr / max; + return result; +} + +/* Called when the CPU Frequency is changed. + * Once for each CPU. + */ +static int cpufreq_callback(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = data; + int cpu = freq->cpu; + struct cpufreq_extents *extents; + + if (freq->flags & CPUFREQ_CONST_LOOPS) + return NOTIFY_OK; + + if (val != CPUFREQ_POSTCHANGE) + return NOTIFY_OK; + + /* if dynamic load scale is disabled, set the load scale to 1.0 */ + if (!hmp_data.freqinvar_load_scale_enabled) { + freq_scale[cpu].curr_scale = 1024; + return NOTIFY_OK; + } + + extents = &freq_scale[cpu]; + if (extents->flags & SCHED_LOAD_FREQINVAR_SINGLEFREQ) { + /* If our governor was recognised as a single-freq governor, + * use 1.0 + */ + extents->curr_scale = 1024; + } else { + extents->curr_scale = cpufreq_calc_scale(extents->min, + extents->max, freq->new); + } + + return NOTIFY_OK; +} + +/* Called when the CPUFreq governor is changed. + * Only called for the CPUs which are actually changed by the + * userspace. + */ +static int cpufreq_policy_callback(struct notifier_block *nb, + unsigned long event, void *data) +{ + struct cpufreq_policy *policy = data; + struct cpufreq_extents *extents; + int cpu, singleFreq = 0; + static const char performance_governor[] = "performance"; + static const char powersave_governor[] = "powersave"; + + if (event == CPUFREQ_START) + return 0; + + if (event != CPUFREQ_INCOMPATIBLE) + return 0; + + /* CPUFreq governors do not accurately report the range of + * CPU Frequencies they will choose from. + * We recognise performance and powersave governors as + * single-frequency only. + */ + if (!strncmp(policy->governor->name, performance_governor, + strlen(performance_governor)) || + !strncmp(policy->governor->name, powersave_governor, + strlen(powersave_governor))) + singleFreq = 1; + + /* Make sure that all CPUs impacted by this policy are + * updated since we will only get a notification when the + * user explicitly changes the policy on a CPU. + */ + for_each_cpu(cpu, policy->cpus) { + extents = &freq_scale[cpu]; + extents->max = policy->max >> SCHED_FREQSCALE_SHIFT; + extents->min = policy->min >> SCHED_FREQSCALE_SHIFT; + if (!hmp_data.freqinvar_load_scale_enabled) { + extents->curr_scale = 1024; + } else if (singleFreq) { + extents->flags |= SCHED_LOAD_FREQINVAR_SINGLEFREQ; + extents->curr_scale = 1024; + } else { + extents->flags &= ~SCHED_LOAD_FREQINVAR_SINGLEFREQ; + extents->curr_scale = cpufreq_calc_scale(extents->min, + extents->max, policy->cur); + } + } + + return 0; +} + +static struct notifier_block cpufreq_notifier = { + .notifier_call = cpufreq_callback, +}; +static struct notifier_block cpufreq_policy_notifier = { + .notifier_call = cpufreq_policy_callback, +}; + +static int __init register_sched_cpufreq_notifier(void) +{ + int ret = 0; + + /* init safe defaults since there are no policies at registration */ + for (ret = 0; ret < CONFIG_NR_CPUS; ret++) { + /* safe defaults */ + freq_scale[ret].max = 1024; + freq_scale[ret].min = 1024; + freq_scale[ret].curr_scale = 1024; + } + + pr_info("sched: registering cpufreq notifiers for scale-invariant loads\n"); + ret = cpufreq_register_notifier(&cpufreq_policy_notifier, + CPUFREQ_POLICY_NOTIFIER); + + if (ret != -EINVAL) + ret = cpufreq_register_notifier(&cpufreq_notifier, + CPUFREQ_TRANSITION_NOTIFIER); + + return ret; +} + +core_initcall(register_sched_cpufreq_notifier); +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ |