author		Viresh Kumar <viresh.kumar@linaro.org>		2013-04-02 21:59:48 +0530
committer	Viresh Kumar <viresh.kumar@linaro.org>		2013-04-02 21:59:48 +0530
commit		e801e36a4d4ffccbcbb63f2ece35d13cad4f280b (patch)
tree		31676a773066bf680e58fa8d035ca88f3b64aafb
parent		089165092916ec98d675e078dc5782da5d29f181 (diff)
parent		63a67f13217cbb76305b79804e73e5c8b8315370 (diff)

Merge branch 'big-LITTLE-MP-upstream-v2' into linux-linaro-MP-upstream-3.9-rc3 (big-LITTLE-MP-upstream-v1)
-rw-r--r--	arch/arm/include/asm/topology.h		 61
-rw-r--r--	arch/arm/kernel/topology.c		  9
-rw-r--r--	arch/ia64/include/asm/topology.h	  1
-rw-r--r--	arch/tile/include/asm/topology.h	  1
-rw-r--r--	include/linux/sched.h			 16
-rw-r--r--	include/linux/topology.h		  4
-rw-r--r--	kernel/sched/core.c			129
-rw-r--r--	kernel/sched/fair.c			312
-rw-r--r--	kernel/sched/features.h			  2
-rw-r--r--	kernel/sched/idle_task.c		  7
-rw-r--r--	kernel/sched/sched.h			 43
-rw-r--r--	linaro/configs/big-LITTLE-MP.conf	  5

12 files changed, 509 insertions(+), 81 deletions(-)
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 58b8b84adcd..d394578f7a8 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -34,6 +34,67 @@ static inline void store_cpu_topology(unsigned int cpuid) { }
 
 #endif
 
+#ifdef CONFIG_SCHED_MC
+/* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */
+#ifndef SD_MC_INIT
+#define SD_MC_INIT (struct sched_domain) {			\
+	.min_interval		= 1,				\
+	.max_interval		= 4,				\
+	.busy_factor		= 16,				\
+	.imbalance_pct		= 125,				\
+	.cache_nice_tries	= 1,				\
+	.busy_idx		= 2,				\
+	.wake_idx		= 0,				\
+	.forkexec_idx		= 0,				\
+								\
+	.flags			= 1*SD_LOAD_BALANCE		\
+				| 1*SD_BALANCE_NEWIDLE		\
+				| 1*SD_BALANCE_EXEC		\
+				| 1*SD_BALANCE_FORK		\
+				| 0*SD_BALANCE_WAKE		\
+				| 1*SD_WAKE_AFFINE		\
+				| 0*SD_SHARE_CPUPOWER		\
+				| 1*SD_SHARE_PKG_RESOURCES	\
+				| arch_sd_local_flags(SD_SHARE_PKG_RESOURCES)\
+				| 0*SD_SERIALIZE		\
+				,				\
+	.last_balance		= jiffies,			\
+	.balance_interval	= 1,				\
+}
+#endif
+#endif /* CONFIG_SCHED_MC */
+
+/* Common values for CPUs */
+#ifndef SD_CPU_INIT
+#define SD_CPU_INIT (struct sched_domain) {			\
+	.min_interval		= 1,				\
+	.max_interval		= 4,				\
+	.busy_factor		= 16,				\
+	.imbalance_pct		= 125,				\
+	.cache_nice_tries	= 1,				\
+	.busy_idx		= 2,				\
+	.idle_idx		= 1,				\
+	.newidle_idx		= 0,				\
+	.wake_idx		= 0,				\
+	.forkexec_idx		= 0,				\
+								\
+	.flags			= 1*SD_LOAD_BALANCE		\
+				| 1*SD_BALANCE_NEWIDLE		\
+				| 1*SD_BALANCE_EXEC		\
+				| 1*SD_BALANCE_FORK		\
+				| 0*SD_BALANCE_WAKE		\
+				| 1*SD_WAKE_AFFINE		\
+				| 0*SD_SHARE_CPUPOWER		\
+				| 0*SD_SHARE_PKG_RESOURCES	\
+				| arch_sd_local_flags(0)	\
+				| 0*SD_SERIALIZE		\
+				| 1*SD_PREFER_SIBLING		\
+				,				\
+	.last_balance		= jiffies,			\
+	.balance_interval	= 1,				\
+}
+#endif
+
 #include <asm-generic/topology.h>
 
 #endif /* _ASM_ARM_TOPOLOGY_H */
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 79282ebcd93..f89a4a2b6ed 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -201,6 +201,15 @@ static inline void update_cpu_power(unsigned int cpuid, unsigned int mpidr) {}
  */
 struct cputopo_arm cpu_topology[NR_CPUS];
 
+int arch_sd_local_flags(int level)
+{
+	/* Powergate at threading level doesn't make sense */
+	if (level & SD_SHARE_CPUPOWER)
+		return 1*SD_SHARE_POWERDOMAIN;
+
+	return 0*SD_SHARE_POWERDOMAIN;
+}
+
 const struct cpumask *cpu_coregroup_mask(int cpu)
 {
 	return &cpu_topology[cpu].core_sibling;
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index a2496e449b7..6d0b61741c1 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -65,6 +65,7 @@ void build_cpu_to_node_map(void);
 				| SD_BALANCE_EXEC	\
 				| SD_BALANCE_FORK	\
 				| SD_WAKE_AFFINE,	\
+				| arch_sd_local_flags(0)\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
 	.nr_balance_failed	= 0,			\
diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h
index d5e86c9f74f..adc871020d4 100644
--- a/arch/tile/include/asm/topology.h
+++ b/arch/tile/include/asm/topology.h
@@ -71,6 +71,7 @@ static inline const struct cpumask *cpumask_of_node(int node)
 				| 0*SD_WAKE_AFFINE	\
 				| 0*SD_SHARE_CPUPOWER	\
 				| 0*SD_SHARE_PKG_RESOURCES \
+				| arch_sd_local_flags(0)	\
 				| 0*SD_SERIALIZE	\
 				,			\
 	.last_balance		= jiffies,		\
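The arch_sd_local_flags() hook introduced above lets an architecture inject SD_SHARE_POWERDOMAIN into each topology level's flags; on ARM only levels that already share CPU power (SMT siblings) are treated as one power domain, so the MC and CPU levels remain powergate-separable. A standalone sketch of how the hook composes with the per-level flags (plain userspace C, not kernel code; the main() harness and printed values are illustrative assumptions):

/* Flag values copied from the patch; the hook mirrors the ARM version. */
#include <stdio.h>

#define SD_SHARE_CPUPOWER	0x0080
#define SD_SHARE_POWERDOMAIN	0x0100
#define SD_SHARE_PKG_RESOURCES	0x0200

/* Powergating below the SMT level makes no sense, so only CPU-power
 * sharing levels report a shared power domain. */
static int arch_sd_local_flags(int level)
{
	if (level & SD_SHARE_CPUPOWER)
		return 1*SD_SHARE_POWERDOMAIN;
	return 0*SD_SHARE_POWERDOMAIN;
}

int main(void)
{
	/* SIBLING passes SD_SHARE_CPUPOWER, MC passes pkg-resource sharing,
	 * the CPU level passes 0, matching the SD_*_INIT callers above. */
	printf("SIBLING: %#x\n",
	       arch_sd_local_flags(SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES));
	printf("MC:      %#x\n", arch_sd_local_flags(SD_SHARE_PKG_RESOURCES));
	printf("CPU:     %#x\n", arch_sd_local_flags(0));
	return 0;
}

Only the SIBLING level prints 0x100 here, which is what later lets update_packing_domain() find the first domain without SD_SHARE_POWERDOMAIN.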
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4afded2be58..43d10ee10cb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -809,6 +809,7 @@ enum cpu_idle_type {
 #define SD_BALANCE_WAKE		0x0010	/* Balance on wakeup */
 #define SD_WAKE_AFFINE		0x0020	/* Wake task to waking CPU */
 #define SD_SHARE_CPUPOWER	0x0080	/* Domain members share cpu power */
+#define SD_SHARE_POWERDOMAIN	0x0100	/* Domain members share power domain */
 #define SD_SHARE_PKG_RESOURCES	0x0200	/* Domain members share cpu pkg resources */
 #define SD_SERIALIZE		0x0400	/* Only a single load balancing instance */
 #define SD_ASYM_PACKING		0x0800	/* Place busy groups earlier in the domain */
@@ -959,6 +960,12 @@ struct sched_domain {
 	unsigned long span[0];
 };
 
+struct sched_domain_rq {
+	struct sched_domain *sd;
+	unsigned long flags;
+	struct rcu_head rcu;	/* used during destruction */
+};
+
 static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
 {
 	return to_cpumask(sd->span);
@@ -1034,6 +1041,7 @@ struct sched_domain;
 #else
 #define ENQUEUE_WAKING		0
 #endif
+#define ENQUEUE_NEWTASK		8
 
 #define DEQUEUE_SLEEP		1
 
@@ -1160,13 +1168,7 @@ struct sched_entity {
 	struct cfs_rq		*my_q;
 #endif
 
-/*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
- * removed when useful for applications beyond shares distribution (e.g.
- * load-balance).
- */
-#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
-	/* Per-entity load-tracking */
+#ifdef CONFIG_SMP
 	struct sched_avg	avg;
 #endif
 };
diff --git a/include/linux/topology.h b/include/linux/topology.h
index d3cf0d6e771..3eab2933c8e 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -99,6 +99,8 @@ int arch_update_cpu_topology(void);
 				| 1*SD_WAKE_AFFINE			\
 				| 1*SD_SHARE_CPUPOWER			\
 				| 1*SD_SHARE_PKG_RESOURCES		\
+				| arch_sd_local_flags(SD_SHARE_CPUPOWER|\
+						SD_SHARE_PKG_RESOURCES) \
 				| 0*SD_SERIALIZE			\
 				| 0*SD_PREFER_SIBLING			\
 				| arch_sd_sibling_asym_packing()	\
@@ -131,6 +133,7 @@ int arch_update_cpu_topology(void);
 				| 1*SD_WAKE_AFFINE			\
 				| 0*SD_SHARE_CPUPOWER			\
 				| 1*SD_SHARE_PKG_RESOURCES		\
+				| arch_sd_local_flags(SD_SHARE_PKG_RESOURCES)\
 				| 0*SD_SERIALIZE			\
 				,					\
 	.last_balance		= jiffies,				\
@@ -161,6 +164,7 @@ int arch_update_cpu_topology(void);
 				| 1*SD_WAKE_AFFINE			\
 				| 0*SD_SHARE_CPUPOWER			\
 				| 0*SD_SHARE_PKG_RESOURCES		\
+				| arch_sd_local_flags(0)		\
 				| 0*SD_SERIALIZE			\
 				| 1*SD_PREFER_SIBLING			\
 				,					\
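The new sched_domain_rq wrapper above exists so that the per-rq domain pointer and the NOHZ_IDLE flag describing it are published and reclaimed together behind one RCU-protected object. A minimal standalone sketch of the invariant this buys (simplified stand-in types, userspace C, no real RCU; the rebuild()/publish flow is an illustrative assumption, not the kernel code):

#include <stdio.h>

#define NOHZ_IDLE 0UL

struct sched_domain { struct sched_domain *parent; };

struct sched_domain_rq {
	struct sched_domain *sd;
	unsigned long flags;	/* the NOHZ_IDLE bit lives with the tree */
};

/* A rebuild prepares a fresh object with the flag pre-set, as
 * build_sched_domain_rq() does with set_bit(NOHZ_IDLE, ...)... */
static struct sched_domain_rq *rebuild(struct sched_domain *sd)
{
	static struct sched_domain_rq next;
	next.sd = sd;
	next.flags = 1UL << NOHZ_IDLE;
	return &next;
}

int main(void)
{
	struct sched_domain top = { 0 };
	/* ...and publishes it with a single pointer store; in the kernel
	 * this is rcu_assign_pointer(rq->sd_rq, sd_rq) in cpu_attach_domain().
	 * A reader that loads the pointer once gets a matching pair. */
	struct sched_domain_rq *sd_rq = rebuild(&top);

	printf("NOHZ_IDLE=%lu for tree %p\n",
	       (sd_rq->flags >> NOHZ_IDLE) & 1, (void *)sd_rq->sd);
	return 0;
}

With the old layout (rq->sd plus a separate per-rq nohz_flags word), the flag could describe a domain tree that had already been swapped out.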
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 870c3242855..06933795dc2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1561,14 +1561,11 @@ static void __sched_fork(struct task_struct *p)
 	p->se.vruntime			= 0;
 	INIT_LIST_HEAD(&p->se.group_node);
 
-/*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
- * removed when useful for applications beyond shares distribution (e.g.
- * load-balance).
- */
-#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+#ifdef CONFIG_SMP
 	p->se.avg.runnable_avg_period = 0;
 	p->se.avg.runnable_avg_sum = 0;
+	p->se.avg.decay_count = 0;
+	p->se.avg.load_avg_contrib = 0;
 #endif
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -1714,7 +1711,7 @@ void wake_up_new_task(struct task_struct *p)
 #endif
 
 	rq = __task_rq_lock(p);
-	activate_task(rq, p, 0);
+	activate_task(rq, p, ENQUEUE_NEWTASK);
 	p->on_rq = 1;
 	trace_sched_wakeup_new(p, true);
 	check_preempt_curr(rq, p, WF_FORK);
@@ -2533,7 +2530,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
 void update_idle_cpu_load(struct rq *this_rq)
 {
 	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
-	unsigned long load = this_rq->load.weight;
+	unsigned long load = (unsigned long)this_rq->cfs.runnable_load_avg;
 	unsigned long pending_updates;
 
 	/*
@@ -2583,7 +2580,7 @@ static void update_cpu_load_active(struct rq *this_rq)
 	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
 	 */
 	this_rq->last_load_update_tick = jiffies;
-	__update_cpu_load(this_rq, this_rq->load.weight, 1);
+	__update_cpu_load(this_rq, this_rq->cfs.runnable_load_avg, 1);
 
 	calc_load_account_active(this_rq);
 }
@@ -2689,8 +2686,8 @@ void scheduler_tick(void)
 
 	raw_spin_lock(&rq->lock);
 	update_rq_clock(rq);
-	update_cpu_load_active(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
+	update_cpu_load_active(rq);
 	raw_spin_unlock(&rq->lock);
 
 	perf_event_task_tick();
@@ -5602,6 +5599,15 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
 		destroy_sched_domain(sd, cpu);
 }
 
+static void destroy_sched_domain_rq(struct sched_domain_rq *sd_rq, int cpu)
+{
+	if (!sd_rq)
+		return;
+
+	destroy_sched_domains(sd_rq->sd, cpu);
+	kfree_rcu(sd_rq, rcu);
+}
+
 /*
  * Keep a special pointer to the highest sched_domain that has
  * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
@@ -5632,10 +5638,23 @@ static void update_top_cache_domain(int cpu)
  * hold the hotplug lock.
  */
 static void
-cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
+cpu_attach_domain(struct sched_domain_rq *sd_rq, struct root_domain *rd,
+		  int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	struct sched_domain *tmp;
+	struct sched_domain_rq *tmp_rq;
+	struct sched_domain *tmp, *sd = NULL;
+
+	/*
+	 * If we don't have any sched_domain and associated object, we can
+	 * directly jump to the attach sequence otherwise we try to degenerate
+	 * the sched_domain
+	 */
+	if (!sd_rq)
+		goto attach;
+
+	/* Get a pointer to the 1st sched_domain */
+	sd = sd_rq->sd;
 
 	/* Remove the sched domains which do not contribute to scheduling. */
 	for (tmp = sd; tmp; ) {
@@ -5658,15 +5677,19 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 			destroy_sched_domain(tmp, cpu);
 			if (sd)
 				sd->child = NULL;
+			/* update sched_domain_rq */
+			sd_rq->sd = sd;
 		}
 	}
 
+attach:
 	sched_domain_debug(sd, cpu);
 
 	rq_attach_root(rq, rd);
-	tmp = rq->sd;
-	rcu_assign_pointer(rq->sd, sd);
-	destroy_sched_domains(tmp, cpu);
+	tmp_rq = rq->sd_rq;
+	rcu_assign_pointer(rq->sd_rq, sd_rq);
+	destroy_sched_domain_rq(tmp_rq, cpu);
 
+	update_packing_domain(cpu);
 	update_top_cache_domain(cpu);
 }
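Several hunks above switch the CPU-load bookkeeping from the instantaneous rq->load.weight to the tracked cfs.runnable_load_avg. The difference matters for mostly-sleeping tasks: the instantaneous view counts full weight whenever a task happens to be queued, while the tracked view scales weight by the fraction of time it is actually runnable. An illustrative computation (standalone C; the division form is a simplification, the kernel uses a decaying geometric series):

#include <stdio.h>

#define NICE_0_LOAD 1024

int main(void)
{
	/* Two nice-0 tasks that are runnable only ~25% of the time */
	unsigned long weight = NICE_0_LOAD;
	unsigned long runnable_avg_sum = 256, runnable_avg_period = 1024;

	unsigned long instantaneous = 2 * weight;	/* old rq->load.weight view */
	unsigned long tracked = 2 * (weight *
			runnable_avg_sum / (runnable_avg_period + 1));

	printf("load.weight view:       %lu\n", instantaneous);	/* 2048 */
	printf("runnable_load_avg view: %lu\n", tracked);	/* ~510 */
	return 0;
}

The same reasoning explains the scheduler_tick() reordering: task_tick() updates the entity averages first, so update_cpu_load_active() samples a fresh runnable_load_avg.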
@@ -5695,12 +5718,14 @@ struct sd_data {
 };
 
 struct s_data {
+	struct sched_domain_rq ** __percpu sd_rq;
 	struct sched_domain ** __percpu sd;
 	struct root_domain	*rd;
 };
 
 enum s_alloc {
 	sa_rootdomain,
+	sa_sd_rq,
 	sa_sd,
 	sa_sd_storage,
 	sa_none,
@@ -5935,7 +5960,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 		return;
 
 	update_group_power(sd, cpu);
-	atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
+	atomic_set(&sg->sgp->nr_busy_cpus, 0);
 }
 
 int __weak arch_sd_sibling_asym_packing(void)
@@ -5943,6 +5968,11 @@
 {
 	return 0*SD_ASYM_PACKING;
 }
 
+int __weak arch_sd_local_flags(int level)
+{
+	return 1*SD_SHARE_POWERDOMAIN;
+}
+
 /*
  * Initializers for schedule domains
  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
@@ -6011,6 +6041,8 @@ static void set_domain_attribute(struct sched_domain *sd,
 
 static void __sdt_free(const struct cpumask *cpu_map);
 static int __sdt_alloc(const struct cpumask *cpu_map);
+static void __sdrq_free(const struct cpumask *cpu_map, struct s_data *d);
+static int __sdrq_alloc(const struct cpumask *cpu_map, struct s_data *d);
 
 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
 				 const struct cpumask *cpu_map)
@@ -6019,6 +6051,9 @@
 	case sa_rootdomain:
 		if (!atomic_read(&d->rd->refcount))
 			free_rootdomain(&d->rd->rcu); /* fall through */
+	case sa_sd_rq:
+		__sdrq_free(cpu_map, d); /* fall through */
+		free_percpu(d->sd_rq); /* fall through */
 	case sa_sd:
 		free_percpu(d->sd); /* fall through */
 	case sa_sd_storage:
@@ -6038,9 +6073,14 @@
 	d->sd = alloc_percpu(struct sched_domain *);
 	if (!d->sd)
 		return sa_sd_storage;
+	d->sd_rq = alloc_percpu(struct sched_domain_rq *);
+	if (!d->sd_rq)
+		return sa_sd;
+	if (__sdrq_alloc(cpu_map, d))
+		return sa_sd_rq;
 	d->rd = alloc_rootdomain();
 	if (!d->rd)
-		return sa_sd;
+		return sa_sd_rq;
 	return sa_rootdomain;
 }
@@ -6132,6 +6172,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
 					| 0*SD_WAKE_AFFINE
 					| 0*SD_SHARE_CPUPOWER
 					| 0*SD_SHARE_PKG_RESOURCES
+					| 1*SD_SHARE_POWERDOMAIN
 					| 1*SD_SERIALIZE
 					| 0*SD_PREFER_SIBLING
 					| sd_local_flags(level)
@@ -6466,6 +6507,46 @@ static void __sdt_free(const struct cpumask *cpu_map)
 	}
 }
 
+static int __sdrq_alloc(const struct cpumask *cpu_map, struct s_data *d)
+{
+	int j;
+
+	for_each_cpu(j, cpu_map) {
+		struct sched_domain_rq *sd_rq;
+
+		sd_rq = kzalloc_node(sizeof(struct sched_domain_rq),
+				GFP_KERNEL, cpu_to_node(j));
+		if (!sd_rq)
+			return -ENOMEM;
+
+		*per_cpu_ptr(d->sd_rq, j) = sd_rq;
+	}
+
+	return 0;
+}
+
+static void __sdrq_free(const struct cpumask *cpu_map, struct s_data *d)
+{
+	int j;
+
+	for_each_cpu(j, cpu_map)
+		if (*per_cpu_ptr(d->sd_rq, j))
+			kfree(*per_cpu_ptr(d->sd_rq, j));
+}
+
+static void build_sched_domain_rq(struct s_data *d, int cpu)
+{
+	struct sched_domain_rq *sd_rq;
+	struct sched_domain *sd;
+
+	/* Attach sched_domain to sched_domain_rq */
+	sd = *per_cpu_ptr(d->sd, cpu);
+	sd_rq = *per_cpu_ptr(d->sd_rq, cpu);
+	sd_rq->sd = sd;
+
+	/* Init flags */
+	set_bit(NOHZ_IDLE, sched_rq_flags(sd_rq));
+}
+
 struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 		struct s_data *d, const struct cpumask *cpu_map,
 		struct sched_domain_attr *attr, struct sched_domain *child,
@@ -6495,6 +6576,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,
 			       struct sched_domain_attr *attr)
 {
 	enum s_alloc alloc_state = sa_none;
+	struct sched_domain_rq *sd_rq;
 	struct sched_domain *sd;
 	struct s_data d;
 	int i, ret = -ENOMEM;
@@ -6547,11 +6629,18 @@ static int build_sched_domains(const struct cpumask *cpu_map,
 		}
 	}
 
+	/* Init objects that must follow the sched_domain lifecycle */
+	for_each_cpu(i, cpu_map) {
+		build_sched_domain_rq(&d, i);
+	}
+
 	/* Attach the domains */
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map) {
-		sd = *per_cpu_ptr(d.sd, i);
-		cpu_attach_domain(sd, d.rd, i);
+		sd_rq = *per_cpu_ptr(d.sd_rq, i);
+		cpu_attach_domain(sd_rq, d.rd, i);
+		/* claim allocation of sched_domain_rq object */
+		*per_cpu_ptr(d.sd_rq, i) = NULL;
 	}
 	rcu_read_unlock();
@@ -6982,7 +7071,7 @@ void __init sched_init(void)
 		rq->last_load_update_tick = jiffies;
 
 #ifdef CONFIG_SMP
-		rq->sd = NULL;
+		rq->sd_rq = NULL;
 		rq->rd = NULL;
 		rq->cpu_power = SCHED_POWER_SCALE;
 		rq->post_schedule = 0;
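The allocation changes above extend the existing staged-unwind pattern: __visit_domain_allocation_hell() returns how far setup got, and __free_domain_allocs() falls through its switch from that state so only what was actually allocated is freed; sa_sd_rq slots in between sa_sd and sa_rootdomain. A compressed userspace sketch of the pattern (malloc/free stand in for the kernel allocators; helper names are simplified):

#include <stdlib.h>

enum s_alloc { sa_rootdomain, sa_sd_rq, sa_sd, sa_sd_storage, sa_none };

struct s_data { void *storage, *sd, *sd_rq, *rd; };

static void free_domain_allocs(struct s_data *d, enum s_alloc what)
{
	switch (what) {
	case sa_rootdomain:
		free(d->rd);		/* fall through */
	case sa_sd_rq:
		free(d->sd_rq);		/* fall through */
	case sa_sd:
		free(d->sd);		/* fall through */
	case sa_sd_storage:
		free(d->storage);	/* fall through */
	case sa_none:
		break;
	}
}

static enum s_alloc visit_allocation_hell(struct s_data *d)
{
	if (!(d->storage = malloc(64)))
		return sa_none;
	if (!(d->sd = malloc(64)))
		return sa_sd_storage;
	if (!(d->sd_rq = malloc(64)))	/* the step this merge adds */
		return sa_sd;
	if (!(d->rd = malloc(64)))
		return sa_sd_rq;
	return sa_rootdomain;
}

int main(void)
{
	struct s_data d = { 0 };
	enum s_alloc state = visit_allocation_hell(&d);

	if (state != sa_rootdomain)	/* partial failure: unwind exactly */
		free_domain_allocs(&d, state);
	return 0;
}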
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a33e5986fc..d66fa87ebdb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -160,6 +160,76 @@ void sched_init_granularity(void)
 	update_sysctl();
 }
 
+
+#ifdef CONFIG_SMP
+/*
+ * Save the id of the optimal CPU that should be used to pack small tasks
+ * The value -1 is used when no buddy has been found
+ */
+DEFINE_PER_CPU(int, sd_pack_buddy);
+
+/*
+ * Look for the best buddy CPU that can be used to pack small tasks
+ * We make the assumption that it doesn't wort to pack on CPU that share the
+ * same powerline. We look for the 1st sched_domain without the
+ * SD_SHARE_POWERDOMAIN flag. Then we look for the sched_group with the lowest
+ * power per core based on the assumption that their power efficiency is
+ * better
+ */
+void update_packing_domain(int cpu)
+{
+	struct sched_domain *sd;
+	int id = -1;
+
+	sd = highest_flag_domain(cpu, SD_SHARE_POWERDOMAIN);
+	if (!sd)
+		sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
+	else
+		sd = sd->parent;
+
+	while (sd && (sd->flags & SD_LOAD_BALANCE)
+		&& !(sd->flags & SD_SHARE_POWERDOMAIN)) {
+		struct sched_group *sg = sd->groups;
+		struct sched_group *pack = sg;
+		struct sched_group *tmp;
+
+		/*
+		 * The sched_domain of a CPU points on the local sched_group
+		 * and the 1st CPU of this local group is a good candidate
+		 */
+		id = cpumask_first(sched_group_cpus(pack));
+
+		/* loop the sched groups to find the best one */
+		for (tmp = sg->next; tmp != sg; tmp = tmp->next) {
+			if (tmp->sgp->power * pack->group_weight >
+					pack->sgp->power * tmp->group_weight)
+				continue;
+
+			if ((tmp->sgp->power * pack->group_weight ==
+					pack->sgp->power * tmp->group_weight)
+			 && (cpumask_first(sched_group_cpus(tmp)) >= id))
+				continue;
+
+			/* we have found a better group */
+			pack = tmp;
+
+			/* Take the 1st CPU of the new group */
+			id = cpumask_first(sched_group_cpus(pack));
+		}
+
+		/* Look for another CPU than itself */
+		if (id != cpu)
+			break;
+
+		sd = sd->parent;
+	}
+
+	pr_debug("CPU%d packing on CPU%d\n", cpu, id);
+	per_cpu(sd_pack_buddy, cpu) = id;
+}
+
+#endif /* CONFIG_SMP */
+
 #if BITS_PER_LONG == 32
 # define WMULT_CONST	(~0UL)
 #else
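The group loop in update_packing_domain() above selects the sched_group with the lowest power per core. It compares tmp->sgp->power / tmp->group_weight against the current best by cross-multiplication, which avoids integer division (ties break toward the lowest first-CPU id). A standalone sketch of that comparison with illustrative big.LITTLE numbers (the power values are assumptions, not measurements):

#include <stdio.h>

struct group { unsigned long power; unsigned int weight; };

int main(void)
{
	struct group big    = { .power = 2048, .weight = 2 };	/* e.g. A15 pair */
	struct group little = { .power =  614, .weight = 2 };	/* e.g. A7 pair  */

	/* little.power/little.weight < big.power/big.weight, tested without
	 * division exactly as the kernel loop does */
	if (little.power * big.weight < big.power * little.weight)
		printf("pack small tasks on the little cluster\n");
	return 0;
}

The walk then repeats on the parent domain whenever the chosen buddy is the CPU itself, so a CPU never ends up packing onto itself when an alternative exists.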
@@ -1109,8 +1179,7 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
-/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
-#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+#ifdef CONFIG_SMP
 /*
  * We choose a half-life close to 1 scheduling period.
  * Note: The tables below are dependent on this value.
@@ -1503,8 +1572,9 @@ static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 /* Add the load generated by se into cfs_rq's child load-average */
 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
 						  struct sched_entity *se,
-						  int wakeup)
+						  int flags)
 {
+	int wakeup = flags & ENQUEUE_WAKEUP;
 	/*
 	 * We track migrations using entity decay_count <= 0, on a wake-up
 	 * migration we use a negative decay count to track the remote decays
@@ -1538,6 +1608,12 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
 		update_entity_load_avg(se, 0);
 	}
 
+	/*
+	 * set the initial load avg of new task same as its load
+	 * in order to avoid brust fork make few cpu too heavier
+	 */
+	if (flags & ENQUEUE_NEWTASK)
+		se->avg.load_avg_contrib = se->load.weight;
 	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
 	/* we force update consideration on load-balancer moves */
 	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
@@ -1562,6 +1638,16 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
 		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
 	} /* migrations, e.g. sleep=0 leave decay_count == 0 */
 }
+
+/*
+ * Update the rq's load with the elapsed idle time before a task is
+ * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
+ * be the only way to update the runnable statistic.
+ */
+void idle_exit(int this_cpu, struct rq *this_rq)
+{
+	update_rq_runnable_avg(this_rq, 0);
+}
 #else
 static inline void update_entity_load_avg(struct sched_entity *se,
 					  int update_cfs_rq) {}
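The ENQUEUE_NEWTASK path above fixes a fork-burst blind spot: a freshly forked entity has no runnable history, so its tracked contribution starts near zero and many new tasks could pile onto one CPU before the averages catch up; seeding load_avg_contrib with the full weight makes newcomers visible to the balancer immediately. Illustrative arithmetic only (the division-by-(period+1) form mirrors the scaling used elsewhere in this merge, not an exact kernel formula):

#include <stdio.h>

int main(void)
{
	unsigned long weight = 1024;		/* nice-0 task weight */
	unsigned long sum = 0, period = 0;	/* a new task has no history */

	unsigned long tracked = weight * sum / (period + 1);	/* 0 */
	unsigned long seeded = weight;	/* se->avg.load_avg_contrib = se->load.weight */

	printf("tracked contrib: %lu, seeded contrib: %lu\n", tracked, seeded);
	return 0;
}

The seed then decays naturally toward the task's true behaviour as real history accumulates.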
@@ -1699,7 +1785,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
-	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
+	enqueue_entity_load_avg(cfs_rq, se, flags);
 	account_entity_enqueue(cfs_rq, se);
 	update_cfs_shares(cfs_rq);
 
@@ -2897,7 +2983,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 /* Used instead of source_load when we know the type == 0 */
 static unsigned long weighted_cpuload(const int cpu)
 {
-	return cpu_rq(cpu)->load.weight;
+	return (unsigned long)cpu_rq(cpu)->cfs.runnable_load_avg;
 }
 
 /*
@@ -2944,7 +3030,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
 
 	if (nr_running)
-		return rq->load.weight / nr_running;
+		return (unsigned long)rq->cfs.runnable_load_avg / nr_running;
 
 	return 0;
 }
@@ -2973,7 +3059,8 @@ static void task_waking_fair(struct task_struct *p)
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /*
- * effective_load() calculates the load change as seen from the root_task_group
+ * effective_load() calculates the runnable load average change as seen from
+ * the root_task_group
  *
  * Adding load to a group doesn't make a group heavier, but can cause movement
  * of group shares between cpus. Assuming the shares were perfectly aligned one
@@ -3021,6 +3108,9 @@ static void task_waking_fair(struct task_struct *p)
  * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
  * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
  * 4/7) times the weight of the group.
+ *
+ * After get effective_load of the load moving, will engaged the sched entity's
+ * runnable avg.
  */
 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
@@ -3095,6 +3185,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 	struct task_group *tg;
 	unsigned long weight;
 	int balanced;
+	int runnable_avg;
 
 	idx	  = sd->wake_idx;
 	this_cpu  = smp_processor_id();
@@ -3110,13 +3201,19 @@
 	if (sync) {
 		tg = task_group(current);
 		weight = current->se.load.weight;
+		runnable_avg = current->se.avg.runnable_avg_sum * NICE_0_LOAD
+				/ (current->se.avg.runnable_avg_period + 1);
 
-		this_load += effective_load(tg, this_cpu, -weight, -weight);
-		load += effective_load(tg, prev_cpu, 0, -weight);
+		this_load += effective_load(tg, this_cpu, -weight, -weight)
+				* runnable_avg >> NICE_0_SHIFT;
+		load += effective_load(tg, prev_cpu, 0, -weight)
+				* runnable_avg >> NICE_0_SHIFT;
 	}
 
 	tg = task_group(p);
 	weight = p->se.load.weight;
+	runnable_avg = p->se.avg.runnable_avg_sum * NICE_0_LOAD
+			/ (p->se.avg.runnable_avg_period + 1);
 
 	/*
 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
@@ -3128,16 +3225,18 @@
 	 * task to be woken on this_cpu.
 	 */
 	if (this_load > 0) {
-		s64 this_eff_load, prev_eff_load;
+		s64 this_eff_load, prev_eff_load, tmp_eff_load;
 
 		this_eff_load = 100;
 		this_eff_load *= power_of(prev_cpu);
-		this_eff_load *= this_load +
-			effective_load(tg, this_cpu, weight, weight);
+		tmp_eff_load = effective_load(tg, this_cpu, weight, weight)
+				* runnable_avg >> NICE_0_SHIFT;
+		this_eff_load *= this_load + tmp_eff_load;
 
 		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
 		prev_eff_load *= power_of(this_cpu);
-		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
+		prev_eff_load *= load + (effective_load(tg, prev_cpu, 0, weight)
+				* runnable_avg >> NICE_0_SHIFT);
 
 		balanced = this_eff_load <= prev_eff_load;
 	} else
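wake_affine() now scales each effective_load() delta by the waking task's runnable fraction, expressed against NICE_0_LOAD and shifted back down by NICE_0_SHIFT, so rarely-running tasks perturb the affinity decision proportionally less. A worked example of that scaling (standalone C; the values are illustrative):

#include <stdio.h>

#define NICE_0_LOAD  1024L
#define NICE_0_SHIFT 10

int main(void)
{
	long eff_load = 512;		/* delta returned by effective_load() */
	long sum = 512, period = 1023;	/* task runs ~50% of the time */

	long runnable_avg = sum * NICE_0_LOAD / (period + 1);	/* 512 */
	long scaled = eff_load * runnable_avg >> NICE_0_SHIFT;	/* 256 */

	printf("runnable_avg=%ld scaled=%ld\n", runnable_avg, scaled);
	return 0;
}

A task runnable 100% of the time keeps its full delta; the 50% task above contributes half.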
@@ -3292,6 +3391,50 @@ done:
 	return target;
 }
 
+static bool is_buddy_busy(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	u32 sum = rq->avg.runnable_avg_sum;
+	u32 period = rq->avg.runnable_avg_period;
+
+	sum = min(sum, period);
+
+	/*
+	 * A busy buddy is a CPU with a high load or a small load with a lot of
+	 * running tasks.
+	 */
+	return (sum > (period / (rq->nr_running + 2)));
+}
+
+static bool is_light_task(struct task_struct *p)
+{
+	/* A light task runs less than 20% in average */
+	return ((p->se.avg.runnable_avg_sum * 5) <
+			(p->se.avg.runnable_avg_period));
+}
+
+static int check_pack_buddy(int cpu, struct task_struct *p)
+{
+	int buddy = per_cpu(sd_pack_buddy, cpu);
+
+	/* No pack buddy for this CPU */
+	if (buddy == -1)
+		return false;
+
+	/* buddy is not an allowed CPU */
+	if (!cpumask_test_cpu(buddy, tsk_cpus_allowed(p)))
+		return false;
+
+	/*
+	 * If the task is a small one and the buddy is not overloaded,
+	 * we use buddy cpu
+	 */
+	if (!is_light_task(p) || is_buddy_busy(buddy))
+		return false;
+
+	return true;
+}
+
 /*
  * sched_balance_self: balance the current task (running on cpu) in domains
  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
@@ -3320,6 +3463,10 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
 		if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
 			want_affine = 1;
 		new_cpu = prev_cpu;
+
+		/* We pack only at wake up and not new task */
+		if (check_pack_buddy(new_cpu, p))
+			return per_cpu(sd_pack_buddy, new_cpu);
 	}
 
 	rcu_read_lock();
@@ -3394,12 +3541,6 @@ unlock:
 }
 
 /*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
- * removed when useful for applications beyond shares distribution (e.g.
- * load-balance).
- */
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/*
  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
  * cfs_rq_of(p) references at time of call are still valid and identify the
  * previous cpu. However, the caller only guarantees p->pi_lock is held; no
@@ -3422,7 +3563,6 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
 		atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
 	}
 }
-#endif
 #endif /* CONFIG_SMP */
 
 static unsigned long
@@ -3970,6 +4110,15 @@
 static unsigned long task_h_load(struct task_struct *p);
 
 static const unsigned int sched_nr_migrate_break = 32;
 
+static unsigned long task_h_load_avg(struct task_struct *p)
+{
+	u32 period = p->se.avg.runnable_avg_period;
+	if (!period)
+		return 0;
+
+	return task_h_load(p) * p->se.avg.runnable_avg_sum / period;
+}
+
 /*
  * move_tasks tries to move up to imbalance weighted load from busiest to
  * this_rq, as part of a balancing operation within domain "sd".
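The two packing gates above are deliberately cheap integer tests: a task is "light" when it runs under 20% of the time (sum * 5 < period), and a buddy counts as "busy" once its rq utilization exceeds 1/(nr_running + 2) of the period, so the bar drops as tasks accumulate on it. A standalone rendition (userspace C; the sample numbers are illustrative):

#include <stdbool.h>
#include <stdio.h>

static bool is_light(unsigned int sum, unsigned int period)
{
	return (sum * 5) < period;		/* runnable < 20% on average */
}

static bool is_busy(unsigned int sum, unsigned int period, unsigned int nr_running)
{
	if (sum > period)
		sum = period;
	return sum > period / (nr_running + 2);
}

int main(void)
{
	/* a 15%-runnable task is light; a buddy at 40% utilization with one
	 * task already running is busy (threshold: 1/3 of the period) */
	printf("light: %d\n", is_light(153, 1024));
	printf("busy:  %d\n", is_busy(410, 1024, 1));
	return 0;
}

Packing only triggers when both gates agree, and only on the wakeup path, never for brand-new tasks.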
@@ -4005,12 +4154,13 @@
 		if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
 			goto next;
 
-		load = task_h_load(p);
+		load = task_h_load_avg(p);
 
-		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
+		if (sched_feat(LB_MIN) && load < 204 && !env->sd->nr_balance_failed)
 			goto next;
 
-		if ((load / 2) > env->imbalance)
+		if ((load / 2) > env->imbalance &&
+			(env->idle != CPU_IDLE && env->idle != CPU_NEWLY_IDLE))
 			goto next;
 
 		if (!can_migrate_task(p, env))
@@ -4415,7 +4565,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 {
 	unsigned long nr_running, max_nr_running, min_nr_running;
 	unsigned long load, max_cpu_load, min_cpu_load;
-	unsigned int balance_cpu = -1, first_idle_cpu = 0;
+	unsigned int balance_cpu = -1, first_idle_cpu = 0, overloaded_cpu = 0;
 	unsigned long avg_load_per_task = 0;
 	int i;
 
@@ -4453,6 +4603,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 				max_nr_running = nr_running;
 			if (min_nr_running > nr_running)
 				min_nr_running = nr_running;
+
+			if ((load > rq->cpu_power)
+			 && ((rq->cpu_power*env->sd->imbalance_pct) < (env->dst_rq->cpu_power*100))
+			 && (load > target_load(env->dst_cpu, load_idx)))
+				overloaded_cpu = 1;
 		}
 
 		sgs->group_load += load;
@@ -4498,6 +4653,22 @@
 	    (max_nr_running - min_nr_running) > 1)
 		sgs->group_imb = 1;
 
+	/*
+	 * The load contrib of a CPU exceeds its capacity, we should try to
+	 * find a better CPU with more capacity
+	 */
+	if (overloaded_cpu)
+		sgs->group_imb = 1;
+
+	/*
+	 * When idle balancing pull tasks if more than one task per cpu
+	 * in group
+	 */
+	if (env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) {
+		if (group->group_weight < sgs->sum_nr_running)
+			sgs->group_imb = 1;
+	}
+
 	sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
 						SCHED_POWER_SCALE);
 	if (!sgs->group_capacity)
@@ -4725,8 +4896,13 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 		min(sds->this_load_per_task, sds->this_load + tmp);
 	pwr_move /= SCHED_POWER_SCALE;
 
-	/* Move if we gain throughput */
-	if (pwr_move > pwr_now)
+	/*
+	 * Move if we gain throughput, or if we have cpus idling while others
+	 * are running more than one task.
+	 */
+	if ((pwr_move > pwr_now) ||
+	    (sds->busiest_group_weight < sds->busiest_nr_running &&
+	     (env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE)))
 		env->imbalance = sds->busiest_load_per_task;
 }
 
@@ -4911,6 +5087,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 			       struct sched_group *group)
 {
 	struct rq *busiest = NULL, *rq;
+	struct rq *overloaded = NULL, *dst_rq = cpu_rq(env->dst_cpu);
 	unsigned long max_load = 0;
 	int i;
 
@@ -4930,6 +5107,17 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 		wl = weighted_cpuload(i);
 
 		/*
+		 * If the task requires more power than the current CPU
+		 * capacity and the dst_cpu has more capacity, keep the
+		 * dst_cpu in mind
+		 */
+		if ((rq->nr_running == 1)
+		 && (rq->cfs.runnable_load_avg > rq->cpu_power)
+		 && (rq->cfs.runnable_load_avg > dst_rq->cfs.runnable_load_avg)
+		 && ((rq->cpu_power*env->sd->imbalance_pct) < (dst_rq->cpu_power*100)))
+			overloaded = rq;
+
+		/*
 		 * When comparing with imbalance, use weighted_cpuload()
 		 * which is not scaled with the cpu power.
 		 */
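With move_tasks() now comparing task_h_load_avg() rather than raw task_h_load(), the LB_MIN skip threshold is re-expressed in the averaged scale: 204 is roughly 20% of NICE_0_LOAD (1024), lining up with the 20% "light task" cut used by the packing code. Illustrative computation (standalone C):

#include <stdio.h>

int main(void)
{
	unsigned long h_load = 1024;		/* nice-0 task's hierarchical load */
	unsigned long sum = 150, period = 1024;	/* task is ~15% runnable */

	/* mirrors task_h_load_avg(): load scaled by the runnable fraction */
	unsigned long load_avg = h_load * sum / period;	/* 150 */

	printf("task_h_load_avg=%lu -> %s\n", load_avg,
	       load_avg < 204 ? "skipped by LB_MIN" : "eligible to migrate");
	return 0;
}

That is also why LB_MIN flips to default-on in this merge: with averaged loads the filter now distinguishes genuinely small tasks instead of briefly-idle heavy ones.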
@@ -4950,6 +5138,9 @@
 		}
 	}
 
+	if (!busiest)
+		busiest = overloaded;
+
 	return busiest;
 }
 
@@ -4977,6 +5168,9 @@ static int need_active_balance(struct lb_env *env)
 		return 1;
 	}
 
+	if ((power_of(env->src_cpu)*sd->imbalance_pct) < (power_of(env->dst_cpu)*100))
+		return 1;
+
 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
 }
 
@@ -5035,6 +5229,10 @@ redo:
 	ld_moved = 0;
 	lb_iterations = 1;
+
+	env.src_cpu = busiest->cpu;
+	env.src_rq = busiest;
+
 	if (busiest->nr_running > 1) {
 		/*
 		 * Attempt to move tasks. If find_busiest_group has found
@@ -5043,8 +5241,6 @@ redo:
 		 * correctly treated as an imbalance.
 		 */
 		env.flags |= LBF_ALL_PINNED;
-		env.src_cpu   = busiest->cpu;
-		env.src_rq    = busiest;
 		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
 
 		update_h_load(env.src_cpu);
@@ -5345,7 +5541,25 @@ static struct {
 static inline int find_new_ilb(int call_cpu)
 {
+	struct sched_domain *sd;
 	int ilb = cpumask_first(nohz.idle_cpus_mask);
+	int buddy = per_cpu(sd_pack_buddy, call_cpu);
+
+	/*
+	 * If we have a pack buddy CPU, we try to run load balance on a CPU
+	 * that is close to the buddy.
+	 */
+	if (buddy != -1)
+		for_each_domain(buddy, sd) {
+			if (sd->flags & SD_SHARE_CPUPOWER)
+				continue;
+
+			ilb = cpumask_first_and(sched_domain_span(sd),
+					nohz.idle_cpus_mask);
+
+			if (ilb < nr_cpu_ids)
+				break;
+		}
 
 	if (ilb < nr_cpu_ids && idle_cpu(ilb))
 		return ilb;
@@ -5392,31 +5606,39 @@ static inline void nohz_balance_exit_idle(int cpu)
 
 static inline void set_cpu_sd_state_busy(void)
 {
+	struct sched_domain_rq *sd_rq;
 	struct sched_domain *sd;
 	int cpu = smp_processor_id();
 
-	if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
-		return;
-	clear_bit(NOHZ_IDLE, nohz_flags(cpu));
-
 	rcu_read_lock();
-	for_each_domain(cpu, sd)
+	sd_rq = get_sched_domain_rq(cpu);
+
+	if (!sd_rq || !test_bit(NOHZ_IDLE, sched_rq_flags(sd_rq)))
+		goto unlock;
+	clear_bit(NOHZ_IDLE, sched_rq_flags(sd_rq));
+
+	for_each_domain_from_rq(sd_rq, sd)
 		atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+unlock:
 	rcu_read_unlock();
 }
 
 void set_cpu_sd_state_idle(void)
 {
+	struct sched_domain_rq *sd_rq;
 	struct sched_domain *sd;
 	int cpu = smp_processor_id();
 
-	if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
-		return;
-	set_bit(NOHZ_IDLE, nohz_flags(cpu));
-
 	rcu_read_lock();
-	for_each_domain(cpu, sd)
+	sd_rq = get_sched_domain_rq(cpu);
+
+	if (!sd_rq || test_bit(NOHZ_IDLE, sched_rq_flags(sd_rq)))
+		goto unlock;
+	set_bit(NOHZ_IDLE, sched_rq_flags(sd_rq));
+
+	for_each_domain_from_rq(sd_rq, sd)
 		atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+unlock:
 	rcu_read_unlock();
 }
 
@@ -5621,6 +5843,10 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
 	if (rq->nr_running >= 2)
 		goto need_kick;
 
+	/* load contrib is higher than cpu capacity */
+	if (rq->cfs.runnable_load_avg > rq->cpu_power)
+		goto need_kick;
+
 	rcu_read_lock();
 	for_each_domain(cpu, sd) {
 		struct sched_group *sg = sd->groups;
@@ -5673,7 +5899,12 @@ static void run_rebalance_domains(struct softirq_action *h)
 
 static inline int on_null_domain(int cpu)
 {
-	return !rcu_dereference_sched(cpu_rq(cpu)->sd);
+	struct sched_domain_rq *sd_rq =
+		rcu_dereference_sched(cpu_rq(cpu)->sd_rq);
+	struct sched_domain *sd = NULL;
+	if (sd_rq)
+		sd = sd_rq->sd;
+	return !sd;
 }
 
 /*
@@ -6114,9 +6345,8 @@ const struct sched_class fair_sched_class = {
 
 #ifdef CONFIG_SMP
 	.select_task_rq		= select_task_rq_fair,
-#ifdef CONFIG_FAIR_GROUP_SCHED
 	.migrate_task_rq	= migrate_task_rq_fair,
-#endif
+
 	.rq_online		= rq_online_fair,
 	.rq_offline		= rq_offline_fair,
diff --git 
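find_new_ilb() above biases the idle-load-balancer toward the pack buddy: it walks the buddy's domains outward, skipping SD_SHARE_CPUPOWER (SMT) levels, and takes the first idle CPU in the smallest span containing the buddy. A toy rendition with plain arrays standing in for cpumasks and domain spans (the topology below is an assumption for illustration):

#include <stdio.h>

#define NR_CPUS 4

int main(void)
{
	int idle[NR_CPUS] = { 0, 1, 1, 1 };	/* CPUs 1-3 are idle */
	/* spans of the buddy's domains, smallest first: its own cluster
	 * (CPUs 0-1), then the whole system */
	int spans[2][NR_CPUS] = { { 1, 1, 0, 0 }, { 1, 1, 1, 1 } };
	int ilb = -1;

	for (int lvl = 0; lvl < 2 && ilb < 0; lvl++)
		for (int cpu = 0; cpu < NR_CPUS; cpu++)
			if (spans[lvl][cpu] && idle[cpu]) { ilb = cpu; break; }

	printf("ilb CPU: %d\n", ilb);	/* 1: the idle CPU nearest the buddy */
	return 0;
}

The set_cpu_sd_state_busy()/idle() rewrite in the same hunk is the consumer side of sched_domain_rq: the NOHZ_IDLE test and the domain walk now read one RCU snapshot, under a single rcu_read_lock section.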
a/kernel/sched/features.h b/kernel/sched/features.h
index 1ad1d2b5395..4760a2d2da4 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -65,7 +65,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
 SCHED_FEAT(FORCE_SD_OVERLAP, false)
 SCHED_FEAT(RT_RUNTIME_SHARE, true)
-SCHED_FEAT(LB_MIN, false)
+SCHED_FEAT(LB_MIN, true)
 
 /*
  * Apply the automatic NUMA scheduling policy. Enabled automatically
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b6baf370cae..27cd379a754 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,6 +13,12 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
 {
 	return task_cpu(p); /* IDLE tasks as never migrated */
 }
+
+static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
+{
+	/* Update rq's load with elapsed idle time */
+	idle_exit(smp_processor_id(), rq);
+}
 #endif /* CONFIG_SMP */
 
 /*
  * Idle tasks are unconditionally rescheduled:
@@ -86,6 +92,7 @@ const struct sched_class idle_sched_class = {
 
 #ifdef CONFIG_SMP
 	.select_task_rq		= select_task_rq_idle,
+	.pre_schedule		= pre_schedule_idle,
 #endif
 
 	.set_curr_task          = set_curr_task_idle,
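pre_schedule_idle() hooks the point where the scheduler is switching away from the idle task, which is the last chance to fold the just-elapsed idle period into the rq's runnable average before a non-CFS task could run. A minimal sketch of the callback shape (stand-in types; printf stands in for the real statistics update):

#include <stdio.h>

struct rq { unsigned long runnable_avg_period; };

static void update_rq_runnable_avg(struct rq *rq, int runnable)
{
	/* the kernel decays a geometric series here; just note the call */
	printf("idle time accounted, runnable=%d\n", runnable);
}

/* mirrors idle_exit() from the fair.c hunk earlier in this merge */
static void idle_exit(int this_cpu, struct rq *this_rq)
{
	update_rq_runnable_avg(this_rq, 0);
}

static void pre_schedule_idle(struct rq *rq)
{
	idle_exit(0, rq);	/* update rq's load with elapsed idle time */
}

int main(void)
{
	struct rq rq = { 0 };
	pre_schedule_idle(&rq);	/* invoked as idle hands over the CPU */
	return 0;
}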
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cc03cfdf469..01833d81684 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -227,12 +227,6 @@ struct cfs_rq {
 #endif
 
 #ifdef CONFIG_SMP
-/*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
- * removed when useful for applications beyond shares distribution (e.g.
- * load-balance).
- */
-#ifdef CONFIG_FAIR_GROUP_SCHED
 	/*
 	 * CFS Load tracking
 	 * Under CFS, load is tracked on a per-entity basis and aggregated up.
@@ -242,8 +236,7 @@
 	u64 runnable_load_avg, blocked_load_avg;
 	atomic64_t decay_counter, removed_load;
 	u64 last_decay;
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-/* These always depend on CONFIG_FAIR_GROUP_SCHED */
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	u32 tg_runnable_contrib;
 	u64 tg_load_contrib;
@@ -417,7 +410,7 @@ struct rq {
 
 #ifdef CONFIG_SMP
 	struct root_domain *rd;
-	struct sched_domain *sd;
+	struct sched_domain_rq *sd_rq;
 
 	unsigned long cpu_power;
 
@@ -505,21 +498,37 @@ DECLARE_PER_CPU(struct rq, runqueues);
 
 #ifdef CONFIG_SMP
 
-#define rcu_dereference_check_sched_domain(p) \
+#define rcu_dereference_check_sched_domain_rq(p) \
 	rcu_dereference_check((p), \
 			      lockdep_is_held(&sched_domains_mutex))
 
+#define get_sched_domain_rq(cpu) \
+	rcu_dereference_check_sched_domain_rq(cpu_rq(cpu)->sd_rq)
+
+#define rcu_dereference_check_sched_domain(cpu) ({		\
+	struct sched_domain_rq *__sd_rq = get_sched_domain_rq(cpu); \
+	struct sched_domain *__sd = NULL;			\
+	if (__sd_rq)						\
+		__sd = __sd_rq->sd;				\
+	__sd;							\
+})
+
+#define sched_rq_flags(sd_rq) (&sd_rq->flags)
+
 /*
- * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * The domain tree (rq->sd_rq) is protected by RCU's quiescent state transition.
  * See detach_destroy_domains: synchronize_sched for details.
  *
  * The domain tree of any CPU may only be accessed from within
  * preempt-disabled sections.
  */
 #define for_each_domain(cpu, __sd) \
-	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
+	for (__sd = rcu_dereference_check_sched_domain(cpu); \
 	     __sd; __sd = __sd->parent)
 
+#define for_each_domain_from_rq(sd_rq, __sd) \
+	for (__sd = sd_rq->sd; __sd; __sd = __sd->parent)
+
 #define for_each_lower_domain(sd) for (; sd; sd = sd->child)
@@ -879,6 +888,8 @@ extern const struct sched_class idle_sched_class;
 
 extern void trigger_load_balance(struct rq *rq, int cpu);
 extern void idle_balance(int this_cpu, struct rq *this_rq);
+extern void idle_exit(int this_cpu, struct rq *this_rq);
+extern void update_packing_domain(int cpu);
 
 #else	/* CONFIG_SMP */
 
@@ -886,6 +897,14 @@ static inline void idle_balance(int cpu, struct rq *rq)
 {
 }
 
+static inline void idle_exit(int this_cpu, struct rq *this_rq)
+{
+}
+
+static inline void update_packing_domain(int cpu)
+{
+}
+
 #endif
 
 extern void sysrq_sched_debug_show(void);
diff --git a/linaro/configs/big-LITTLE-MP.conf b/linaro/configs/big-LITTLE-MP.conf
new file mode 100644
index 00000000000..80bf45fa6e2
--- /dev/null
+++ b/linaro/configs/big-LITTLE-MP.conf
@@ -0,0 +1,5 @@
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_FAIR_GROUP_SCHED=y
+CONFIG_NO_HZ=y
+CONFIG_SCHED_MC=y
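The new config fragment is meant to be merged on top of a board defconfig rather than used alone. Assuming the kernel's standard merge helper and an example base config (the defconfig path is illustrative), an invocation would look like:

	scripts/kconfig/merge_config.sh -m arch/arm/configs/vexpress_defconfig \
		linaro/configs/big-LITTLE-MP.conf

CONFIG_NO_HZ=y enables the nohz idle-balance paths the packing code hooks, CONFIG_SCHED_MC=y activates the new SD_MC_INIT level on ARM, and the cgroup options exercise the group-scheduling paths this merge touches.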