From e4ad8f3d200098556152d83073e559254f6e9770 Mon Sep 17 00:00:00 2001
From: Alex Shi
Date: Mon, 3 Dec 2012 17:30:39 +0800
Subject: sched: give initial value for runnable avg of sched entities.

We need to initialize se.avg.{decay_count, load_avg_contrib} to zero
after a new task is forked; otherwise these fields hold random values
and cause trouble when the new task is first enqueued:

  enqueue_task_fair
    enqueue_entity
      enqueue_entity_load_avg

Signed-off-by: Alex Shi
---
 kernel/sched/core.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7f12624a393..8c1c9790c00 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1569,6 +1569,8 @@ static void __sched_fork(struct task_struct *p)
 #if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
 	p->se.avg.runnable_avg_period = 0;
 	p->se.avg.runnable_avg_sum = 0;
+	p->se.avg.decay_count = 0;
+	p->se.avg.load_avg_contrib = 0;
 #endif
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
--
cgit v1.2.3


From d910a638f41c3d32bf32c948636752f4a1075d66 Mon Sep 17 00:00:00 2001
From: Alex Shi
Date: Mon, 24 Dec 2012 09:56:41 +0800
Subject: sched: set initial load avg of new forked task

A new task has no runnable sum when it first becomes runnable, so its
runnable load is zero. If balancing uses the runnable load, a burst of
forks will pick only a few idle cpus and pile the new tasks onto them.
Set the initial load avg of a newly forked task to its load weight to
resolve this issue.

Signed-off-by: Alex Shi
Reviewed-by: Preeti U Murthy
---
 include/linux/sched.h |  1 +
 kernel/sched/core.c   |  2 +-
 kernel/sched/fair.c   | 11 +++++++++--
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d35d2b6ddbf..b13ab15932d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1034,6 +1034,7 @@ struct sched_domain;
 #else
 #define ENQUEUE_WAKING		0
 #endif
+#define ENQUEUE_NEWTASK		8
 
 #define DEQUEUE_SLEEP		1
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8c1c9790c00..876230c6479 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1716,7 +1716,7 @@ void wake_up_new_task(struct task_struct *p)
 #endif
 
 	rq = __task_rq_lock(p);
-	activate_task(rq, p, 0);
+	activate_task(rq, p, ENQUEUE_NEWTASK);
 	p->on_rq = 1;
 	trace_sched_wakeup_new(p, true);
 	check_preempt_curr(rq, p, WF_FORK);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a33e5986fc..211d94b8543 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1503,8 +1503,9 @@ static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 /* Add the load generated by se into cfs_rq's child load-average */
 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
 					   struct sched_entity *se,
-					   int wakeup)
+					   int flags)
 {
+	int wakeup = flags & ENQUEUE_WAKEUP;
 	/*
 	 * We track migrations using entity decay_count <= 0, on a wake-up
 	 * migration we use a negative decay count to track the remote decays
@@ -1538,6 +1539,12 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
 		update_entity_load_avg(se, 0);
 	}
 
+	/*
+	 * Set the initial load avg of a new task to its load weight, so
+	 * that a burst of forks does not leave a few cpus much heavier.
+	 */
+	if (flags & ENQUEUE_NEWTASK)
+		se->avg.load_avg_contrib = se->load.weight;
 	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
 	/* we force update consideration on load-balancer moves */
 	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
@@ -1699,7 +1706,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	/*
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
-	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
+	enqueue_entity_load_avg(cfs_rq, se, flags);
 	account_entity_enqueue(cfs_rq, se);
 	update_cfs_shares(cfs_rq);
--
cgit v1.2.3


From 0757750816d6506d94c754990618c0f68911c592 Mon Sep 17 00:00:00 2001
From: Alex Shi
Date: Wed, 23 Jan 2013 18:03:50 +0800
Subject: Revert "sched: Introduce temporary FAIR_GROUP_SCHED dependency for load-tracking"

Remove the CONFIG_FAIR_GROUP_SCHED dependency that guards the runnable
load-tracking fields, so the runnable load variables can be used
whenever SMP is enabled.

Signed-off-by: Alex Shi
---
 include/linux/sched.h |  8 +-------
 kernel/sched/core.c   |  7 +------
 kernel/sched/fair.c   | 13 ++-----------
 kernel/sched/sched.h  |  9 +--------
 4 files changed, 5 insertions(+), 32 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b13ab15932d..1c0dbebe481 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1161,13 +1161,7 @@ struct sched_entity {
 	struct cfs_rq		*my_q;
 #endif
 
-/*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
- * removed when useful for applications beyond shares distribution (e.g.
- * load-balance).
- */
-#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
-	/* Per-entity load-tracking */
+#ifdef CONFIG_SMP
 	struct sched_avg	avg;
 #endif
 };
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 876230c6479..7d995445955 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1561,12 +1561,7 @@ static void __sched_fork(struct task_struct *p)
 	p->se.vruntime = 0;
 	INIT_LIST_HEAD(&p->se.group_node);
 
-/*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
- * removed when useful for applications beyond shares distribution (e.g.
- * load-balance).
- */
-#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+#ifdef CONFIG_SMP
 	p->se.avg.runnable_avg_period = 0;
 	p->se.avg.runnable_avg_sum = 0;
 	p->se.avg.decay_count = 0;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 211d94b8543..ff8fea91045 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1109,8 +1109,7 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
-/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
-#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+#ifdef CONFIG_SMP
 /*
  * We choose a half-life close to 1 scheduling period.
  * Note: The tables below are dependent on this value.
@@ -3400,12 +3399,6 @@ unlock:
 	return new_cpu;
 }
 
-/*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
- * removed when useful for applications beyond shares distribution (e.g.
- * load-balance).
- */
-#ifdef CONFIG_FAIR_GROUP_SCHED
 /*
  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
  * cfs_rq_of(p) references at time of call are still valid and identify the
@@ -3429,7 +3422,6 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
 		atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
 	}
 }
-#endif
 #endif /* CONFIG_SMP */
 
 static unsigned long
@@ -6121,9 +6113,8 @@ const struct sched_class fair_sched_class = {
 
 #ifdef CONFIG_SMP
 	.select_task_rq		= select_task_rq_fair,
-#ifdef CONFIG_FAIR_GROUP_SCHED
 	.migrate_task_rq	= migrate_task_rq_fair,
-#endif
+
 	.rq_online		= rq_online_fair,
 	.rq_offline		= rq_offline_fair,
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cc03cfdf469..7f36024f5cb 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -227,12 +227,6 @@ struct cfs_rq {
 #endif
 
 #ifdef CONFIG_SMP
-/*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
- * removed when useful for applications beyond shares distribution (e.g.
- * load-balance).
- */
-#ifdef CONFIG_FAIR_GROUP_SCHED
 /*
  * CFS Load tracking
  * Under CFS, load is tracked on a per-entity basis and aggregated up.
@@ -242,8 +236,7 @@ struct cfs_rq {
 	u64 runnable_load_avg, blocked_load_avg;
 	atomic64_t decay_counter, removed_load;
 	u64 last_decay;
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-/* These always depend on CONFIG_FAIR_GROUP_SCHED */
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	u32 tg_runnable_contrib;
 	u64 tg_load_contrib;
--
cgit v1.2.3


From 530503a01ada45e028d532016ecc6b8d575e601a Mon Sep 17 00:00:00 2001
From: Alex Shi
Date: Thu, 3 Jan 2013 01:27:32 +0800
Subject: sched: update cpu load after task_tick.

To pick up the latest runnable load info, do the cpu load update after
task_tick.

Signed-off-by: Alex Shi
---
 kernel/sched/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7d995445955..695c9f4fa39 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2686,8 +2686,8 @@ void scheduler_tick(void)
 
 	raw_spin_lock(&rq->lock);
 	update_rq_clock(rq);
-	update_cpu_load_active(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
+	update_cpu_load_active(rq);
 	raw_spin_unlock(&rq->lock);
 
 	perf_event_task_tick();
--
cgit v1.2.3


From 93706cbbec3dbbadc6cb0c144ce3ced4aa5de98b Mon Sep 17 00:00:00 2001
From: Alex Shi
Date: Sat, 17 Nov 2012 13:56:11 +0800
Subject: sched: compute runnable load avg in cpu_load and cpu_avg_load_per_task

cpu_load and cpu_avg_load_per_task are the base values used in load
balancing. Update them with the rq runnable load average, so load
balancing naturally takes the runnable load avg into account.

Signed-off-by: Alex Shi
---
 kernel/sched/core.c | 4 ++--
 kernel/sched/fair.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 695c9f4fa39..2a593873a2d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2530,7 +2530,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
 void update_idle_cpu_load(struct rq *this_rq)
 {
 	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
-	unsigned long load = this_rq->load.weight;
+	unsigned long load = (unsigned long)this_rq->cfs.runnable_load_avg;
 	unsigned long pending_updates;
 
 	/*
@@ -2580,7 +2580,7 @@ static void update_cpu_load_active(struct rq *this_rq)
 	/*
 	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
 	 */
 	this_rq->last_load_update_tick = jiffies;
-	__update_cpu_load(this_rq, this_rq->load.weight, 1);
+	__update_cpu_load(this_rq, this_rq->cfs.runnable_load_avg, 1);
 	calc_load_account_active(this_rq);
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ff8fea91045..da77cb51cab 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2903,7 +2903,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 /* Used instead of source_load when we know the type == 0 */
 static unsigned long weighted_cpuload(const int cpu)
 {
-	return cpu_rq(cpu)->load.weight;
+	return (unsigned long)cpu_rq(cpu)->cfs.runnable_load_avg;
 }
 
 /*
@@ -2950,7 +2950,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
 
 	if (nr_running)
-		return rq->load.weight / nr_running;
+		return (unsigned long)rq->cfs.runnable_load_avg / nr_running;
 
 	return 0;
 }
--
cgit v1.2.3


From 12e191a189226d7a15f859f869b525b6dbd0dd15 Mon Sep 17 00:00:00 2001
From: Alex Shi
Date: Mon, 3 Dec 2012 23:00:53 +0800
Subject: sched: consider runnable load average in move_tasks

Besides providing the runnable load average in the background,
move_tasks is also a key function in load balance. We need to consider
the runnable load average there as well, to get an apples-to-apples
load comparison.

Signed-off-by: Alex Shi
---
 kernel/sched/fair.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index da77cb51cab..96a4ddce4b8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3969,6 +3969,15 @@ static unsigned long task_h_load(struct task_struct *p);
 
 static const unsigned int sched_nr_migrate_break = 32;
 
+static unsigned long task_h_load_avg(struct task_struct *p)
+{
+	u32 period = p->se.avg.runnable_avg_period;
+	if (!period)
+		return 0;
+
+	return task_h_load(p) * p->se.avg.runnable_avg_sum / period;
+}
+
 /*
  * move_tasks tries to move up to imbalance weighted load from busiest to
  * this_rq, as part of a balancing operation within domain "sd".
@@ -4004,7 +4013,7 @@ static int move_tasks(struct lb_env *env)
 		if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
 			goto next;
 
-		load = task_h_load(p);
+		load = task_h_load_avg(p);
 
 		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
 			goto next;
--
cgit v1.2.3


From 56f2c5357692853ea09b7172fa203b9ede0c99fa Mon Sep 17 00:00:00 2001
From: Alex Shi
Date: Wed, 19 Dec 2012 17:11:14 +0800
Subject: sched: consider runnable load average in effective_load

effective_load() calculates the load change as seen from the
root_task_group. It needs to take the runnable average of the changed
task into account. Thanks to Morten Rasmussen for the reminder.

Signed-off-by: Alex Shi
---
 kernel/sched/fair.c | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 96a4ddce4b8..cf97b67fded 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2979,7 +2979,8 @@ static void task_waking_fair(struct task_struct *p)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /*
- * effective_load() calculates the load change as seen from the root_task_group
+ * effective_load() calculates the runnable load average change as seen from
+ * the root_task_group
 *
 * Adding load to a group doesn't make a group heavier, but can cause movement
 * of group shares between cpus. Assuming the shares were perfectly aligned one
@@ -3027,6 +3028,9 @@ static void task_waking_fair(struct task_struct *p)
 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
 * 4/7) times the weight of the group.
+ *
+ * After effective_load() computes the load change being moved, the sched
+ * entity's runnable average is factored into the result.
 */
 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
@@ -3101,6 +3105,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 	struct task_group *tg;
 	unsigned long weight;
 	int balanced;
+	int runnable_avg;
 
 	idx = sd->wake_idx;
 	this_cpu = smp_processor_id();
@@ -3116,13 +3121,19 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 	if (sync) {
 		tg = task_group(current);
 		weight = current->se.load.weight;
+		runnable_avg = current->se.avg.runnable_avg_sum * NICE_0_LOAD
+				/ (current->se.avg.runnable_avg_period + 1);
 
-		this_load += effective_load(tg, this_cpu, -weight, -weight);
-		load += effective_load(tg, prev_cpu, 0, -weight);
+		this_load += effective_load(tg, this_cpu, -weight, -weight)
+				* runnable_avg >> NICE_0_SHIFT;
+		load += effective_load(tg, prev_cpu, 0, -weight)
+				* runnable_avg >> NICE_0_SHIFT;
 	}
 
 	tg = task_group(p);
 	weight = p->se.load.weight;
+	runnable_avg = p->se.avg.runnable_avg_sum * NICE_0_LOAD
+			/ (p->se.avg.runnable_avg_period + 1);
 
 	/*
 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
@@ -3134,16 +3145,18 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 	 * task to be woken on this_cpu.
 	 */
 	if (this_load > 0) {
-		s64 this_eff_load, prev_eff_load;
+		s64 this_eff_load, prev_eff_load, tmp_eff_load;
 
 		this_eff_load = 100;
 		this_eff_load *= power_of(prev_cpu);
-		this_eff_load *= this_load +
-			effective_load(tg, this_cpu, weight, weight);
+		tmp_eff_load = effective_load(tg, this_cpu, weight, weight)
+				* runnable_avg >> NICE_0_SHIFT;
+		this_eff_load *= this_load + tmp_eff_load;
 
 		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
 		prev_eff_load *= power_of(this_cpu);
-		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
+		prev_eff_load *= load + (effective_load(tg, prev_cpu, 0, weight)
+				* runnable_avg >> NICE_0_SHIFT);
 
 		balanced = this_eff_load <= prev_eff_load;
 	} else
--
cgit v1.2.3
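
A note on the arithmetic shared by task_h_load_avg() and the wake_affine()
changes above: a load value is weighted by the entity's runnable fraction,
runnable_avg_sum / runnable_avg_period, either as a direct ratio or routed
through NICE_0_LOAD fixed point and shifted back down. The following is a
minimal user-space sketch of that calculation only; the constants are defined
locally and the sample values are made up for illustration, so this is not
scheduler code.

#include <stdio.h>
#include <stdint.h>

#define NICE_0_SHIFT	10
#define NICE_0_LOAD	(1UL << NICE_0_SHIFT)	/* 1024, fixed-point "1.0" */

/* Weight a raw load by the entity's runnable fraction (direct ratio form). */
static unsigned long scale_by_runnable(unsigned long load,
				       uint32_t runnable_avg_sum,
				       uint32_t runnable_avg_period)
{
	if (!runnable_avg_period)
		return 0;
	return load * runnable_avg_sum / runnable_avg_period;
}

int main(void)
{
	/* Hypothetical sample values: a nice-0 task runnable ~75% of the time. */
	unsigned long weight = NICE_0_LOAD;
	uint32_t sum = 35766, period = 47688;

	/* Direct ratio, as task_h_load_avg() does with task_h_load(p). */
	printf("load avg (ratio form): %lu\n",
	       scale_by_runnable(weight, sum, period));

	/*
	 * Fixed-point form, as in the wake_affine() change:
	 * runnable_avg = sum * NICE_0_LOAD / (period + 1), then the scaled
	 * product is shifted right by NICE_0_SHIFT.
	 */
	unsigned long runnable_avg = (unsigned long)sum * NICE_0_LOAD / (period + 1);
	printf("load avg (fixed-point form): %lu\n",
	       weight * runnable_avg >> NICE_0_SHIFT);
	return 0;
}

Both forms give essentially the same result; the "+ 1" in the divisor only
guards against a zero runnable_avg_period, at the cost of a tiny rounding
difference.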