aboutsummaryrefslogtreecommitdiff
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--kernel/sched/fair.c1130
1 files changed, 968 insertions, 162 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b800a14b99..5eea8707234 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,9 @@
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/migrate.h>
+#include <linux/task_work.h>
#include <trace/events/sched.h>
@@ -259,6 +262,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return grp->my_q;
}
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
+ int force_update);
+
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (!cfs_rq->on_list) {
@@ -278,6 +284,8 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
}
cfs_rq->on_list = 1;
+ /* We should have no load, but we need to update last_decay. */
+ update_cfs_rq_blocked_load(cfs_rq, 0);
}
}
@@ -653,9 +661,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
-static void update_cfs_shares(struct cfs_rq *cfs_rq);
-
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
@@ -675,10 +680,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
curr->vruntime += delta_exec_weighted;
update_min_vruntime(cfs_rq);
-
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
- cfs_rq->load_unacc_exec_time += delta_exec;
-#endif
}
static void update_curr(struct cfs_rq *cfs_rq)
@@ -776,6 +777,230 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods:
*/
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * numa task sample period in ms
+ */
+unsigned int sysctl_numa_balancing_scan_period_min = 100;
+unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
+unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
+
+/* Portion of address space to scan in MB */
+unsigned int sysctl_numa_balancing_scan_size = 256;
+
+/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
+unsigned int sysctl_numa_balancing_scan_delay = 1000;
+
+static void task_numa_placement(struct task_struct *p)
+{
+ int seq;
+
+ if (!p->mm) /* for example, ksmd faulting in a user's mm */
+ return;
+ seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+ if (p->numa_scan_seq == seq)
+ return;
+ p->numa_scan_seq = seq;
+
+ /* FIXME: Scheduling placement policy hints go here */
+}
+
+/*
+ * Got a PROT_NONE fault for a page on @node.
+ */
+void task_numa_fault(int node, int pages, bool migrated)
+{
+ struct task_struct *p = current;
+
+ if (!sched_feat_numa(NUMA))
+ return;
+
+ /* FIXME: Allocate task-specific structure for placement policy here */
+
+ /*
+ * If pages are properly placed (did not migrate) then scan slower.
+ * This is reset periodically in case of phase changes
+ */
+ if (!migrated)
+ p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
+ p->numa_scan_period + jiffies_to_msecs(10));
+
+ task_numa_placement(p);
+}
+
+static void reset_ptenuma_scan(struct task_struct *p)
+{
+ ACCESS_ONCE(p->mm->numa_scan_seq)++;
+ p->mm->numa_scan_offset = 0;
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_work(struct callback_head *work)
+{
+ unsigned long migrate, next_scan, now = jiffies;
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+ struct vm_area_struct *vma;
+ unsigned long start, end;
+ long pages;
+
+ WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+
+ work->next = work; /* protect against double add */
+ /*
+ * Who cares about NUMA placement when they're dying.
+ *
+ * NOTE: make sure not to dereference p->mm before this check,
+ * exit_task_work() happens _after_ exit_mm() so we could be called
+ * without p->mm even though we still had it when we enqueued this
+ * work.
+ */
+ if (p->flags & PF_EXITING)
+ return;
+
+ /*
+ * We do not care about task placement until a task runs on a node
+ * other than the first one used by the address space. This is
+ * largely because migrations are driven by what CPU the task
+ * is running on. If it's never scheduled on another node, it'll
+ * not migrate so why bother trapping the fault.
+ */
+ if (mm->first_nid == NUMA_PTE_SCAN_INIT)
+ mm->first_nid = numa_node_id();
+ if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
+ /* Are we running on a new node yet? */
+ if (numa_node_id() == mm->first_nid &&
+ !sched_feat_numa(NUMA_FORCE))
+ return;
+
+ mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
+ }
+
+ /*
+ * Reset the scan period if enough time has gone by. Objective is that
+ * scanning will be reduced if pages are properly placed. As tasks
+ * can enter different phases this needs to be re-examined. Lacking
+ * proper tracking of reference behaviour, this blunt hammer is used.
+ */
+ migrate = mm->numa_next_reset;
+ if (time_after(now, migrate)) {
+ p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+ next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
+ xchg(&mm->numa_next_reset, next_scan);
+ }
+
+ /*
+ * Enforce maximal scan/migration frequency..
+ */
+ migrate = mm->numa_next_scan;
+ if (time_before(now, migrate))
+ return;
+
+ if (p->numa_scan_period == 0)
+ p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+
+ next_scan = now + msecs_to_jiffies(p->numa_scan_period);
+ if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+ return;
+
+ /*
+ * Do not set pte_numa if the current running node is rate-limited.
+ * This loses statistics on the fault but if we are unwilling to
+ * migrate to this node, it is less likely we can do useful work
+ */
+ if (migrate_ratelimited(numa_node_id()))
+ return;
+
+ start = mm->numa_scan_offset;
+ pages = sysctl_numa_balancing_scan_size;
+ pages <<= 20 - PAGE_SHIFT; /* MB in pages */
+ if (!pages)
+ return;
+
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, start);
+ if (!vma) {
+ reset_ptenuma_scan(p);
+ start = 0;
+ vma = mm->mmap;
+ }
+ for (; vma; vma = vma->vm_next) {
+ if (!vma_migratable(vma))
+ continue;
+
+ /* Skip small VMAs. They are not likely to be of relevance */
+ if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
+ continue;
+
+ do {
+ start = max(start, vma->vm_start);
+ end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
+ end = min(end, vma->vm_end);
+ pages -= change_prot_numa(vma, start, end);
+
+ start = end;
+ if (pages <= 0)
+ goto out;
+ } while (end != vma->vm_end);
+ }
+
+out:
+ /*
+ * It is possible to reach the end of the VMA list but the last few VMAs are
+ * not guaranteed to the vma_migratable. If they are not, we would find the
+ * !migratable VMA on the next scan but not reset the scanner to the start
+ * so check it now.
+ */
+ if (vma)
+ mm->numa_scan_offset = start;
+ else
+ reset_ptenuma_scan(p);
+ up_read(&mm->mmap_sem);
+}
+
+/*
+ * Drive the periodic memory faults..
+ */
+void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+ struct callback_head *work = &curr->numa_work;
+ u64 period, now;
+
+ /*
+ * We don't care about NUMA placement if we don't have memory.
+ */
+ if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+ return;
+
+ /*
+ * Using runtime rather than walltime has the dual advantage that
+ * we (mostly) drive the selection from busy threads and that the
+ * task needs to have done some actual work before we bother with
+ * NUMA placement.
+ */
+ now = curr->se.sum_exec_runtime;
+ period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
+
+ if (now - curr->node_stamp > period) {
+ if (!curr->node_stamp)
+ curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+ curr->node_stamp = now;
+
+ if (!time_before(jiffies, curr->mm->numa_next_scan)) {
+ init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+ task_work_add(curr, work, true);
+ }
+ }
+}
+#else
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -801,72 +1026,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-/* we need this in update_cfs_load and load-balance functions below */
-static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
# ifdef CONFIG_SMP
-static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
- int global_update)
-{
- struct task_group *tg = cfs_rq->tg;
- long load_avg;
-
- load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
- load_avg -= cfs_rq->load_contribution;
-
- if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
- atomic_add(load_avg, &tg->load_weight);
- cfs_rq->load_contribution += load_avg;
- }
-}
-
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
- u64 period = sysctl_sched_shares_window;
- u64 now, delta;
- unsigned long load = cfs_rq->load.weight;
-
- if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
- return;
-
- now = rq_of(cfs_rq)->clock_task;
- delta = now - cfs_rq->load_stamp;
-
- /* truncate load history at 4 idle periods */
- if (cfs_rq->load_stamp > cfs_rq->load_last &&
- now - cfs_rq->load_last > 4 * period) {
- cfs_rq->load_period = 0;
- cfs_rq->load_avg = 0;
- delta = period - 1;
- }
-
- cfs_rq->load_stamp = now;
- cfs_rq->load_unacc_exec_time = 0;
- cfs_rq->load_period += delta;
- if (load) {
- cfs_rq->load_last = now;
- cfs_rq->load_avg += delta * load;
- }
-
- /* consider updating load contribution on each fold or truncate */
- if (global_update || cfs_rq->load_period > period
- || !cfs_rq->load_period)
- update_cfs_rq_load_contribution(cfs_rq, global_update);
-
- while (cfs_rq->load_period > period) {
- /*
- * Inline assembly required to prevent the compiler
- * optimising this loop into a divmod call.
- * See __iter_div_u64_rem() for another example of this.
- */
- asm("" : "+rm" (cfs_rq->load_period));
- cfs_rq->load_period /= 2;
- cfs_rq->load_avg /= 2;
- }
-
- if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
- list_del_leaf_cfs_rq(cfs_rq);
-}
-
static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
{
long tg_weight;
@@ -876,8 +1036,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
* to gain a more accurate current total weight. See
* update_cfs_rq_load_contribution().
*/
- tg_weight = atomic_read(&tg->load_weight);
- tg_weight -= cfs_rq->load_contribution;
+ tg_weight = atomic64_read(&tg->load_avg);
+ tg_weight -= cfs_rq->tg_load_contrib;
tg_weight += cfs_rq->load.weight;
return tg_weight;
@@ -901,27 +1061,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
return shares;
}
-
-static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
- if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
- update_cfs_load(cfs_rq, 0);
- update_cfs_shares(cfs_rq);
- }
-}
# else /* CONFIG_SMP */
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
-}
-
static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
return tg->shares;
}
-
-static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
-}
# endif /* CONFIG_SMP */
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
@@ -939,6 +1083,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
account_entity_enqueue(cfs_rq, se);
}
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
+
static void update_cfs_shares(struct cfs_rq *cfs_rq)
{
struct task_group *tg;
@@ -958,18 +1104,477 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
reweight_entity(cfs_rq_of(se), se, shares);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
{
}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+/*
+ * We choose a half-life close to 1 scheduling period.
+ * Note: The tables below are dependent on this value.
+ */
+#define LOAD_AVG_PERIOD 32
+#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
+#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
+
+/* Precomputed fixed inverse multiplies for multiplication by y^n */
+static const u32 runnable_avg_yN_inv[] = {
+ 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
+ 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
+ 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
+ 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
+ 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
+ 0x85aac367, 0x82cd8698,
+};
+
+/*
+ * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
+ * over-estimates when re-combining.
+ */
+static const u32 runnable_avg_yN_sum[] = {
+ 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
+ 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
+ 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
+};
+
+/*
+ * Approximate:
+ * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
+ */
+static __always_inline u64 decay_load(u64 val, u64 n)
{
+ unsigned int local_n;
+
+ if (!n)
+ return val;
+ else if (unlikely(n > LOAD_AVG_PERIOD * 63))
+ return 0;
+
+ /* after bounds checking we can collapse to 32-bit */
+ local_n = n;
+
+ /*
+ * As y^PERIOD = 1/2, we can combine
+ * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
+ * With a look-up table which covers k^n (n<PERIOD)
+ *
+ * To achieve constant time decay_load.
+ */
+ if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
+ val >>= local_n / LOAD_AVG_PERIOD;
+ local_n %= LOAD_AVG_PERIOD;
+ }
+
+ val *= runnable_avg_yN_inv[local_n];
+ /* We don't use SRR here since we always want to round down. */
+ return val >> 32;
}
-static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+/*
+ * For updates fully spanning n periods, the contribution to runnable
+ * average will be: \Sum 1024*y^n
+ *
+ * We can compute this reasonably efficiently by combining:
+ * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
+ */
+static u32 __compute_runnable_contrib(u64 n)
{
+ u32 contrib = 0;
+
+ if (likely(n <= LOAD_AVG_PERIOD))
+ return runnable_avg_yN_sum[n];
+ else if (unlikely(n >= LOAD_AVG_MAX_N))
+ return LOAD_AVG_MAX;
+
+ /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
+ do {
+ contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
+ contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
+
+ n -= LOAD_AVG_PERIOD;
+ } while (n > LOAD_AVG_PERIOD);
+
+ contrib = decay_load(contrib, n);
+ return contrib + runnable_avg_yN_sum[n];
}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+/*
+ * We can represent the historical contribution to runnable average as the
+ * coefficients of a geometric series. To do this we sub-divide our runnable
+ * history into segments of approximately 1ms (1024us); label the segment that
+ * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
+ *
+ * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
+ * p0 p1 p2
+ * (now) (~1ms ago) (~2ms ago)
+ *
+ * Let u_i denote the fraction of p_i that the entity was runnable.
+ *
+ * We then designate the fractions u_i as our co-efficients, yielding the
+ * following representation of historical load:
+ * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
+ *
+ * We choose y based on the with of a reasonably scheduling period, fixing:
+ * y^32 = 0.5
+ *
+ * This means that the contribution to load ~32ms ago (u_32) will be weighted
+ * approximately half as much as the contribution to load within the last ms
+ * (u_0).
+ *
+ * When a period "rolls over" and we have new u_0`, multiplying the previous
+ * sum again by y is sufficient to update:
+ * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
+ * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
+ */
+static __always_inline int __update_entity_runnable_avg(u64 now,
+ struct sched_avg *sa,
+ int runnable)
+{
+ u64 delta, periods;
+ u32 runnable_contrib;
+ int delta_w, decayed = 0;
+
+ delta = now - sa->last_runnable_update;
+ /*
+ * This should only happen when time goes backwards, which it
+ * unfortunately does during sched clock init when we swap over to TSC.
+ */
+ if ((s64)delta < 0) {
+ sa->last_runnable_update = now;
+ return 0;
+ }
+
+ /*
+ * Use 1024ns as the unit of measurement since it's a reasonable
+ * approximation of 1us and fast to compute.
+ */
+ delta >>= 10;
+ if (!delta)
+ return 0;
+ sa->last_runnable_update = now;
+
+ /* delta_w is the amount already accumulated against our next period */
+ delta_w = sa->runnable_avg_period % 1024;
+ if (delta + delta_w >= 1024) {
+ /* period roll-over */
+ decayed = 1;
+
+ /*
+ * Now that we know we're crossing a period boundary, figure
+ * out how much from delta we need to complete the current
+ * period and accrue it.
+ */
+ delta_w = 1024 - delta_w;
+ if (runnable)
+ sa->runnable_avg_sum += delta_w;
+ sa->runnable_avg_period += delta_w;
+
+ delta -= delta_w;
+
+ /* Figure out how many additional periods this update spans */
+ periods = delta / 1024;
+ delta %= 1024;
+
+ sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
+ periods + 1);
+ sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
+ periods + 1);
+
+ /* Efficiently calculate \sum (1..n_period) 1024*y^i */
+ runnable_contrib = __compute_runnable_contrib(periods);
+ if (runnable)
+ sa->runnable_avg_sum += runnable_contrib;
+ sa->runnable_avg_period += runnable_contrib;
+ }
+
+ /* Remainder of delta accrued against u_0` */
+ if (runnable)
+ sa->runnable_avg_sum += delta;
+ sa->runnable_avg_period += delta;
+
+ return decayed;
+}
+
+/* Synchronize an entity's decay with its parenting cfs_rq.*/
+static inline u64 __synchronize_entity_decay(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 decays = atomic64_read(&cfs_rq->decay_counter);
+
+ decays -= se->avg.decay_count;
+ if (!decays)
+ return 0;
+
+ se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
+ se->avg.decay_count = 0;
+
+ return decays;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
+ int force_update)
+{
+ struct task_group *tg = cfs_rq->tg;
+ s64 tg_contrib;
+
+ tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
+ tg_contrib -= cfs_rq->tg_load_contrib;
+
+ if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
+ atomic64_add(tg_contrib, &tg->load_avg);
+ cfs_rq->tg_load_contrib += tg_contrib;
+ }
+}
+
+/*
+ * Aggregate cfs_rq runnable averages into an equivalent task_group
+ * representation for computing load contributions.
+ */
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+ struct cfs_rq *cfs_rq)
+{
+ struct task_group *tg = cfs_rq->tg;
+ long contrib;
+
+ /* The fraction of a cpu used by this cfs_rq */
+ contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
+ sa->runnable_avg_period + 1);
+ contrib -= cfs_rq->tg_runnable_contrib;
+
+ if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
+ atomic_add(contrib, &tg->runnable_avg);
+ cfs_rq->tg_runnable_contrib += contrib;
+ }
+}
+
+static inline void __update_group_entity_contrib(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
+ struct task_group *tg = cfs_rq->tg;
+ int runnable_avg;
+
+ u64 contrib;
+
+ contrib = cfs_rq->tg_load_contrib * tg->shares;
+ se->avg.load_avg_contrib = div64_u64(contrib,
+ atomic64_read(&tg->load_avg) + 1);
+
+ /*
+ * For group entities we need to compute a correction term in the case
+ * that they are consuming <1 cpu so that we would contribute the same
+ * load as a task of equal weight.
+ *
+ * Explicitly co-ordinating this measurement would be expensive, but
+ * fortunately the sum of each cpus contribution forms a usable
+ * lower-bound on the true value.
+ *
+ * Consider the aggregate of 2 contributions. Either they are disjoint
+ * (and the sum represents true value) or they are disjoint and we are
+ * understating by the aggregate of their overlap.
+ *
+ * Extending this to N cpus, for a given overlap, the maximum amount we
+ * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
+ * cpus that overlap for this interval and w_i is the interval width.
+ *
+ * On a small machine; the first term is well-bounded which bounds the
+ * total error since w_i is a subset of the period. Whereas on a
+ * larger machine, while this first term can be larger, if w_i is the
+ * of consequential size guaranteed to see n_i*w_i quickly converge to
+ * our upper bound of 1-cpu.
+ */
+ runnable_avg = atomic_read(&tg->runnable_avg);
+ if (runnable_avg < NICE_0_LOAD) {
+ se->avg.load_avg_contrib *= runnable_avg;
+ se->avg.load_avg_contrib >>= NICE_0_SHIFT;
+ }
+}
+#else
+static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
+ int force_update) {}
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+ struct cfs_rq *cfs_rq) {}
+static inline void __update_group_entity_contrib(struct sched_entity *se) {}
+#endif
+
+static inline void __update_task_entity_contrib(struct sched_entity *se)
+{
+ u32 contrib;
+
+ /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
+ contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
+ contrib /= (se->avg.runnable_avg_period + 1);
+ se->avg.load_avg_contrib = scale_load(contrib);
+}
+
+/* Compute the current contribution to load_avg by se, return any delta */
+static long __update_entity_load_avg_contrib(struct sched_entity *se)
+{
+ long old_contrib = se->avg.load_avg_contrib;
+
+ if (entity_is_task(se)) {
+ __update_task_entity_contrib(se);
+ } else {
+ __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
+ __update_group_entity_contrib(se);
+ }
+
+ return se->avg.load_avg_contrib - old_contrib;
+}
+
+static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
+ long load_contrib)
+{
+ if (likely(load_contrib < cfs_rq->blocked_load_avg))
+ cfs_rq->blocked_load_avg -= load_contrib;
+ else
+ cfs_rq->blocked_load_avg = 0;
+}
+
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+
+/* Update a sched_entity's runnable average */
+static inline void update_entity_load_avg(struct sched_entity *se,
+ int update_cfs_rq)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ long contrib_delta;
+ u64 now;
+
+ /*
+ * For a group entity we need to use their owned cfs_rq_clock_task() in
+ * case they are the parent of a throttled hierarchy.
+ */
+ if (entity_is_task(se))
+ now = cfs_rq_clock_task(cfs_rq);
+ else
+ now = cfs_rq_clock_task(group_cfs_rq(se));
+
+ if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
+ return;
+
+ contrib_delta = __update_entity_load_avg_contrib(se);
+
+ if (!update_cfs_rq)
+ return;
+
+ if (se->on_rq)
+ cfs_rq->runnable_load_avg += contrib_delta;
+ else
+ subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+}
+
+/*
+ * Decay the load contributed by all blocked children and account this so that
+ * their contribution may appropriately discounted when they wake up.
+ */
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
+{
+ u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
+ u64 decays;
+
+ decays = now - cfs_rq->last_decay;
+ if (!decays && !force_update)
+ return;
+
+ if (atomic64_read(&cfs_rq->removed_load)) {
+ u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
+ subtract_blocked_load_contrib(cfs_rq, removed_load);
+ }
+
+ if (decays) {
+ cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
+ decays);
+ atomic64_add(decays, &cfs_rq->decay_counter);
+ cfs_rq->last_decay = now;
+ }
+
+ __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
+}
+
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
+{
+ __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
+ __update_tg_runnable_avg(&rq->avg, &rq->cfs);
+}
+
+/* Add the load generated by se into cfs_rq's child load-average */
+static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int wakeup)
+{
+ /*
+ * We track migrations using entity decay_count <= 0, on a wake-up
+ * migration we use a negative decay count to track the remote decays
+ * accumulated while sleeping.
+ */
+ if (unlikely(se->avg.decay_count <= 0)) {
+ se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
+ if (se->avg.decay_count) {
+ /*
+ * In a wake-up migration we have to approximate the
+ * time sleeping. This is because we can't synchronize
+ * clock_task between the two cpus, and it is not
+ * guaranteed to be read-safe. Instead, we can
+ * approximate this using our carried decays, which are
+ * explicitly atomically readable.
+ */
+ se->avg.last_runnable_update -= (-se->avg.decay_count)
+ << 20;
+ update_entity_load_avg(se, 0);
+ /* Indicate that we're now synchronized and on-rq */
+ se->avg.decay_count = 0;
+ }
+ wakeup = 0;
+ } else {
+ __synchronize_entity_decay(se);
+ }
+
+ /* migrated tasks did not contribute to our blocked load */
+ if (wakeup) {
+ subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
+ update_entity_load_avg(se, 0);
+ }
+
+ cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+ /* we force update consideration on load-balancer moves */
+ update_cfs_rq_blocked_load(cfs_rq, !wakeup);
+}
+
+/*
+ * Remove se's load from this cfs_rq child load-average, if the entity is
+ * transitioning to a blocked state we track its projected decay using
+ * blocked_load_avg.
+ */
+static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int sleep)
+{
+ update_entity_load_avg(se, 1);
+ /* we force update consideration on load-balancer moves */
+ update_cfs_rq_blocked_load(cfs_rq, !sleep);
+
+ cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
+ if (sleep) {
+ cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
+ se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+ } /* migrations, e.g. sleep=0 leave decay_count == 0 */
+}
+#else
+static inline void update_entity_load_avg(struct sched_entity *se,
+ int update_cfs_rq) {}
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
+static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int wakeup) {}
+static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int sleep) {}
+static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
+ int force_update) {}
+#endif
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -1096,7 +1701,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
- update_cfs_load(cfs_rq, 0);
+ enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
account_entity_enqueue(cfs_rq, se);
update_cfs_shares(cfs_rq);
@@ -1171,6 +1776,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+ dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
update_stats_dequeue(cfs_rq, se);
if (flags & DEQUEUE_SLEEP) {
@@ -1191,7 +1797,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->on_rq = 0;
- update_cfs_load(cfs_rq, 0);
account_entity_dequeue(cfs_rq, se);
/*
@@ -1340,6 +1945,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
+ /* in !on_rq case, update occurred at dequeue */
+ update_entity_load_avg(prev, 1);
}
cfs_rq->curr = NULL;
}
@@ -1353,9 +1960,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
update_curr(cfs_rq);
/*
- * Update share accounting for long-running entities.
+ * Ensure that runnable average is periodically updated.
*/
- update_entity_shares_tick(cfs_rq);
+ update_entity_load_avg(curr, 1);
+ update_cfs_rq_blocked_load(cfs_rq, 1);
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -1448,6 +2056,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
return &tg->cfs_bandwidth;
}
+/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
+{
+ if (unlikely(cfs_rq->throttle_count))
+ return cfs_rq->throttled_clock_task;
+
+ return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
+}
+
/* returns 0 on failure to allocate runtime */
static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
@@ -1592,14 +2209,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
cfs_rq->throttle_count--;
#ifdef CONFIG_SMP
if (!cfs_rq->throttle_count) {
- u64 delta = rq->clock_task - cfs_rq->load_stamp;
-
- /* leaving throttled state, advance shares averaging windows */
- cfs_rq->load_stamp += delta;
- cfs_rq->load_last += delta;
-
- /* update entity weight now that we are on_rq again */
- update_cfs_shares(cfs_rq);
+ /* adjust cfs_rq_clock_task() */
+ cfs_rq->throttled_clock_task_time += rq->clock_task -
+ cfs_rq->throttled_clock_task;
}
#endif
@@ -1611,9 +2223,9 @@ static int tg_throttle_down(struct task_group *tg, void *data)
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
- /* group is entering throttled state, record last load */
+ /* group is entering throttled state, stop time */
if (!cfs_rq->throttle_count)
- update_cfs_load(cfs_rq, 0);
+ cfs_rq->throttled_clock_task = rq->clock_task;
cfs_rq->throttle_count++;
return 0;
@@ -1628,7 +2240,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
- /* account load preceding throttle */
+ /* freeze hierarchy runnable averages while throttled */
rcu_read_lock();
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
@@ -1652,7 +2264,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
rq->nr_running -= task_delta;
cfs_rq->throttled = 1;
- cfs_rq->throttled_timestamp = rq->clock;
+ cfs_rq->throttled_clock = rq->clock;
raw_spin_lock(&cfs_b->lock);
list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
raw_spin_unlock(&cfs_b->lock);
@@ -1670,10 +2282,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->throttled = 0;
raw_spin_lock(&cfs_b->lock);
- cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
+ cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
list_del_rcu(&cfs_rq->throttled_list);
raw_spin_unlock(&cfs_b->lock);
- cfs_rq->throttled_timestamp = 0;
update_rq_clock(rq);
/* update hierarchical throttle state */
@@ -2073,8 +2684,13 @@ static void unthrottle_offline_cfs_rqs(struct rq *rq)
}
#else /* CONFIG_CFS_BANDWIDTH */
-static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {}
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
+{
+ return rq_of(cfs_rq)->clock_task;
+}
+
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+ unsigned long delta_exec) {}
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -2207,12 +2823,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_cfs_load(cfs_rq, 0);
update_cfs_shares(cfs_rq);
+ update_entity_load_avg(se, 1);
}
- if (!se)
+ if (!se) {
+ update_rq_runnable_avg(rq, rq->nr_running);
inc_nr_running(rq);
+ }
hrtick_update(rq);
}
@@ -2266,12 +2884,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_cfs_load(cfs_rq, 0);
update_cfs_shares(cfs_rq);
+ update_entity_load_avg(se, 1);
}
- if (!se)
+ if (!se) {
dec_nr_running(rq);
+ update_rq_runnable_avg(rq, 1);
+ }
hrtick_update(rq);
}
@@ -2781,6 +3401,37 @@ unlock:
return new_cpu;
}
+
+/*
+ * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
+ * removed when useful for applications beyond shares distribution (e.g.
+ * load-balance).
+ */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
+ * cfs_rq_of(p) references at time of call are still valid and identify the
+ * previous cpu. However, the caller only guarantees p->pi_lock is held; no
+ * other assumptions, including the state of rq->lock, should be made.
+ */
+static void
+migrate_task_rq_fair(struct task_struct *p, int next_cpu)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ /*
+ * Load tracking: accumulate removed load so that it can be processed
+ * when we next update owning cfs_rq under rq->lock. Tasks contribute
+ * to blocked load iff they have a positive decay-count. It can never
+ * be negative here since on-rq tasks have decay-count == 0.
+ */
+ if (se->avg.decay_count) {
+ se->avg.decay_count = -__synchronize_entity_decay(se);
+ atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
+ }
+}
+#endif
#endif /* CONFIG_SMP */
static unsigned long
@@ -2907,7 +3558,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
* Batch and idle tasks do not preempt non-idle tasks (their preemption
* is driven by the tick):
*/
- if (unlikely(p->policy != SCHED_NORMAL))
+ if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
return;
find_matching_se(&se, &pse);
@@ -3033,8 +3684,122 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
#ifdef CONFIG_SMP
/**************************************************
- * Fair scheduling class load-balancing methods:
- */
+ * Fair scheduling class load-balancing methods.
+ *
+ * BASICS
+ *
+ * The purpose of load-balancing is to achieve the same basic fairness the
+ * per-cpu scheduler provides, namely provide a proportional amount of compute
+ * time to each task. This is expressed in the following equation:
+ *
+ * W_i,n/P_i == W_j,n/P_j for all i,j (1)
+ *
+ * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
+ * W_i,0 is defined as:
+ *
+ * W_i,0 = \Sum_j w_i,j (2)
+ *
+ * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
+ * is derived from the nice value as per prio_to_weight[].
+ *
+ * The weight average is an exponential decay average of the instantaneous
+ * weight:
+ *
+ * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
+ *
+ * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
+ * fraction of 'recent' time available for SCHED_OTHER task execution. But it
+ * can also include other factors [XXX].
+ *
+ * To achieve this balance we define a measure of imbalance which follows
+ * directly from (1):
+ *
+ * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
+ *
+ * We them move tasks around to minimize the imbalance. In the continuous
+ * function space it is obvious this converges, in the discrete case we get
+ * a few fun cases generally called infeasible weight scenarios.
+ *
+ * [XXX expand on:
+ * - infeasible weights;
+ * - local vs global optima in the discrete case. ]
+ *
+ *
+ * SCHED DOMAINS
+ *
+ * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
+ * for all i,j solution, we create a tree of cpus that follows the hardware
+ * topology where each level pairs two lower groups (or better). This results
+ * in O(log n) layers. Furthermore we reduce the number of cpus going up the
+ * tree to only the first of the previous level and we decrease the frequency
+ * of load-balance at each level inv. proportional to the number of cpus in
+ * the groups.
+ *
+ * This yields:
+ *
+ * log_2 n 1 n
+ * \Sum { --- * --- * 2^i } = O(n) (5)
+ * i = 0 2^i 2^i
+ * `- size of each group
+ * | | `- number of cpus doing load-balance
+ * | `- freq
+ * `- sum over all levels
+ *
+ * Coupled with a limit on how many tasks we can migrate every balance pass,
+ * this makes (5) the runtime complexity of the balancer.
+ *
+ * An important property here is that each CPU is still (indirectly) connected
+ * to every other cpu in at most O(log n) steps:
+ *
+ * The adjacency matrix of the resulting graph is given by:
+ *
+ * log_2 n
+ * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
+ * k = 0
+ *
+ * And you'll find that:
+ *
+ * A^(log_2 n)_i,j != 0 for all i,j (7)
+ *
+ * Showing there's indeed a path between every cpu in at most O(log n) steps.
+ * The task movement gives a factor of O(m), giving a convergence complexity
+ * of:
+ *
+ * O(nm log n), n := nr_cpus, m := nr_tasks (8)
+ *
+ *
+ * WORK CONSERVING
+ *
+ * In order to avoid CPUs going idle while there's still work to do, new idle
+ * balancing is more aggressive and has the newly idle cpu iterate up the domain
+ * tree itself instead of relying on other CPUs to bring it work.
+ *
+ * This adds some complexity to both (5) and (8) but it reduces the total idle
+ * time.
+ *
+ * [XXX more?]
+ *
+ *
+ * CGROUPS
+ *
+ * Cgroups make a horror show out of (2), instead of a simple sum we get:
+ *
+ * s_k,i
+ * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
+ * S_k
+ *
+ * Where
+ *
+ * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
+ *
+ * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
+ *
+ * The big problem is S_k, its a global sum needed to compute a local (W_i)
+ * property.
+ *
+ * [XXX write more on how we solve this.. _after_ merging pjt's patches that
+ * rewrite all of this once again.]
+ */
static unsigned long __read_mostly max_load_balance_interval = HZ/10;
@@ -3300,52 +4065,58 @@ next:
/*
* update tg->load_weight by folding this cpu's load_avg
*/
-static int update_shares_cpu(struct task_group *tg, int cpu)
+static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
{
- struct cfs_rq *cfs_rq;
- unsigned long flags;
- struct rq *rq;
-
- if (!tg->se[cpu])
- return 0;
-
- rq = cpu_rq(cpu);
- cfs_rq = tg->cfs_rq[cpu];
-
- raw_spin_lock_irqsave(&rq->lock, flags);
-
- update_rq_clock(rq);
- update_cfs_load(cfs_rq, 1);
+ struct sched_entity *se = tg->se[cpu];
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
- /*
- * We need to update shares after updating tg->load_weight in
- * order to adjust the weight of groups with long running tasks.
- */
- update_cfs_shares(cfs_rq);
+ /* throttled entities do not contribute to load */
+ if (throttled_hierarchy(cfs_rq))
+ return;
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ update_cfs_rq_blocked_load(cfs_rq, 1);
- return 0;
+ if (se) {
+ update_entity_load_avg(se, 1);
+ /*
+ * We pivot on our runnable average having decayed to zero for
+ * list removal. This generally implies that all our children
+ * have also been removed (modulo rounding error or bandwidth
+ * control); however, such cases are rare and we can fix these
+ * at enqueue.
+ *
+ * TODO: fix up out-of-order children on enqueue.
+ */
+ if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
+ list_del_leaf_cfs_rq(cfs_rq);
+ } else {
+ struct rq *rq = rq_of(cfs_rq);
+ update_rq_runnable_avg(rq, rq->nr_running);
+ }
}
-static void update_shares(int cpu)
+static void update_blocked_averages(int cpu)
{
- struct cfs_rq *cfs_rq;
struct rq *rq = cpu_rq(cpu);
+ struct cfs_rq *cfs_rq;
+ unsigned long flags;
- rcu_read_lock();
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ update_rq_clock(rq);
/*
* Iterates the task_group tree in a bottom up fashion, see
* list_add_leaf_cfs_rq() for details.
*/
for_each_leaf_cfs_rq(rq, cfs_rq) {
- /* throttled entities do not contribute to load */
- if (throttled_hierarchy(cfs_rq))
- continue;
-
- update_shares_cpu(cfs_rq->tg, cpu);
+ /*
+ * Note: We may want to consider periodically releasing
+ * rq->lock about these updates so that creating many task
+ * groups does not result in continually extending hold time.
+ */
+ __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
}
- rcu_read_unlock();
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
}
/*
@@ -3397,7 +4168,7 @@ static unsigned long task_h_load(struct task_struct *p)
return load;
}
#else
-static inline void update_shares(int cpu)
+static inline void update_blocked_averages(int cpu)
{
}
@@ -4457,12 +5228,14 @@ void idle_balance(int this_cpu, struct rq *this_rq)
if (this_rq->avg_idle < sysctl_sched_migration_cost)
return;
+ update_rq_runnable_avg(this_rq, 1);
+
/*
* Drop the rq->lock, but keep IRQ/preempt disabled.
*/
raw_spin_unlock(&this_rq->lock);
- update_shares(this_cpu);
+ update_blocked_averages(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
unsigned long interval;
@@ -4717,7 +5490,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
int update_next_balance = 0;
int need_serialize;
- update_shares(cpu);
+ update_blocked_averages(cpu);
rcu_read_lock();
for_each_domain(cpu, sd) {
@@ -4954,6 +5727,11 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
cfs_rq = cfs_rq_of(se);
entity_tick(cfs_rq, se, queued);
}
+
+ if (sched_feat_numa(NUMA))
+ task_tick_numa(rq, curr);
+
+ update_rq_runnable_avg(rq, 1);
}
/*
@@ -5046,6 +5824,20 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
place_entity(cfs_rq, se, 0);
se->vruntime -= cfs_rq->min_vruntime;
}
+
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+ /*
+ * Remove our load from contribution when we leave sched_fair
+ * and ensure we don't carry in an old decay_count if we
+ * switch back.
+ */
+ if (p->se.avg.decay_count) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
+ __synchronize_entity_decay(&p->se);
+ subtract_blocked_load_contrib(cfs_rq,
+ p->se.avg.load_avg_contrib);
+ }
+#endif
}
/*
@@ -5092,11 +5884,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
#ifndef CONFIG_64BIT
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+ atomic64_set(&cfs_rq->decay_counter, 1);
+ atomic64_set(&cfs_rq->removed_load, 0);
+#endif
}
#ifdef CONFIG_FAIR_GROUP_SCHED
static void task_move_group_fair(struct task_struct *p, int on_rq)
{
+ struct cfs_rq *cfs_rq;
/*
* If the task was not on the rq at the time of this cgroup movement
* it must have been asleep, sleeping tasks keep their ->vruntime
@@ -5128,8 +5925,19 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
if (!on_rq)
p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
set_task_rq(p, task_cpu(p));
- if (!on_rq)
- p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
+ if (!on_rq) {
+ cfs_rq = cfs_rq_of(&p->se);
+ p->se.vruntime += cfs_rq->min_vruntime;
+#ifdef CONFIG_SMP
+ /*
+ * migrate_task_rq_fair() will have removed our previous
+ * contribution, but we must synchronize for ongoing future
+ * decay.
+ */
+ p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+ cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
+#endif
+ }
}
void free_fair_sched_group(struct task_group *tg)
@@ -5214,10 +6022,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
cfs_rq->tg = tg;
cfs_rq->rq = rq;
-#ifdef CONFIG_SMP
- /* allow initial update_cfs_load() to truncate */
- cfs_rq->load_stamp = 1;
-#endif
init_cfs_rq_runtime(cfs_rq);
tg->cfs_rq[cpu] = cfs_rq;
@@ -5319,7 +6123,9 @@ const struct sched_class fair_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_fair,
-
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ .migrate_task_rq = migrate_task_rq_fair,
+#endif
.rq_online = rq_online_fair,
.rq_offline = rq_offline_fair,