author     Viresh Kumar <viresh.kumar@linaro.org>  2013-04-02 21:59:48 +0530
committer  Viresh Kumar <viresh.kumar@linaro.org>  2013-04-02 21:59:48 +0530
commit     e801e36a4d4ffccbcbb63f2ece35d13cad4f280b (patch)
tree       31676a773066bf680e58fa8d035ca88f3b64aafb
parent     089165092916ec98d675e078dc5782da5d29f181 (diff)
parent     63a67f13217cbb76305b79804e73e5c8b8315370 (diff)
Merge branch 'big-LITTLE-MP-upstream-v2' into linux-linaro-MP-upstream-3.9-rc3 (big-LITTLE-MP-upstream-v1)
-rw-r--r--  arch/arm/include/asm/topology.h     61
-rw-r--r--  arch/arm/kernel/topology.c           9
-rw-r--r--  arch/ia64/include/asm/topology.h     1
-rw-r--r--  arch/tile/include/asm/topology.h     1
-rw-r--r--  include/linux/sched.h               16
-rw-r--r--  include/linux/topology.h             4
-rw-r--r--  kernel/sched/core.c                129
-rw-r--r--  kernel/sched/fair.c                312
-rw-r--r--  kernel/sched/features.h              2
-rw-r--r--  kernel/sched/idle_task.c             7
-rw-r--r--  kernel/sched/sched.h                43
-rw-r--r--  linaro/configs/big-LITTLE-MP.conf    5
12 files changed, 509 insertions(+), 81 deletions(-)
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 58b8b84adcd..d394578f7a8 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -34,6 +34,67 @@ static inline void store_cpu_topology(unsigned int cpuid) { }
#endif
+#ifdef CONFIG_SCHED_MC
+/* Common values for MC siblings. For now, mostly derived from SD_CPU_INIT */
+#ifndef SD_MC_INIT
+#define SD_MC_INIT (struct sched_domain) { \
+ .min_interval = 1, \
+ .max_interval = 4, \
+ .busy_factor = 16, \
+ .imbalance_pct = 125, \
+ .cache_nice_tries = 1, \
+ .busy_idx = 2, \
+ .wake_idx = 0, \
+ .forkexec_idx = 0, \
+ \
+ .flags = 1*SD_LOAD_BALANCE \
+ | 1*SD_BALANCE_NEWIDLE \
+ | 1*SD_BALANCE_EXEC \
+ | 1*SD_BALANCE_FORK \
+ | 0*SD_BALANCE_WAKE \
+ | 1*SD_WAKE_AFFINE \
+ | 0*SD_SHARE_CPUPOWER \
+ | 1*SD_SHARE_PKG_RESOURCES \
+ | arch_sd_local_flags(SD_SHARE_PKG_RESOURCES)\
+ | 0*SD_SERIALIZE \
+ , \
+ .last_balance = jiffies, \
+ .balance_interval = 1, \
+}
+#endif
+#endif /* CONFIG_SCHED_MC */
+
+/* Common values for CPUs */
+#ifndef SD_CPU_INIT
+#define SD_CPU_INIT (struct sched_domain) { \
+ .min_interval = 1, \
+ .max_interval = 4, \
+ .busy_factor = 16, \
+ .imbalance_pct = 125, \
+ .cache_nice_tries = 1, \
+ .busy_idx = 2, \
+ .idle_idx = 1, \
+ .newidle_idx = 0, \
+ .wake_idx = 0, \
+ .forkexec_idx = 0, \
+ \
+ .flags = 1*SD_LOAD_BALANCE \
+ | 1*SD_BALANCE_NEWIDLE \
+ | 1*SD_BALANCE_EXEC \
+ | 1*SD_BALANCE_FORK \
+ | 0*SD_BALANCE_WAKE \
+ | 1*SD_WAKE_AFFINE \
+ | 0*SD_SHARE_CPUPOWER \
+ | 0*SD_SHARE_PKG_RESOURCES \
+ | arch_sd_local_flags(0) \
+ | 0*SD_SERIALIZE \
+ | 1*SD_PREFER_SIBLING \
+ , \
+ .last_balance = jiffies, \
+ .balance_interval = 1, \
+}
+#endif
+
#include <asm-generic/topology.h>
#endif /* _ASM_ARM_TOPOLOGY_H */
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 79282ebcd93..f89a4a2b6ed 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -201,6 +201,15 @@ static inline void update_cpu_power(unsigned int cpuid, unsigned int mpidr) {}
*/
struct cputopo_arm cpu_topology[NR_CPUS];
+int arch_sd_local_flags(int level)
+{
+ /* Power gating at the threading level doesn't make sense */
+ if (level & SD_SHARE_CPUPOWER)
+ return 1*SD_SHARE_POWERDOMAIN;
+
+ return 0*SD_SHARE_POWERDOMAIN;
+}
+
const struct cpumask *cpu_coregroup_mask(int cpu)
{
return &cpu_topology[cpu].core_sibling;
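For readers tracing how this hook feeds the packing logic added later in kernel/sched/fair.c, the following stand-alone sketch (illustrative only, not part of the patch; the flag values are copied from the include/linux/sched.h hunk below) shows what the ARM override returns for the two levels it is called with:

#include <stdio.h>

/* Flag values as listed in the include/linux/sched.h hunk below. */
#define SD_SHARE_CPUPOWER	0x0080
#define SD_SHARE_POWERDOMAIN	0x0100
#define SD_SHARE_PKG_RESOURCES	0x0200

/* Same test as the ARM arch_sd_local_flags() above: only a level whose
 * members share CPU power (i.e. SMT siblings) also shares a power domain. */
static int arch_sd_local_flags(int level)
{
	if (level & SD_SHARE_CPUPOWER)
		return SD_SHARE_POWERDOMAIN;
	return 0;
}

int main(void)
{
	/* SD_MC_INIT passes SD_SHARE_PKG_RESOURCES: cores inside a cluster do
	 * not share a power domain, so the packing code may separate them. */
	printf("MC level:  %#x\n", arch_sd_local_flags(SD_SHARE_PKG_RESOURCES));
	/* SD_CPU_INIT passes 0: clusters do not share a power domain either. */
	printf("CPU level: %#x\n", arch_sd_local_flags(0));
	return 0;
}

The __weak default added in kernel/sched/core.c returns SD_SHARE_POWERDOMAIN for every level, so on architectures that do not override the hook update_packing_domain() finds no buddy and packing stays disabled.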
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index a2496e449b7..6d0b61741c1 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -65,6 +65,7 @@ void build_cpu_to_node_map(void);
| SD_BALANCE_EXEC \
| SD_BALANCE_FORK \
-| SD_WAKE_AFFINE, \
+| SD_WAKE_AFFINE \
+| arch_sd_local_flags(0), \
.last_balance = jiffies, \
.balance_interval = 1, \
.nr_balance_failed = 0, \
diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h
index d5e86c9f74f..adc871020d4 100644
--- a/arch/tile/include/asm/topology.h
+++ b/arch/tile/include/asm/topology.h
@@ -71,6 +71,7 @@ static inline const struct cpumask *cpumask_of_node(int node)
| 0*SD_WAKE_AFFINE \
| 0*SD_SHARE_CPUPOWER \
| 0*SD_SHARE_PKG_RESOURCES \
+ | arch_sd_local_flags(0) \
| 0*SD_SERIALIZE \
, \
.last_balance = jiffies, \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4afded2be58..43d10ee10cb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -809,6 +809,7 @@ enum cpu_idle_type {
#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */
#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */
+#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */
#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */
@@ -959,6 +960,12 @@ struct sched_domain {
unsigned long span[0];
};
+struct sched_domain_rq {
+ struct sched_domain *sd;
+ unsigned long flags;
+ struct rcu_head rcu; /* used during destruction */
+};
+
static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
{
return to_cpumask(sd->span);
@@ -1034,6 +1041,7 @@ struct sched_domain;
#else
#define ENQUEUE_WAKING 0
#endif
+#define ENQUEUE_NEWTASK 8
#define DEQUEUE_SLEEP 1
@@ -1160,13 +1168,7 @@ struct sched_entity {
struct cfs_rq *my_q;
#endif
-/*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
- * removed when useful for applications beyond shares distribution (e.g.
- * load-balance).
- */
-#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
- /* Per-entity load-tracking */
+#ifdef CONFIG_SMP
struct sched_avg avg;
#endif
};
diff --git a/include/linux/topology.h b/include/linux/topology.h
index d3cf0d6e771..3eab2933c8e 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -99,6 +99,8 @@ int arch_update_cpu_topology(void);
| 1*SD_WAKE_AFFINE \
| 1*SD_SHARE_CPUPOWER \
| 1*SD_SHARE_PKG_RESOURCES \
+ | arch_sd_local_flags(SD_SHARE_CPUPOWER|\
+ SD_SHARE_PKG_RESOURCES) \
| 0*SD_SERIALIZE \
| 0*SD_PREFER_SIBLING \
| arch_sd_sibling_asym_packing() \
@@ -131,6 +133,7 @@ int arch_update_cpu_topology(void);
| 1*SD_WAKE_AFFINE \
| 0*SD_SHARE_CPUPOWER \
| 1*SD_SHARE_PKG_RESOURCES \
+ | arch_sd_local_flags(SD_SHARE_PKG_RESOURCES)\
| 0*SD_SERIALIZE \
, \
.last_balance = jiffies, \
@@ -161,6 +164,7 @@ int arch_update_cpu_topology(void);
| 1*SD_WAKE_AFFINE \
| 0*SD_SHARE_CPUPOWER \
| 0*SD_SHARE_PKG_RESOURCES \
+ | arch_sd_local_flags(0) \
| 0*SD_SERIALIZE \
| 1*SD_PREFER_SIBLING \
, \
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 870c3242855..06933795dc2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1561,14 +1561,11 @@ static void __sched_fork(struct task_struct *p)
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);
-/*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
- * removed when useful for applications beyond shares distribution (e.g.
- * load-balance).
- */
-#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+#ifdef CONFIG_SMP
p->se.avg.runnable_avg_period = 0;
p->se.avg.runnable_avg_sum = 0;
+ p->se.avg.decay_count = 0;
+ p->se.avg.load_avg_contrib = 0;
#endif
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -1714,7 +1711,7 @@ void wake_up_new_task(struct task_struct *p)
#endif
rq = __task_rq_lock(p);
- activate_task(rq, p, 0);
+ activate_task(rq, p, ENQUEUE_NEWTASK);
p->on_rq = 1;
trace_sched_wakeup_new(p, true);
check_preempt_curr(rq, p, WF_FORK);
@@ -2533,7 +2530,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
void update_idle_cpu_load(struct rq *this_rq)
{
unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
- unsigned long load = this_rq->load.weight;
+ unsigned long load = (unsigned long)this_rq->cfs.runnable_load_avg;
unsigned long pending_updates;
/*
@@ -2583,7 +2580,7 @@ static void update_cpu_load_active(struct rq *this_rq)
* See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
*/
this_rq->last_load_update_tick = jiffies;
- __update_cpu_load(this_rq, this_rq->load.weight, 1);
+ __update_cpu_load(this_rq, this_rq->cfs.runnable_load_avg, 1);
calc_load_account_active(this_rq);
}
@@ -2689,8 +2686,8 @@ void scheduler_tick(void)
raw_spin_lock(&rq->lock);
update_rq_clock(rq);
- update_cpu_load_active(rq);
curr->sched_class->task_tick(rq, curr, 0);
+ update_cpu_load_active(rq);
raw_spin_unlock(&rq->lock);
perf_event_task_tick();
@@ -5602,6 +5599,15 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
destroy_sched_domain(sd, cpu);
}
+static void destroy_sched_domain_rq(struct sched_domain_rq *sd_rq, int cpu)
+{
+ if (!sd_rq)
+ return;
+
+ destroy_sched_domains(sd_rq->sd, cpu);
+ kfree_rcu(sd_rq, rcu);
+}
+
/*
* Keep a special pointer to the highest sched_domain that has
* SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
@@ -5632,10 +5638,23 @@ static void update_top_cache_domain(int cpu)
* hold the hotplug lock.
*/
static void
-cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
+cpu_attach_domain(struct sched_domain_rq *sd_rq, struct root_domain *rd,
+ int cpu)
{
struct rq *rq = cpu_rq(cpu);
- struct sched_domain *tmp;
+ struct sched_domain_rq *tmp_rq;
+ struct sched_domain *tmp, *sd = NULL;
+
+ /*
+ * If we don't have any sched_domain and associated object, we can
+ * jump directly to the attach sequence; otherwise we try to degenerate
+ * the sched_domain.
+ */
+ if (!sd_rq)
+ goto attach;
+
+ /* Get a pointer to the 1st sched_domain */
+ sd = sd_rq->sd;
/* Remove the sched domains which do not contribute to scheduling. */
for (tmp = sd; tmp; ) {
@@ -5658,15 +5677,19 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
destroy_sched_domain(tmp, cpu);
if (sd)
sd->child = NULL;
+ /* update sched_domain_rq */
+ sd_rq->sd = sd;
}
+attach:
sched_domain_debug(sd, cpu);
rq_attach_root(rq, rd);
- tmp = rq->sd;
- rcu_assign_pointer(rq->sd, sd);
- destroy_sched_domains(tmp, cpu);
+ tmp_rq = rq->sd_rq;
+ rcu_assign_pointer(rq->sd_rq, sd_rq);
+ destroy_sched_domain_rq(tmp_rq, cpu);
+ update_packing_domain(cpu);
update_top_cache_domain(cpu);
}
@@ -5695,12 +5718,14 @@ struct sd_data {
};
struct s_data {
+ struct sched_domain_rq ** __percpu sd_rq;
struct sched_domain ** __percpu sd;
struct root_domain *rd;
};
enum s_alloc {
sa_rootdomain,
+ sa_sd_rq,
sa_sd,
sa_sd_storage,
sa_none,
@@ -5935,7 +5960,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
return;
update_group_power(sd, cpu);
- atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
+ atomic_set(&sg->sgp->nr_busy_cpus, 0);
}
int __weak arch_sd_sibling_asym_packing(void)
@@ -5943,6 +5968,11 @@ int __weak arch_sd_sibling_asym_packing(void)
return 0*SD_ASYM_PACKING;
}
+int __weak arch_sd_local_flags(int level)
+{
+ return 1*SD_SHARE_POWERDOMAIN;
+}
+
/*
* Initializers for schedule domains
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
@@ -6011,6 +6041,8 @@ static void set_domain_attribute(struct sched_domain *sd,
static void __sdt_free(const struct cpumask *cpu_map);
static int __sdt_alloc(const struct cpumask *cpu_map);
+static void __sdrq_free(const struct cpumask *cpu_map, struct s_data *d);
+static int __sdrq_alloc(const struct cpumask *cpu_map, struct s_data *d);
static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
const struct cpumask *cpu_map)
@@ -6019,6 +6051,9 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
case sa_rootdomain:
if (!atomic_read(&d->rd->refcount))
free_rootdomain(&d->rd->rcu); /* fall through */
+ case sa_sd_rq:
+ __sdrq_free(cpu_map, d); /* fall through */
+ free_percpu(d->sd_rq); /* fall through */
case sa_sd:
free_percpu(d->sd); /* fall through */
case sa_sd_storage:
@@ -6038,9 +6073,14 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
d->sd = alloc_percpu(struct sched_domain *);
if (!d->sd)
return sa_sd_storage;
+ d->sd_rq = alloc_percpu(struct sched_domain_rq *);
+ if (!d->sd_rq)
+ return sa_sd;
+ if (__sdrq_alloc(cpu_map, d))
+ return sa_sd_rq;
d->rd = alloc_rootdomain();
if (!d->rd)
- return sa_sd;
+ return sa_sd_rq;
return sa_rootdomain;
}
@@ -6132,6 +6172,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
| 0*SD_WAKE_AFFINE
| 0*SD_SHARE_CPUPOWER
| 0*SD_SHARE_PKG_RESOURCES
+ | 1*SD_SHARE_POWERDOMAIN
| 1*SD_SERIALIZE
| 0*SD_PREFER_SIBLING
| sd_local_flags(level)
@@ -6466,6 +6507,46 @@ static void __sdt_free(const struct cpumask *cpu_map)
}
}
+static int __sdrq_alloc(const struct cpumask *cpu_map, struct s_data *d)
+{
+ int j;
+
+ for_each_cpu(j, cpu_map) {
+ struct sched_domain_rq *sd_rq;
+
+ sd_rq = kzalloc_node(sizeof(struct sched_domain_rq),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sd_rq)
+ return -ENOMEM;
+
+ *per_cpu_ptr(d->sd_rq, j) = sd_rq;
+ }
+
+ return 0;
+}
+
+static void __sdrq_free(const struct cpumask *cpu_map, struct s_data *d)
+{
+ int j;
+
+ for_each_cpu(j, cpu_map)
+ if (*per_cpu_ptr(d->sd_rq, j))
+ kfree(*per_cpu_ptr(d->sd_rq, j));
+}
+
+static void build_sched_domain_rq(struct s_data *d, int cpu)
+{
+ struct sched_domain_rq *sd_rq;
+ struct sched_domain *sd;
+
+ /* Attach sched_domain to sched_domain_rq */
+ sd = *per_cpu_ptr(d->sd, cpu);
+ sd_rq = *per_cpu_ptr(d->sd_rq, cpu);
+ sd_rq->sd = sd;
+ /* Init flags */
+ set_bit(NOHZ_IDLE, sched_rq_flags(sd_rq));
+}
+
struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
struct s_data *d, const struct cpumask *cpu_map,
struct sched_domain_attr *attr, struct sched_domain *child,
@@ -6495,6 +6576,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,
struct sched_domain_attr *attr)
{
enum s_alloc alloc_state = sa_none;
+ struct sched_domain_rq *sd_rq;
struct sched_domain *sd;
struct s_data d;
int i, ret = -ENOMEM;
@@ -6547,11 +6629,18 @@ static int build_sched_domains(const struct cpumask *cpu_map,
}
}
+ /* Init objects that must follow the sched_domain lifecycle */
+ for_each_cpu(i, cpu_map) {
+ build_sched_domain_rq(&d, i);
+ }
+
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
- sd = *per_cpu_ptr(d.sd, i);
- cpu_attach_domain(sd, d.rd, i);
+ sd_rq = *per_cpu_ptr(d.sd_rq, i);
+ cpu_attach_domain(sd_rq, d.rd, i);
+ /* claim allocation of sched_domain_rq object */
+ *per_cpu_ptr(d.sd_rq, i) = NULL;
}
rcu_read_unlock();
@@ -6982,7 +7071,7 @@ void __init sched_init(void)
rq->last_load_update_tick = jiffies;
#ifdef CONFIG_SMP
- rq->sd = NULL;
+ rq->sd_rq = NULL;
rq->rd = NULL;
rq->cpu_power = SCHED_POWER_SCALE;
rq->post_schedule = 0;
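As a rough illustration of why the NOHZ_IDLE state now travels with the domain pointer, here is a minimal user-space sketch (simplified types, no RCU, invented names; not kernel code) of the attach sequence above: a freshly built tree starts with nr_busy_cpus at 0 and NOHZ_IDLE set, and the previous wrapper is freed together with its flag state, so the two cannot drift apart across a rebuild.

#include <stdio.h>
#include <stdlib.h>

/* Heavily simplified stand-ins for the kernel structures. */
struct sched_group  { int nr_busy_cpus; };
struct sched_domain { struct sched_group *groups; };

struct sched_domain_rq {
	struct sched_domain *sd;	/* top of the domain tree */
	unsigned long flags;		/* bit 0 plays the role of NOHZ_IDLE */
};

#define NOHZ_IDLE (1UL << 0)

static struct sched_domain_rq *rq_sd_rq;	/* models rq->sd_rq for one CPU */

/* Models build_sched_domain_rq() plus init_sched_groups_power(): the new
 * tree starts from a consistent state, no busy CPUs and NOHZ_IDLE set. */
static struct sched_domain_rq *build(struct sched_domain *sd)
{
	struct sched_domain_rq *sd_rq = calloc(1, sizeof(*sd_rq));

	sd->groups->nr_busy_cpus = 0;
	sd_rq->sd = sd;
	sd_rq->flags |= NOHZ_IDLE;
	return sd_rq;
}

/* Models cpu_attach_domain(): swap the wrapper and drop the old one
 * (rcu_assign_pointer() and kfree_rcu() in the kernel). */
static void attach(struct sched_domain_rq *sd_rq)
{
	struct sched_domain_rq *old = rq_sd_rq;

	rq_sd_rq = sd_rq;
	free(old);
}

int main(void)
{
	struct sched_group g = { .nr_busy_cpus = 3 };	/* stale count */
	struct sched_domain d = { .groups = &g };

	attach(build(&d));
	printf("nr_busy_cpus=%d idle=%lu\n",
	       rq_sd_rq->sd->groups->nr_busy_cpus,
	       rq_sd_rq->flags & NOHZ_IDLE);
	free(rq_sd_rq);
	return 0;
}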
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a33e5986fc..d66fa87ebdb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -160,6 +160,76 @@ void sched_init_granularity(void)
update_sysctl();
}
+
+#ifdef CONFIG_SMP
+/*
+ * Save the id of the optimal CPU that should be used to pack small tasks.
+ * The value -1 is used when no buddy has been found.
+ */
+DEFINE_PER_CPU(int, sd_pack_buddy);
+
+/*
+ * Look for the best buddy CPU that can be used to pack small tasks.
+ * We make the assumption that it isn't worth packing on CPUs that share
+ * the same power line. We look for the 1st sched_domain without the
+ * SD_SHARE_POWERDOMAIN flag. Then we look for the sched_group with the
+ * lowest power per core, based on the assumption that its power
+ * efficiency is better.
+ */
+void update_packing_domain(int cpu)
+{
+ struct sched_domain *sd;
+ int id = -1;
+
+ sd = highest_flag_domain(cpu, SD_SHARE_POWERDOMAIN);
+ if (!sd)
+ sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
+ else
+ sd = sd->parent;
+
+ while (sd && (sd->flags & SD_LOAD_BALANCE)
+ && !(sd->flags & SD_SHARE_POWERDOMAIN)) {
+ struct sched_group *sg = sd->groups;
+ struct sched_group *pack = sg;
+ struct sched_group *tmp;
+
+ /*
+ * The sched_domain of a CPU points to the local sched_group
+ * and the 1st CPU of this local group is a good candidate.
+ */
+ id = cpumask_first(sched_group_cpus(pack));
+
+ /* Loop over the sched groups to find the best one */
+ for (tmp = sg->next; tmp != sg; tmp = tmp->next) {
+ if (tmp->sgp->power * pack->group_weight >
+ pack->sgp->power * tmp->group_weight)
+ continue;
+
+ if ((tmp->sgp->power * pack->group_weight ==
+ pack->sgp->power * tmp->group_weight)
+ && (cpumask_first(sched_group_cpus(tmp)) >= id))
+ continue;
+
+ /* we have found a better group */
+ pack = tmp;
+
+ /* Take the 1st CPU of the new group */
+ id = cpumask_first(sched_group_cpus(pack));
+ }
+
+ /* Stop once we have found a CPU other than this one */
+ if (id != cpu)
+ break;
+
+ sd = sd->parent;
+ }
+
+ pr_debug("CPU%d packing on CPU%d\n", cpu, id);
+ per_cpu(sd_pack_buddy, cpu) = id;
+}
+
+#endif /* CONFIG_SMP */
+
#if BITS_PER_LONG == 32
# define WMULT_CONST (~0UL)
#else
@@ -1109,8 +1179,7 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
-/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
-#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+#ifdef CONFIG_SMP
/*
* We choose a half-life close to 1 scheduling period.
* Note: The tables below are dependent on this value.
@@ -1503,8 +1572,9 @@ static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
/* Add the load generated by se into cfs_rq's child load-average */
static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
struct sched_entity *se,
- int wakeup)
+ int flags)
{
+ int wakeup = flags & ENQUEUE_WAKEUP;
/*
* We track migrations using entity decay_count <= 0, on a wake-up
* migration we use a negative decay count to track the remote decays
@@ -1538,6 +1608,12 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
update_entity_load_avg(se, 0);
}
+ /*
+ * Set the initial load avg of a new task to its full load
+ * in order to avoid a burst of forks making a few CPUs too heavy.
+ */
+ if (flags & ENQUEUE_NEWTASK)
+ se->avg.load_avg_contrib = se->load.weight;
cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
/* we force update consideration on load-balancer moves */
update_cfs_rq_blocked_load(cfs_rq, !wakeup);
@@ -1562,6 +1638,16 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
} /* migrations, e.g. sleep=0 leave decay_count == 0 */
}
+
+/*
+ * Update the rq's load with the elapsed idle time before a task is
+ * scheduled. If the newly scheduled task is not a CFS task, idle_exit will
+ * be the only way to update the runnable statistics.
+ */
+void idle_exit(int this_cpu, struct rq *this_rq)
+{
+ update_rq_runnable_avg(this_rq, 0);
+}
#else
static inline void update_entity_load_avg(struct sched_entity *se,
int update_cfs_rq) {}
@@ -1699,7 +1785,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
- enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
+ enqueue_entity_load_avg(cfs_rq, se, flags);
account_entity_enqueue(cfs_rq, se);
update_cfs_shares(cfs_rq);
@@ -2897,7 +2983,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
/* Used instead of source_load when we know the type == 0 */
static unsigned long weighted_cpuload(const int cpu)
{
- return cpu_rq(cpu)->load.weight;
+ return (unsigned long)cpu_rq(cpu)->cfs.runnable_load_avg;
}
/*
@@ -2944,7 +3030,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
if (nr_running)
- return rq->load.weight / nr_running;
+ return (unsigned long)rq->cfs.runnable_load_avg / nr_running;
return 0;
}
@@ -2973,7 +3059,8 @@ static void task_waking_fair(struct task_struct *p)
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
- * effective_load() calculates the load change as seen from the root_task_group
+ * effective_load() calculates the runnable load average change as seen from
+ * the root_task_group
*
* Adding load to a group doesn't make a group heavier, but can cause movement
* of group shares between cpus. Assuming the shares were perfectly aligned one
@@ -3021,6 +3108,9 @@ static void task_waking_fair(struct task_struct *p)
* Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
* times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
* 4/7) times the weight of the group.
+ *
+ * After computing the effective_load of the load being moved, it is scaled
+ * by the sched entity's runnable average.
*/
static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
@@ -3095,6 +3185,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
struct task_group *tg;
unsigned long weight;
int balanced;
+ int runnable_avg;
idx = sd->wake_idx;
this_cpu = smp_processor_id();
@@ -3110,13 +3201,19 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
if (sync) {
tg = task_group(current);
weight = current->se.load.weight;
+ runnable_avg = current->se.avg.runnable_avg_sum * NICE_0_LOAD
+ / (current->se.avg.runnable_avg_period + 1);
- this_load += effective_load(tg, this_cpu, -weight, -weight);
- load += effective_load(tg, prev_cpu, 0, -weight);
+ this_load += effective_load(tg, this_cpu, -weight, -weight)
+ * runnable_avg >> NICE_0_SHIFT;
+ load += effective_load(tg, prev_cpu, 0, -weight)
+ * runnable_avg >> NICE_0_SHIFT;
}
tg = task_group(p);
weight = p->se.load.weight;
+ runnable_avg = p->se.avg.runnable_avg_sum * NICE_0_LOAD
+ / (p->se.avg.runnable_avg_period + 1);
/*
* In low-load situations, where prev_cpu is idle and this_cpu is idle
@@ -3128,16 +3225,18 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
* task to be woken on this_cpu.
*/
if (this_load > 0) {
- s64 this_eff_load, prev_eff_load;
+ s64 this_eff_load, prev_eff_load, tmp_eff_load;
this_eff_load = 100;
this_eff_load *= power_of(prev_cpu);
- this_eff_load *= this_load +
- effective_load(tg, this_cpu, weight, weight);
+ tmp_eff_load = effective_load(tg, this_cpu, weight, weight)
+ * runnable_avg >> NICE_0_SHIFT;
+ this_eff_load *= this_load + tmp_eff_load;
prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
prev_eff_load *= power_of(this_cpu);
- prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
+ prev_eff_load *= load + (effective_load(tg, prev_cpu, 0, weight)
+ * runnable_avg >> NICE_0_SHIFT);
balanced = this_eff_load <= prev_eff_load;
} else
@@ -3292,6 +3391,50 @@ done:
return target;
}
+static bool is_buddy_busy(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ u32 sum = rq->avg.runnable_avg_sum;
+ u32 period = rq->avg.runnable_avg_period;
+
+ sum = min(sum, period);
+
+ /*
+ * A busy buddy is a CPU with a high load or a small load with a lot of
+ * running tasks.
+ */
+ return (sum > (period / (rq->nr_running + 2)));
+}
+
+static bool is_light_task(struct task_struct *p)
+{
+ /* A light task runs less than 20% of the time on average */
+ return ((p->se.avg.runnable_avg_sum * 5) <
+ (p->se.avg.runnable_avg_period));
+}
+
+static int check_pack_buddy(int cpu, struct task_struct *p)
+{
+ int buddy = per_cpu(sd_pack_buddy, cpu);
+
+ /* No pack buddy for this CPU */
+ if (buddy == -1)
+ return false;
+
+ /* buddy is not an allowed CPU */
+ if (!cpumask_test_cpu(buddy, tsk_cpus_allowed(p)))
+ return false;
+
+ /*
+ * If the task is a small one and the buddy is not overloaded,
+ * we use the buddy CPU.
+ */
+ if (!is_light_task(p) || is_buddy_busy(buddy))
+ return false;
+
+ return true;
+}
+
/*
* sched_balance_self: balance the current task (running on cpu) in domains
* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
@@ -3320,6 +3463,10 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
want_affine = 1;
new_cpu = prev_cpu;
+
+ /* We pack only at wake-up, not for a new task */
+ if (check_pack_buddy(new_cpu, p))
+ return per_cpu(sd_pack_buddy, new_cpu);
}
rcu_read_lock();
@@ -3394,12 +3541,6 @@ unlock:
}
/*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
- * removed when useful for applications beyond shares distribution (e.g.
- * load-balance).
- */
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/*
* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
* cfs_rq_of(p) references at time of call are still valid and identify the
* previous cpu. However, the caller only guarantees p->pi_lock is held; no
@@ -3422,7 +3563,6 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
}
}
-#endif
#endif /* CONFIG_SMP */
static unsigned long
@@ -3970,6 +4110,15 @@ static unsigned long task_h_load(struct task_struct *p);
static const unsigned int sched_nr_migrate_break = 32;
+static unsigned long task_h_load_avg(struct task_struct *p)
+{
+ u32 period = p->se.avg.runnable_avg_period;
+ if (!period)
+ return 0;
+
+ return task_h_load(p) * p->se.avg.runnable_avg_sum / period;
+}
+
/*
* move_tasks tries to move up to imbalance weighted load from busiest to
* this_rq, as part of a balancing operation within domain "sd".
@@ -4005,12 +4154,13 @@ static int move_tasks(struct lb_env *env)
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
goto next;
- load = task_h_load(p);
+ load = task_h_load_avg(p);
- if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
+ if (sched_feat(LB_MIN) && load < 204 && !env->sd->nr_balance_failed)
goto next;
- if ((load / 2) > env->imbalance)
+ if ((load / 2) > env->imbalance &&
+ (env->idle != CPU_IDLE && env->idle != CPU_NEWLY_IDLE))
goto next;
if (!can_migrate_task(p, env))
@@ -4415,7 +4565,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
{
unsigned long nr_running, max_nr_running, min_nr_running;
unsigned long load, max_cpu_load, min_cpu_load;
- unsigned int balance_cpu = -1, first_idle_cpu = 0;
+ unsigned int balance_cpu = -1, first_idle_cpu = 0, overloaded_cpu = 0;
unsigned long avg_load_per_task = 0;
int i;
@@ -4453,6 +4603,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
max_nr_running = nr_running;
if (min_nr_running > nr_running)
min_nr_running = nr_running;
+
+ if ((load > rq->cpu_power)
+ && ((rq->cpu_power*env->sd->imbalance_pct) < (env->dst_rq->cpu_power*100))
+ && (load > target_load(env->dst_cpu, load_idx)))
+ overloaded_cpu = 1;
}
sgs->group_load += load;
@@ -4498,6 +4653,22 @@ static inline void update_sg_lb_stats(struct lb_env *env,
(max_nr_running - min_nr_running) > 1)
sgs->group_imb = 1;
+ /*
+ * If the load contribution of a CPU exceeds its capacity, we should
+ * try to find a better CPU with more capacity.
+ */
+ if (overloaded_cpu)
+ sgs->group_imb = 1;
+
+ /*
+ * When idle balancing, pull tasks if there is more than one task
+ * per CPU in the group.
+ */
+ if (env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) {
+ if (group->group_weight < sgs->sum_nr_running)
+ sgs->group_imb = 1;
+ }
+
sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
SCHED_POWER_SCALE);
if (!sgs->group_capacity)
@@ -4725,8 +4896,13 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
min(sds->this_load_per_task, sds->this_load + tmp);
pwr_move /= SCHED_POWER_SCALE;
- /* Move if we gain throughput */
- if (pwr_move > pwr_now)
+ /*
+ * Move if we gain throughput, or if we have CPUs idling while others
+ * are running more than one task.
+ */
+ if ((pwr_move > pwr_now) ||
+ (sds->busiest_group_weight < sds->busiest_nr_running &&
+ (env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE)))
env->imbalance = sds->busiest_load_per_task;
}
@@ -4911,6 +5087,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
struct sched_group *group)
{
struct rq *busiest = NULL, *rq;
+ struct rq *overloaded = NULL, *dst_rq = cpu_rq(env->dst_cpu);
unsigned long max_load = 0;
int i;
@@ -4930,6 +5107,17 @@ static struct rq *find_busiest_queue(struct lb_env *env,
wl = weighted_cpuload(i);
/*
+ * If the task requires more power than the current CPU
+ * capacity and the dst_cpu has more capacity, keep the
+ * dst_cpu in mind
+ */
+ if ((rq->nr_running == 1)
+ && (rq->cfs.runnable_load_avg > rq->cpu_power)
+ && (rq->cfs.runnable_load_avg > dst_rq->cfs.runnable_load_avg)
+ && ((rq->cpu_power*env->sd->imbalance_pct) < (dst_rq->cpu_power*100)))
+ overloaded = rq;
+
+ /*
* When comparing with imbalance, use weighted_cpuload()
* which is not scaled with the cpu power.
*/
@@ -4950,6 +5138,9 @@ static struct rq *find_busiest_queue(struct lb_env *env,
}
}
+ if (!busiest)
+ busiest = overloaded;
+
return busiest;
}
@@ -4977,6 +5168,9 @@ static int need_active_balance(struct lb_env *env)
return 1;
}
+ if ((power_of(env->src_cpu)*sd->imbalance_pct) < (power_of(env->dst_cpu)*100))
+ return 1;
+
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}
@@ -5035,6 +5229,10 @@ redo:
ld_moved = 0;
lb_iterations = 1;
+
+ env.src_cpu = busiest->cpu;
+ env.src_rq = busiest;
+
if (busiest->nr_running > 1) {
/*
* Attempt to move tasks. If find_busiest_group has found
@@ -5043,8 +5241,6 @@ redo:
* correctly treated as an imbalance.
*/
env.flags |= LBF_ALL_PINNED;
- env.src_cpu = busiest->cpu;
- env.src_rq = busiest;
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
update_h_load(env.src_cpu);
@@ -5345,7 +5541,25 @@ static struct {
static inline int find_new_ilb(int call_cpu)
{
+ struct sched_domain *sd;
int ilb = cpumask_first(nohz.idle_cpus_mask);
+ int buddy = per_cpu(sd_pack_buddy, call_cpu);
+
+ /*
+ * If we have a pack buddy CPU, we try to run load balance on a CPU
+ * that is close to the buddy.
+ */
+ if (buddy != -1)
+ for_each_domain(buddy, sd) {
+ if (sd->flags & SD_SHARE_CPUPOWER)
+ continue;
+
+ ilb = cpumask_first_and(sched_domain_span(sd),
+ nohz.idle_cpus_mask);
+
+ if (ilb < nr_cpu_ids)
+ break;
+ }
if (ilb < nr_cpu_ids && idle_cpu(ilb))
return ilb;
@@ -5392,31 +5606,39 @@ static inline void nohz_balance_exit_idle(int cpu)
static inline void set_cpu_sd_state_busy(void)
{
+ struct sched_domain_rq *sd_rq;
struct sched_domain *sd;
int cpu = smp_processor_id();
- if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
- return;
- clear_bit(NOHZ_IDLE, nohz_flags(cpu));
-
rcu_read_lock();
- for_each_domain(cpu, sd)
+ sd_rq = get_sched_domain_rq(cpu);
+
+ if (!sd_rq || !test_bit(NOHZ_IDLE, sched_rq_flags(sd_rq)))
+ goto unlock;
+ clear_bit(NOHZ_IDLE, sched_rq_flags(sd_rq));
+
+ for_each_domain_from_rq(sd_rq, sd)
atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+unlock:
rcu_read_unlock();
}
void set_cpu_sd_state_idle(void)
{
+ struct sched_domain_rq *sd_rq;
struct sched_domain *sd;
int cpu = smp_processor_id();
- if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
- return;
- set_bit(NOHZ_IDLE, nohz_flags(cpu));
-
rcu_read_lock();
- for_each_domain(cpu, sd)
+ sd_rq = get_sched_domain_rq(cpu);
+
+ if (!sd_rq || test_bit(NOHZ_IDLE, sched_rq_flags(sd_rq)))
+ goto unlock;
+ set_bit(NOHZ_IDLE, sched_rq_flags(sd_rq));
+
+ for_each_domain_from_rq(sd_rq, sd)
atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+unlock:
rcu_read_unlock();
}
@@ -5621,6 +5843,10 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
if (rq->nr_running >= 2)
goto need_kick;
+ /* load contrib is higher than cpu capacity */
+ if (rq->cfs.runnable_load_avg > rq->cpu_power)
+ goto need_kick;
+
rcu_read_lock();
for_each_domain(cpu, sd) {
struct sched_group *sg = sd->groups;
@@ -5673,7 +5899,12 @@ static void run_rebalance_domains(struct softirq_action *h)
static inline int on_null_domain(int cpu)
{
- return !rcu_dereference_sched(cpu_rq(cpu)->sd);
+ struct sched_domain_rq *sd_rq =
+ rcu_dereference_sched(cpu_rq(cpu)->sd_rq);
+ struct sched_domain *sd = NULL;
+ if (sd_rq)
+ sd = sd_rq->sd;
+ return !sd;
}
/*
@@ -6114,9 +6345,8 @@ const struct sched_class fair_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_fair,
-#ifdef CONFIG_FAIR_GROUP_SCHED
.migrate_task_rq = migrate_task_rq_fair,
-#endif
+
.rq_online = rq_online_fair,
.rq_offline = rq_offline_fair,
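To make the packing thresholds concrete, here is a small stand-alone program (the sample figures are invented for illustration; the two helpers mirror is_light_task() and is_buddy_busy() from the hunk above): a task is "light" when it runs less than roughly 20% of the time, and the buddy counts as busy at an ever lower load the more tasks it is already running.

#include <stdio.h>
#include <stdint.h>

/* Mirror of is_light_task(): light means running less than ~20%
 * of the time, expressed as sum * 5 < period. */
static int light(uint32_t runnable_avg_sum, uint32_t runnable_avg_period)
{
	return (runnable_avg_sum * 5) < runnable_avg_period;
}

/* Mirror of is_buddy_busy(): the buddy is busy once its running time
 * exceeds period / (nr_running + 2), so the bar drops as tasks pile up. */
static int busy(uint32_t sum, uint32_t period, unsigned int nr_running)
{
	if (sum > period)
		sum = period;
	return sum > (period / (nr_running + 2));
}

int main(void)
{
	printf("19%% task light?       %d\n", light(19, 100));	/* 1 */
	printf("25%% task light?       %d\n", light(25, 100));	/* 0 */
	printf("idle buddy at 40%%?    %d\n", busy(40, 100, 0));	/* 0: bar is 50 */
	printf("1-task buddy at 40%%?  %d\n", busy(40, 100, 1));	/* 1: bar is 33 */
	return 0;
}

The group comparison in update_packing_domain() follows the same integer-only style, cross-multiplying sgp->power and group_weight rather than dividing, to pick the group with the lowest power per core. The LB_MIN cutoff raised from 16 to 204 presumably tracks the switch of the compared load to runnable-average units (204 is about 20% of NICE_0_LOAD), though the patch does not state this explicitly.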
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1ad1d2b5395..4760a2d2da4 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -65,7 +65,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
SCHED_FEAT(FORCE_SD_OVERLAP, false)
SCHED_FEAT(RT_RUNTIME_SHARE, true)
-SCHED_FEAT(LB_MIN, false)
+SCHED_FEAT(LB_MIN, true)
/*
* Apply the automatic NUMA scheduling policy. Enabled automatically
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b6baf370cae..27cd379a754 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,6 +13,12 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
{
return task_cpu(p); /* IDLE tasks as never migrated */
}
+
+static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
+{
+ /* Update rq's load with elapsed idle time */
+ idle_exit(smp_processor_id(), rq);
+}
#endif /* CONFIG_SMP */
/*
* Idle tasks are unconditionally rescheduled:
@@ -86,6 +92,7 @@ const struct sched_class idle_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_idle,
+ .pre_schedule = pre_schedule_idle,
#endif
.set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cc03cfdf469..01833d81684 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -227,12 +227,6 @@ struct cfs_rq {
#endif
#ifdef CONFIG_SMP
-/*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
- * removed when useful for applications beyond shares distribution (e.g.
- * load-balance).
- */
-#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* CFS Load tracking
* Under CFS, load is tracked on a per-entity basis and aggregated up.
@@ -242,8 +236,7 @@ struct cfs_rq {
u64 runnable_load_avg, blocked_load_avg;
atomic64_t decay_counter, removed_load;
u64 last_decay;
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-/* These always depend on CONFIG_FAIR_GROUP_SCHED */
+
#ifdef CONFIG_FAIR_GROUP_SCHED
u32 tg_runnable_contrib;
u64 tg_load_contrib;
@@ -417,7 +410,7 @@ struct rq {
#ifdef CONFIG_SMP
struct root_domain *rd;
- struct sched_domain *sd;
+ struct sched_domain_rq *sd_rq;
unsigned long cpu_power;
@@ -505,21 +498,37 @@ DECLARE_PER_CPU(struct rq, runqueues);
#ifdef CONFIG_SMP
-#define rcu_dereference_check_sched_domain(p) \
+#define rcu_dereference_check_sched_domain_rq(p) \
rcu_dereference_check((p), \
lockdep_is_held(&sched_domains_mutex))
+#define get_sched_domain_rq(cpu) \
+ rcu_dereference_check_sched_domain_rq(cpu_rq(cpu)->sd_rq)
+
+#define rcu_dereference_check_sched_domain(cpu) ({ \
+ struct sched_domain_rq *__sd_rq = get_sched_domain_rq(cpu); \
+ struct sched_domain *__sd = NULL; \
+ if (__sd_rq) \
+ __sd = __sd_rq->sd; \
+ __sd; \
+})
+
+#define sched_rq_flags(sd_rq) (&sd_rq->flags)
+
/*
- * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * The domain tree (rq->sd_rq) is protected by RCU's quiescent state transition.
* See detach_destroy_domains: synchronize_sched for details.
*
* The domain tree of any CPU may only be accessed from within
* preempt-disabled sections.
*/
#define for_each_domain(cpu, __sd) \
- for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
+ for (__sd = rcu_dereference_check_sched_domain(cpu); \
__sd; __sd = __sd->parent)
+#define for_each_domain_from_rq(sd_rq, __sd) \
+ for (__sd = sd_rq->sd; __sd; __sd = __sd->parent)
+
#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
/**
@@ -879,6 +888,8 @@ extern const struct sched_class idle_sched_class;
extern void trigger_load_balance(struct rq *rq, int cpu);
extern void idle_balance(int this_cpu, struct rq *this_rq);
+extern void idle_exit(int this_cpu, struct rq *this_rq);
+extern void update_packing_domain(int cpu);
#else /* CONFIG_SMP */
@@ -886,6 +897,14 @@ static inline void idle_balance(int cpu, struct rq *rq)
{
}
+static inline void idle_exit(int this_cpu, struct rq *this_rq)
+{
+}
+
+static inline void update_packing_domain(int cpu)
+{
+}
+
#endif
extern void sysrq_sched_debug_show(void);
diff --git a/linaro/configs/big-LITTLE-MP.conf b/linaro/configs/big-LITTLE-MP.conf
new file mode 100644
index 00000000000..80bf45fa6e2
--- /dev/null
+++ b/linaro/configs/big-LITTLE-MP.conf
@@ -0,0 +1,5 @@
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_FAIR_GROUP_SCHED=y
+CONFIG_NO_HZ=y
+CONFIG_SCHED_MC=y