From 313c69010ed52a7bd52095708049e16c380c8e15 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 10 Jul 2012 14:47:10 +0100 Subject: configs: Add config fragments for big LITTLE MP This patch adds config fragments used to enable most of the features used by big LITTLE MP. Signed-off-by: Viresh Kumar --- linaro/configs/big-LITTLE-MP.conf | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 linaro/configs/big-LITTLE-MP.conf diff --git a/linaro/configs/big-LITTLE-MP.conf b/linaro/configs/big-LITTLE-MP.conf new file mode 100644 index 00000000000..25768457406 --- /dev/null +++ b/linaro/configs/big-LITTLE-MP.conf @@ -0,0 +1,4 @@ +CONFIG_CGROUPS=y +CONFIG_CGROUP_SCHED=y +CONFIG_FAIR_GROUP_SCHED=y +CONFIG_NO_HZ=y -- cgit v1.2.3 From 83952e566d1c41b2e0a70321e9d56cb8162f9a6d Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 11 Jul 2012 09:55:22 +0100 Subject: linaro/configs: Update big LITTLE MP fragment for task placement work CONFIG_HMP_FAST_CPU_MASK and CONFIG_HMP_SLOW_CPU_MASK must be set correctly for the user's platform. For now they are set to 0-1 and 2-3. Signed-off-by: Viresh Kumar --- linaro/configs/big-LITTLE-MP.conf | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/linaro/configs/big-LITTLE-MP.conf b/linaro/configs/big-LITTLE-MP.conf index 25768457406..df35474eff1 100644 --- a/linaro/configs/big-LITTLE-MP.conf +++ b/linaro/configs/big-LITTLE-MP.conf @@ -2,3 +2,8 @@ CONFIG_CGROUPS=y CONFIG_CGROUP_SCHED=y CONFIG_FAIR_GROUP_SCHED=y CONFIG_NO_HZ=y +CONFIG_SCHED_MC=y +CONFIG_DISABLE_CPU_SCHED_DOMAIN_BALANCE=y +CONFIG_SCHED_HMP=y +CONFIG_HMP_FAST_CPU_MASK="0-1" +CONFIG_HMP_SLOW_CPU_MASK="2-3" -- cgit v1.2.3 From c4a782d985da03016e2b15aaef90bfc5cfe426f6 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 12 Sep 2012 09:04:17 +0530 Subject: config-frag/big-LITTLE: Use device-tree to provide fast/slow CPU list for HMP Currently there are two ways of passing the list of fast and slow CPUs to the kernel: one via configs and the other via DT. The code tries to get them via configs first and then falls back to DT. To make it configurable via DT by default, make the config strings empty. Signed-off-by: Viresh Kumar Reported-by: Sudeep KarkadaNagesha --- linaro/configs/big-LITTLE-MP.conf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/linaro/configs/big-LITTLE-MP.conf b/linaro/configs/big-LITTLE-MP.conf index df35474eff1..df9cfa0554c 100644 --- a/linaro/configs/big-LITTLE-MP.conf +++ b/linaro/configs/big-LITTLE-MP.conf @@ -5,5 +5,5 @@ CONFIG_NO_HZ=y CONFIG_SCHED_MC=y CONFIG_DISABLE_CPU_SCHED_DOMAIN_BALANCE=y CONFIG_SCHED_HMP=y -CONFIG_HMP_FAST_CPU_MASK="0-1" -CONFIG_HMP_SLOW_CPU_MASK="2-3" +CONFIG_HMP_FAST_CPU_MASK="" +CONFIG_HMP_SLOW_CPU_MASK="" -- cgit v1.2.3 From 2732b0467a356e7cc08bf1e0f33481330335657a Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 10 Oct 2012 14:51:25 +0100 Subject: linaro/configs: Enable HMP priority filter by default This updates the Linaro config fragments to enable the HMP priority filter by default.
Signed-off-by: Morten Rasmussen --- linaro/configs/big-LITTLE-MP.conf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/linaro/configs/big-LITTLE-MP.conf b/linaro/configs/big-LITTLE-MP.conf index df9cfa0554c..d1c9da2354d 100644 --- a/linaro/configs/big-LITTLE-MP.conf +++ b/linaro/configs/big-LITTLE-MP.conf @@ -7,3 +7,5 @@ CONFIG_DISABLE_CPU_SCHED_DOMAIN_BALANCE=y CONFIG_SCHED_HMP=y CONFIG_HMP_FAST_CPU_MASK="" CONFIG_HMP_SLOW_CPU_MASK="" +CONFIG_SCHED_HMP_PRIO_FILTER=y +CONFIG_SCHED_HMP_PRIO_FILTER_VAL=5 -- cgit v1.2.3 From 7c88e2bd50fc093c96c912090b712b682ae72641 Mon Sep 17 00:00:00 2001 From: Liviu Dudau Date: Fri, 16 Nov 2012 18:32:44 +0000 Subject: linaro/configs: big-LITTLE-MP: Enable the new tunable sysfs interface by default. Enable the new tunable sysfs interface for HMP scaling invariants. Signed-off-by: Liviu Dudau --- linaro/configs/big-LITTLE-MP.conf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/linaro/configs/big-LITTLE-MP.conf b/linaro/configs/big-LITTLE-MP.conf index d1c9da2354d..8cc2be049a4 100644 --- a/linaro/configs/big-LITTLE-MP.conf +++ b/linaro/configs/big-LITTLE-MP.conf @@ -7,5 +7,7 @@ CONFIG_DISABLE_CPU_SCHED_DOMAIN_BALANCE=y CONFIG_SCHED_HMP=y CONFIG_HMP_FAST_CPU_MASK="" CONFIG_HMP_SLOW_CPU_MASK="" +CONFIG_HMP_VARIABLE_SCALE=y +CONFIG_HMP_FREQUENCY_INVARIANT_SCALE=y CONFIG_SCHED_HMP_PRIO_FILTER=y CONFIG_SCHED_HMP_PRIO_FILTER_VAL=5 -- cgit v1.2.3 From 8aa6a97721cc5146ee3e70cd477e10c4d0047d0c Mon Sep 17 00:00:00 2001 From: Lokesh Vutla Date: Wed, 13 Mar 2013 06:52:33 +0000 Subject: ARM: hw_breakpoint: Enable debug powerdown only if system supports 'has_ossr' Commit {9a6eb31 ARM: hw_breakpoint: Debug powerdown support for self-hosted debug} introduces debug powerdown support for self-hosted debug. While merging the patch, the 'has_ossr' check was removed; it is needed for hardware that doesn't support self-hosted debug. Pandaboard (A9) is one such platform, and Dietmar's original patch did mention this issue. Without that check, on Panda with CPUIDLE enabled, a flood of the messages below is thrown. [ 3.597930] hw-breakpoint: CPU 0 failed to disable vector catch [ 3.597991] hw-breakpoint: CPU 1 failed to disable vector catch So restore that check to avoid the mentioned issue. Cc: Dietmar Eggemann Cc: Will Deacon Reported-by: Santosh Shilimkar Acked-by: Santosh Shilimkar Signed-off-by: Lokesh Vutla --- arch/arm/kernel/hw_breakpoint.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index 1fd749ee4a1..1b803117ed9 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -1049,7 +1049,8 @@ static struct notifier_block dbg_cpu_pm_nb = { static void __init pm_init(void) { - cpu_pm_register_notifier(&dbg_cpu_pm_nb); + if (has_ossr) + cpu_pm_register_notifier(&dbg_cpu_pm_nb); } #else static inline void pm_init(void) -- cgit v1.2.3 From 1dfb365825bf076bc089b8198667c384a8ab89d4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 May 2012 16:59:47 +0200 Subject: genirq: Add default affinity mask command line option If we isolate CPUs, then we don't want random device interrupts on them. Even w/o the user space irq balancer enabled, we can end up with irqs on non-boot cpus. Allow restricting the default irq affinity mask.
Signed-off-by: Thomas Gleixner --- Documentation/kernel-parameters.txt | 9 +++++++++ kernel/irq/irqdesc.c | 21 +++++++++++++++++++-- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 2fe6e767b3d..a2a5f4111a3 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1240,6 +1240,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. See comment before ip2_setup() in drivers/char/ip2/ip2base.c. + irqaffinity= [SMP] Set the default irq affinity mask + Format: + ,..., + or + - + (must be a positive range in ascending order) + or a mixture + ,...,- + irqfixup [HW] When an interrupt is not handled search all handlers for it. Intended to get systems with badly broken diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 192a302d6cf..473b2b6eccb 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -23,10 +23,27 @@ static struct lock_class_key irq_desc_lock_class; #if defined(CONFIG_SMP) +static int __init irq_affinity_setup(char *str) +{ + zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); + cpulist_parse(str, irq_default_affinity); + /* + * Set at least the boot cpu. We don't want to end up with + * bugreports caused by random comandline masks + */ + cpumask_set_cpu(smp_processor_id(), irq_default_affinity); + return 1; +} +__setup("irqaffinity=", irq_affinity_setup); + static void __init init_irq_default_affinity(void) { - alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); - cpumask_setall(irq_default_affinity); +#ifdef CONFIG_CPUMASK_OFFSTACK + if (!irq_default_affinity) + zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); +#endif + if (cpumask_empty(irq_default_affinity)) + cpumask_setall(irq_default_affinity); } #else static void __init init_irq_default_affinity(void) -- cgit v1.2.3 From 0841c6ae0b53d43e4634cf4a1f88407b93c15399 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Fri, 21 Sep 2012 13:27:51 -0700 Subject: sched: implement usage tracking With the frame-work for runnable tracking now fully in place. Per-entity usage tracking is a simple and low-overhead addition. 
Signed-off-by: Paul Turner Reviewed-by: Ben Segall --- include/linux/sched.h | 1 + kernel/sched/debug.c | 3 +++ kernel/sched/fair.c | 33 ++++++++++++++++++++++++++++----- kernel/sched/sched.h | 4 ++-- 4 files changed, 34 insertions(+), 7 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 178a8d909f1..fe752ca19a8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -931,6 +931,7 @@ struct sched_avg { u64 last_runnable_update; s64 decay_count; unsigned long load_avg_contrib; + u32 usage_avg_sum; }; #ifdef CONFIG_SCHEDSTATS diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 75024a67352..fbd8caa83ef 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -94,6 +94,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group #ifdef CONFIG_SMP P(se->avg.runnable_avg_sum); P(se->avg.runnable_avg_period); + P(se->avg.usage_avg_sum); P(se->avg.load_avg_contrib); P(se->avg.decay_count); #endif @@ -223,6 +224,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->tg_runnable_contrib); SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", atomic_read(&cfs_rq->tg->runnable_avg)); + SEQ_printf(m, " .%-30s: %d\n", "tg->usage_avg", + atomic_read(&cfs_rq->tg->usage_avg)); #endif print_cfs_group_stats(m, cpu, cfs_rq->tg); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c61a614465c..98979ddf540 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1231,7 +1231,8 @@ static u32 __compute_runnable_contrib(u64 n) */ static __always_inline int __update_entity_runnable_avg(u64 now, struct sched_avg *sa, - int runnable) + int runnable, + int running) { u64 delta, periods; u32 runnable_contrib; @@ -1270,6 +1271,8 @@ static __always_inline int __update_entity_runnable_avg(u64 now, delta_w = 1024 - delta_w; if (runnable) sa->runnable_avg_sum += delta_w; + if (running) + sa->usage_avg_sum += delta_w; sa->runnable_avg_period += delta_w; delta -= delta_w; @@ -1282,17 +1285,22 @@ static __always_inline int __update_entity_runnable_avg(u64 now, periods + 1); sa->runnable_avg_period = decay_load(sa->runnable_avg_period, periods + 1); + sa->usage_avg_sum = decay_load(sa->usage_avg_sum, periods + 1); /* Efficiently calculate \sum (1..n_period) 1024*y^i */ runnable_contrib = __compute_runnable_contrib(periods); if (runnable) sa->runnable_avg_sum += runnable_contrib; + if (running) + sa->usage_avg_sum += runnable_contrib; sa->runnable_avg_period += runnable_contrib; } /* Remainder of delta accrued against u_0` */ if (runnable) sa->runnable_avg_sum += delta; + if (running) + sa->usage_avg_sum += delta; sa->runnable_avg_period += delta; return decayed; @@ -1338,16 +1346,28 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa, struct cfs_rq *cfs_rq) { struct task_group *tg = cfs_rq->tg; - long contrib; + long contrib, usage_contrib; /* The fraction of a cpu used by this cfs_rq */ contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT, sa->runnable_avg_period + 1); contrib -= cfs_rq->tg_runnable_contrib; - if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { + usage_contrib = div_u64(sa->usage_avg_sum << NICE_0_SHIFT, + sa->runnable_avg_period + 1); + usage_contrib -= cfs_rq->tg_usage_contrib; + + /* + * contrib/usage at this point represent deltas, only update if they + * are substantive. 
+ */ + if ((abs(contrib) > cfs_rq->tg_runnable_contrib / 64) || + (abs(usage_contrib) > cfs_rq->tg_usage_contrib / 64)) { atomic_add(contrib, &tg->runnable_avg); cfs_rq->tg_runnable_contrib += contrib; + + atomic_add(usage_contrib, &tg->usage_avg); + cfs_rq->tg_usage_contrib += usage_contrib; } } @@ -1453,7 +1473,8 @@ static inline void update_entity_load_avg(struct sched_entity *se, else now = cfs_rq_clock_task(group_cfs_rq(se)); - if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) + if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq, + cfs_rq->curr == se)) return; contrib_delta = __update_entity_load_avg_contrib(se); @@ -1497,7 +1518,8 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) static inline void update_rq_runnable_avg(struct rq *rq, int runnable) { - __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); + __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable, + runnable); __update_tg_runnable_avg(&rq->avg, &rq->cfs); } @@ -1886,6 +1908,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); + update_entity_load_avg(se, 1); } update_stats_curr_start(cfs_rq, se); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ce39224d615..e1d155c68df 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -142,7 +142,7 @@ struct task_group { atomic_t load_weight; atomic64_t load_avg; - atomic_t runnable_avg; + atomic_t runnable_avg, usage_avg; #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -279,7 +279,7 @@ struct cfs_rq { #endif /* CONFIG_FAIR_GROUP_SCHED */ /* These always depend on CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_FAIR_GROUP_SCHED - u32 tg_runnable_contrib; + u32 tg_runnable_contrib, tg_usage_contrib; u64 tg_load_contrib; #endif /* CONFIG_FAIR_GROUP_SCHED */ -- cgit v1.2.3 From be6ef1d56e70bfdfd79174d7d23a4b12d5b911ee Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Sep 2012 14:38:08 +0100 Subject: sched: entity load-tracking load_avg_ratio This patch adds load_avg_ratio to each task. The load_avg_ratio is a variant of load_avg_contrib which is not scaled by the task priority. It is calculated like this: runnable_avg_sum * NICE_0_LOAD / (runnable_avg_period + 1). 
Signed-off-by: Morten Rasmussen --- include/linux/sched.h | 1 + kernel/sched/fair.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index fe752ca19a8..207054a9867 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -931,6 +931,7 @@ struct sched_avg { u64 last_runnable_update; s64 decay_count; unsigned long load_avg_contrib; + unsigned long load_avg_ratio; u32 usage_avg_sum; }; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 98979ddf540..e6db20a013b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1428,6 +1428,9 @@ static inline void __update_task_entity_contrib(struct sched_entity *se) contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); contrib /= (se->avg.runnable_avg_period + 1); se->avg.load_avg_contrib = scale_load(contrib); + contrib = se->avg.runnable_avg_sum * scale_load_down(NICE_0_LOAD); + contrib /= (se->avg.runnable_avg_period + 1); + se->avg.load_avg_ratio = scale_load(contrib); } /* Compute the current contribution to load_avg by se, return any delta */ -- cgit v1.2.3 From 798e82cab1a39f4d75796be024c4d7b08bc062e8 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Sep 2012 14:38:09 +0100 Subject: sched: Task placement for heterogeneous systems based on task load-tracking This patch introduces the basic SCHED_HMP infrastructure. Each class of cpus is represented by a hmp_domain and tasks will only be moved between these domains when their load profiles suggest it is beneficial. SCHED_HMP relies heavily on the task load-tracking introduced in Paul Turner's fair group scheduling patch set: SCHED_HMP requires that the platform implements arch_get_hmp_domains() which should set up the platform-specific list of hmp_domains. It is also assumed that the platform disables SD_LOAD_BALANCE for the appropriate sched_domains. Task placement takes place every time a task is to be inserted into a runqueue, based on its load history. The task placement decision is based on load thresholds. There are no restrictions on the number of hmp_domains; however, multiple (>2) have not been tested and the up/down migration policy is rather simple. Signed-off-by: Morten Rasmussen --- arch/arm/Kconfig | 17 +++++ include/linux/sched.h | 6 ++ kernel/sched/fair.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 6 ++ 4 files changed, 197 insertions(+) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 136f263ed47..5fb586f4fcf 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1494,6 +1494,23 @@ config SCHED_SMT MultiThreading at a cost of slightly increased overhead in some places. If unsure say N here. +config DISABLE_CPU_SCHED_DOMAIN_BALANCE + bool "(EXPERIMENTAL) Disable CPU level scheduler load-balancing" + help + Disables scheduler load-balancing at CPU sched domain level. + +config SCHED_HMP + bool "(EXPERIMENTAL) Heterogenous multiprocessor scheduling" + depends on DISABLE_CPU_SCHED_DOMAIN_BALANCE && SCHED_MC && FAIR_GROUP_SCHED && !SCHED_AUTOGROUP + help + Experimental scheduler optimizations for heterogeneous platforms. + Attempts to introspectively select task affinity to optimize power + and performance. Basic support for multiple (>2) cpu types is in place, + but it has only been tested with two types of cpus. + There is currently no support for migration of task groups, hence + !SCHED_AUTOGROUP. Furthermore, normal load-balancing must be disabled + between cpus of different type (DISABLE_CPU_SCHED_DOMAIN_BALANCE). 
+ config HAVE_ARM_SCU bool help diff --git a/include/linux/sched.h b/include/linux/sched.h index 207054a9867..cfb9a2efc21 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -885,6 +885,12 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); bool cpus_share_cache(int this_cpu, int that_cpu); +#ifdef CONFIG_SCHED_HMP +struct hmp_domain { + struct cpumask cpus; + struct list_head hmp_domains; +}; +#endif /* CONFIG_SCHED_HMP */ #else /* CONFIG_SMP */ struct sched_domain_attr; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e6db20a013b..7d192bd8242 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3340,6 +3340,125 @@ done: return target; } +#ifdef CONFIG_SCHED_HMP +/* + * Heterogenous multiprocessor (HMP) optimizations + * + * The cpu types are distinguished using a list of hmp_domains + * which each represent one cpu type using a cpumask. + * The list is assumed ordered by compute capacity with the + * fastest domain first. + */ +DEFINE_PER_CPU(struct hmp_domain *, hmp_cpu_domain); + +extern void __init arch_get_hmp_domains(struct list_head *hmp_domains_list); + +/* Setup hmp_domains */ +static int __init hmp_cpu_mask_setup(void) +{ + char buf[64]; + struct hmp_domain *domain; + struct list_head *pos; + int dc, cpu; + + pr_debug("Initializing HMP scheduler:\n"); + + /* Initialize hmp_domains using platform code */ + arch_get_hmp_domains(&hmp_domains); + if (list_empty(&hmp_domains)) { + pr_debug("HMP domain list is empty!\n"); + return 0; + } + + /* Print hmp_domains */ + dc = 0; + list_for_each(pos, &hmp_domains) { + domain = list_entry(pos, struct hmp_domain, hmp_domains); + cpulist_scnprintf(buf, 64, &domain->cpus); + pr_debug(" HMP domain %d: %s\n", dc, buf); + + for_each_cpu_mask(cpu, domain->cpus) { + per_cpu(hmp_cpu_domain, cpu) = domain; + } + dc++; + } + + return 1; +} + +/* + * Migration thresholds should be in the range [0..1023] + * hmp_up_threshold: min. load required for migrating tasks to a faster cpu + * hmp_down_threshold: max. load allowed for tasks migrating to a slower cpu + * The default values (512, 256) offer good responsiveness, but may need + * tweaking suit particular needs. 
+ */ +unsigned int hmp_up_threshold = 512; +unsigned int hmp_down_threshold = 256; + +static unsigned int hmp_up_migration(int cpu, struct sched_entity *se); +static unsigned int hmp_down_migration(int cpu, struct sched_entity *se); + +/* Check if cpu is in fastest hmp_domain */ +static inline unsigned int hmp_cpu_is_fastest(int cpu) +{ + struct list_head *pos; + + pos = &hmp_cpu_domain(cpu)->hmp_domains; + return pos == hmp_domains.next; +} + +/* Check if cpu is in slowest hmp_domain */ +static inline unsigned int hmp_cpu_is_slowest(int cpu) +{ + struct list_head *pos; + + pos = &hmp_cpu_domain(cpu)->hmp_domains; + return list_is_last(pos, &hmp_domains); +} + +/* Next (slower) hmp_domain relative to cpu */ +static inline struct hmp_domain *hmp_slower_domain(int cpu) +{ + struct list_head *pos; + + pos = &hmp_cpu_domain(cpu)->hmp_domains; + return list_entry(pos->next, struct hmp_domain, hmp_domains); +} + +/* Previous (faster) hmp_domain relative to cpu */ +static inline struct hmp_domain *hmp_faster_domain(int cpu) +{ + struct list_head *pos; + + pos = &hmp_cpu_domain(cpu)->hmp_domains; + return list_entry(pos->prev, struct hmp_domain, hmp_domains); +} + +/* + * Selects a cpu in previous (faster) hmp_domain + * Note that cpumask_any_and() returns the first cpu in the cpumask + */ +static inline unsigned int hmp_select_faster_cpu(struct task_struct *tsk, + int cpu) +{ + return cpumask_any_and(&hmp_faster_domain(cpu)->cpus, + tsk_cpus_allowed(tsk)); +} + +/* + * Selects a cpu in next (slower) hmp_domain + * Note that cpumask_any_and() returns the first cpu in the cpumask + */ +static inline unsigned int hmp_select_slower_cpu(struct task_struct *tsk, + int cpu) +{ + return cpumask_any_and(&hmp_slower_domain(cpu)->cpus, + tsk_cpus_allowed(tsk)); +} + +#endif /* CONFIG_SCHED_HMP */ + /* * sched_balance_self: balance the current task (running on cpu) in domains * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and @@ -3438,6 +3557,16 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) unlock: rcu_read_unlock(); +#ifdef CONFIG_SCHED_HMP + if (hmp_up_migration(prev_cpu, &p->se)) + return hmp_select_faster_cpu(p, prev_cpu); + if (hmp_down_migration(prev_cpu, &p->se)) + return hmp_select_slower_cpu(p, prev_cpu); + /* Make sure that the task stays in its previous hmp domain */ + if (!cpumask_test_cpu(new_cpu, &hmp_cpu_domain(prev_cpu)->cpus)) + return prev_cpu; +#endif + return new_cpu; } @@ -5708,6 +5837,41 @@ need_kick: static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } #endif +#ifdef CONFIG_SCHED_HMP +/* Check if task should migrate to a faster cpu */ +static unsigned int hmp_up_migration(int cpu, struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + + if (hmp_cpu_is_fastest(cpu)) + return 0; + + if (cpumask_intersects(&hmp_faster_domain(cpu)->cpus, + tsk_cpus_allowed(p)) + && se->avg.load_avg_ratio > hmp_up_threshold) { + return 1; + } + return 0; +} + +/* Check if task should migrate to a slower cpu */ +static unsigned int hmp_down_migration(int cpu, struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + + if (hmp_cpu_is_slowest(cpu)) + return 0; + + if (cpumask_intersects(&hmp_slower_domain(cpu)->cpus, + tsk_cpus_allowed(p)) + && se->avg.load_avg_ratio < hmp_down_threshold) { + return 1; + } + return 0; +} + +#endif /* CONFIG_SCHED_HMP */ + /* * run_rebalance_domains is triggered when needed from the scheduler tick. * Also triggered for nohz idle balancing (with nohz_balancing_kick set). 
@@ -6218,6 +6382,10 @@ __init void init_sched_fair_class(void) zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); cpu_notifier(sched_ilb_notifier, 0); #endif + +#ifdef CONFIG_SCHED_HMP + hmp_cpu_mask_setup(); +#endif #endif /* SMP */ } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e1d155c68df..2e90c98900d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -642,6 +642,12 @@ static inline unsigned int group_first_cpu(struct sched_group *group) extern int group_balance_cpu(struct sched_group *sg); +#ifdef CONFIG_SCHED_HMP +static LIST_HEAD(hmp_domains); +DECLARE_PER_CPU(struct hmp_domain *, hmp_cpu_domain); +#define hmp_cpu_domain(cpu) (per_cpu(hmp_cpu_domain, (cpu))) +#endif /* CONFIG_SCHED_HMP */ + #endif /* CONFIG_SMP */ #include "stats.h" -- cgit v1.2.3 From 2dd22b22c95851445c189c3d4708c027aa19cf5f Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Sep 2012 14:38:10 +0100 Subject: sched: Forced task migration on heterogeneous systems This patch introduces forced task migration for moving suitable currently running tasks between hmp_domains. Task behaviour is likely to change over time. Tasks running in a less capable hmp_domain may change to become more demanding and should therefore be migrated up. They are unlikely go through the select_task_rq_fair() path anytime soon and therefore need special attention. This patch introduces a period check (SCHED_TICK) of the currently running task on all runqueues and sets up a forced migration using stop_machine_no_wait() if the task needs to be migrated. Ideally, this should not be implemented by polling all runqueues. Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 196 ++++++++++++++++++++++++++++++++++++++++++++++++++- kernel/sched/sched.h | 3 + 2 files changed, 198 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7d192bd8242..2d88b043c9e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4100,7 +4100,6 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 1) task is cache cold, or * 2) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); if (!tsk_cache_hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { @@ -5870,6 +5869,199 @@ static unsigned int hmp_down_migration(int cpu, struct sched_entity *se) return 0; } +/* + * hmp_can_migrate_task - may task p from runqueue rq be migrated to this_cpu? + * Ideally this function should be merged with can_migrate_task() to avoid + * redundant code. + */ +static int hmp_can_migrate_task(struct task_struct *p, struct lb_env *env) +{ + int tsk_cache_hot = 0; + + /* + * We do not migrate tasks that are: + * 1) running (obviously), or + * 2) cannot be migrated to this CPU due to cpus_allowed + */ + if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { + schedstat_inc(p, se.statistics.nr_failed_migrations_affine); + return 0; + } + env->flags &= ~LBF_ALL_PINNED; + + if (task_running(env->src_rq, p)) { + schedstat_inc(p, se.statistics.nr_failed_migrations_running); + return 0; + } + + /* + * Aggressive migration if: + * 1) task is cache cold, or + * 2) too many balance attempts have failed. 
+ */ + + tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); + if (!tsk_cache_hot || + env->sd->nr_balance_failed > env->sd->cache_nice_tries) { +#ifdef CONFIG_SCHEDSTATS + if (tsk_cache_hot) { + schedstat_inc(env->sd, lb_hot_gained[env->idle]); + schedstat_inc(p, se.statistics.nr_forced_migrations); + } +#endif + return 1; + } + + return 1; +} + +/* + * move_specific_task tries to move a specific task. + * Returns 1 if successful and 0 otherwise. + * Called with both runqueues locked. + */ +static int move_specific_task(struct lb_env *env, struct task_struct *pm) +{ + struct task_struct *p, *n; + + list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { + if (throttled_lb_pair(task_group(p), env->src_rq->cpu, + env->dst_cpu)) + continue; + + if (!hmp_can_migrate_task(p, env)) + continue; + /* Check if we found the right task */ + if (p != pm) + continue; + + move_task(p, env); + /* + * Right now, this is only the third place move_task() + * is called, so we can safely collect move_task() + * stats here rather than inside move_task(). + */ + schedstat_inc(env->sd, lb_gained[env->idle]); + return 1; + } + return 0; +} + +/* + * hmp_active_task_migration_cpu_stop is run by cpu stopper and used to + * migrate a specific task from one runqueue to another. + * hmp_force_up_migration uses this to push a currently running task + * off a runqueue. + * Based on active_load_balance_stop_cpu and can potentially be merged. + */ +static int hmp_active_task_migration_cpu_stop(void *data) +{ + struct rq *busiest_rq = data; + struct task_struct *p = busiest_rq->migrate_task; + int busiest_cpu = cpu_of(busiest_rq); + int target_cpu = busiest_rq->push_cpu; + struct rq *target_rq = cpu_rq(target_cpu); + struct sched_domain *sd; + + raw_spin_lock_irq(&busiest_rq->lock); + /* make sure the requested cpu hasn't gone down in the meantime */ + if (unlikely(busiest_cpu != smp_processor_id() || + !busiest_rq->active_balance)) { + goto out_unlock; + } + /* Is there any task to move? */ + if (busiest_rq->nr_running <= 1) + goto out_unlock; + /* Task has migrated meanwhile, abort forced migration */ + if (task_rq(p) != busiest_rq) + goto out_unlock; + /* + * This condition is "impossible", if it occurs + * we need to fix it. Originally reported by + * Bjorn Helgaas on a 128-cpu setup. + */ + BUG_ON(busiest_rq == target_rq); + + /* move a task from busiest_rq to target_rq */ + double_lock_balance(busiest_rq, target_rq); + + /* Search for an sd spanning us and the target CPU. */ + rcu_read_lock(); + for_each_domain(target_cpu, sd) { + if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) + break; + } + + if (likely(sd)) { + struct lb_env env = { + .sd = sd, + .dst_cpu = target_cpu, + .dst_rq = target_rq, + .src_cpu = busiest_rq->cpu, + .src_rq = busiest_rq, + .idle = CPU_IDLE, + }; + + schedstat_inc(sd, alb_count); + + if (move_specific_task(&env, p)) + schedstat_inc(sd, alb_pushed); + else + schedstat_inc(sd, alb_failed); + } + rcu_read_unlock(); + double_unlock_balance(busiest_rq, target_rq); +out_unlock: + busiest_rq->active_balance = 0; + raw_spin_unlock_irq(&busiest_rq->lock); + return 0; +} + +static DEFINE_SPINLOCK(hmp_force_migration); + +/* + * hmp_force_up_migration checks runqueues for tasks that need to + * be actively migrated to a faster cpu. 
+ */ +static void hmp_force_up_migration(int this_cpu) +{ + int cpu; + struct sched_entity *curr; + struct rq *target; + unsigned long flags; + unsigned int force; + struct task_struct *p; + + if (!spin_trylock(&hmp_force_migration)) + return; + for_each_online_cpu(cpu) { + force = 0; + target = cpu_rq(cpu); + raw_spin_lock_irqsave(&target->lock, flags); + curr = target->cfs.curr; + if (!curr || !entity_is_task(curr)) { + raw_spin_unlock_irqrestore(&target->lock, flags); + continue; + } + p = task_of(curr); + if (hmp_up_migration(cpu, curr)) { + if (!target->active_balance) { + target->active_balance = 1; + target->push_cpu = hmp_select_faster_cpu(p, cpu); + target->migrate_task = p; + force = 1; + } + } + raw_spin_unlock_irqrestore(&target->lock, flags); + if (force) + stop_one_cpu_nowait(cpu_of(target), + hmp_active_task_migration_cpu_stop, + target, &target->active_balance_work); + } + spin_unlock(&hmp_force_migration); +} +#else +static void hmp_force_up_migration(int this_cpu) { } #endif /* CONFIG_SCHED_HMP */ /* @@ -5883,6 +6075,8 @@ static void run_rebalance_domains(struct softirq_action *h) enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE; + hmp_force_up_migration(this_cpu); + rebalance_domains(this_cpu, idle); /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2e90c98900d..27f51ac8670 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -464,6 +464,9 @@ struct rq { int active_balance; int push_cpu; struct cpu_stop_work active_balance_work; +#ifdef CONFIG_SCHED_HMP + struct task_struct *migrate_task; +#endif /* cpu of this runqueue: */ int cpu; int online; -- cgit v1.2.3 From 943106d9437fcced79c4e48ed794410e5f750b4c Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Sep 2012 14:38:11 +0100 Subject: sched: Introduce priority-based task migration filter Introduces a priority threshold which prevents low priority task from migrating to faster hmp_domains (cpus). This is useful for user-space software which assigns lower task priority to background task. Signed-off-by: Morten Rasmussen --- arch/arm/Kconfig | 13 +++++++++++++ kernel/sched/fair.c | 17 +++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 5fb586f4fcf..6157ed8e0e3 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1511,6 +1511,19 @@ config SCHED_HMP !SCHED_AUTOGROUP. Furthermore, normal load-balancing must be disabled between cpus of different type (DISABLE_CPU_SCHED_DOMAIN_BALANCE). +config SCHED_HMP_PRIO_FILTER + bool "(EXPERIMENTAL) Filter HMP migrations by task priority" + depends on SCHED_HMP + help + Enables task priority based HMP migration filter. Any task with + a NICE value above the threshold will always be on low-power cpus + with less compute capacity. + +config SCHED_HMP_PRIO_FILTER_VAL + int "NICE priority threshold" + default 5 + depends on SCHED_HMP_PRIO_FILTER + config HAVE_ARM_SCU bool help diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2d88b043c9e..a6b451d9cd3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3392,9 +3392,14 @@ static int __init hmp_cpu_mask_setup(void) * hmp_down_threshold: max. load allowed for tasks migrating to a slower cpu * The default values (512, 256) offer good responsiveness, but may need * tweaking suit particular needs. 
+ * + * hmp_up_prio: Only up migrate task with high priority (prio >= hmp_up_prio) + return 0; +#endif + if (cpumask_intersects(&hmp_faster_domain(cpu)->cpus, tsk_cpus_allowed(p)) && se->avg.load_avg_ratio > hmp_up_threshold) { @@ -5861,6 +5872,12 @@ static unsigned int hmp_down_migration(int cpu, struct sched_entity *se) if (hmp_cpu_is_slowest(cpu)) return 0; +#ifdef CONFIG_SCHED_HMP_PRIO_FILTER + /* Filter by task priority */ + if (p->prio >= hmp_up_prio) + return 1; +#endif + if (cpumask_intersects(&hmp_slower_domain(cpu)->cpus, tsk_cpus_allowed(p)) && se->avg.load_avg_ratio < hmp_down_threshold) { -- cgit v1.2.3 From d278bb1c4d5191e0d9b9911337e3b31a100a7f9f Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Sep 2012 14:38:12 +0100 Subject: ARM: Add HMP scheduling support for ARM architecture Adds Kconfig entries to enable HMP scheduling on ARM platforms. Currently, it disables CPU level sched_domain load-balacing in order to simplify things. This needs fixing in a later revision. HMP scheduling will do the load-balancing at this level instead. Signed-off-by: Morten Rasmussen --- arch/arm/Kconfig | 14 ++++++++++++++ arch/arm/include/asm/topology.h | 31 +++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 6157ed8e0e3..99d41f9633b 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1524,6 +1524,20 @@ config SCHED_HMP_PRIO_FILTER_VAL default 5 depends on SCHED_HMP_PRIO_FILTER +config HMP_FAST_CPU_MASK + string "HMP scheduler fast CPU mask" + depends on SCHED_HMP + help + Specify the cpuids of the fast CPUs in the system as a list string, + e.g. cpuid 0+1 should be specified as 0-1. + +config HMP_SLOW_CPU_MASK + string "HMP scheduler slow CPU mask" + depends on SCHED_HMP + help + Specify the cpuids of the slow CPUs in the system as a list string, + e.g. cpuid 0+1 should be specified as 0-1. + config HAVE_ARM_SCU bool help diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index 58b8b84adcd..5692ba11322 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -27,6 +27,37 @@ void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); +#ifdef CONFIG_DISABLE_CPU_SCHED_DOMAIN_BALANCE +/* Common values for CPUs */ +#ifndef SD_CPU_INIT +#define SD_CPU_INIT (struct sched_domain) { \ + .min_interval = 1, \ + .max_interval = 4, \ + .busy_factor = 64, \ + .imbalance_pct = 125, \ + .cache_nice_tries = 1, \ + .busy_idx = 2, \ + .idle_idx = 1, \ + .newidle_idx = 0, \ + .wake_idx = 0, \ + .forkexec_idx = 0, \ + \ + .flags = 0*SD_LOAD_BALANCE \ + | 1*SD_BALANCE_NEWIDLE \ + | 1*SD_BALANCE_EXEC \ + | 1*SD_BALANCE_FORK \ + | 0*SD_BALANCE_WAKE \ + | 1*SD_WAKE_AFFINE \ + | 0*SD_SHARE_CPUPOWER \ + | 0*SD_SHARE_PKG_RESOURCES \ + | 0*SD_SERIALIZE \ + , \ + .last_balance = jiffies, \ + .balance_interval = 1, \ +} +#endif +#endif /* CONFIG_DISABLE_CPU_SCHED_DOMAIN_BALANCE */ + #else static inline void init_cpu_topology(void) { } -- cgit v1.2.3 From dc68bd92107d8990f4608d8f42744770fe203f7f Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Sep 2012 14:38:13 +0100 Subject: ARM: sched: Use device-tree to provide fast/slow CPU list for HMP We can't rely on Kconfig options to set the fast and slow CPU lists for HMP scheduling if we want a single kernel binary to support multiple devices with different CPU topology. E.g. TC2 (ARM's Test-Chip-2 big.LITTLE system), Fast Models, or even non big.LITTLE devices. 
This patch adds the function arch_get_fast_and_slow_cpus() to generate the lists at run-time by parsing the CPU nodes in device-tree; it assumes slow cores are A7s and everything else is fast. The function still supports the old Kconfig options as this is useful for testing the HMP scheduler on devices without big.LITTLE. This patch is reuse of a patch by Jon Medhurst with a few bits left out. Signed-off-by: Morten Rasmussen --- arch/arm/Kconfig | 4 ++- arch/arm/kernel/topology.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 1 deletion(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 99d41f9633b..bc19fa61b90 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1528,13 +1528,15 @@ config HMP_FAST_CPU_MASK string "HMP scheduler fast CPU mask" depends on SCHED_HMP help - Specify the cpuids of the fast CPUs in the system as a list string, + Leave empty to use device tree information. + Specify the cpuids of the fast CPUs in the system as a list string, e.g. cpuid 0+1 should be specified as 0-1. config HMP_SLOW_CPU_MASK string "HMP scheduler slow CPU mask" depends on SCHED_HMP help + Leave empty to use device tree information. Specify the cpuids of the slow CPUs in the system as a list string, e.g. cpuid 0+1 should be specified as 0-1. diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index c5a59546a25..e6e430a567c 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -289,6 +289,75 @@ void store_cpu_topology(unsigned int cpuid) cpu_topology[cpuid].socket_id, mpidr); } + +#ifdef CONFIG_SCHED_HMP + +static const char * const little_cores[] = { + "arm,cortex-a7", + NULL, +}; + +static bool is_little_cpu(struct device_node *cn) +{ + const char * const *lc; + for (lc = little_cores; *lc; lc++) + if (of_device_is_compatible(cn, *lc)) + return true; + return false; +} + +void __init arch_get_fast_and_slow_cpus(struct cpumask *fast, + struct cpumask *slow) +{ + struct device_node *cn = NULL; + int cpu = 0; + + cpumask_clear(fast); + cpumask_clear(slow); + + /* + * Use the config options if they are given. This helps testing + * HMP scheduling on systems without a big.LITTLE architecture. + */ + if (strlen(CONFIG_HMP_FAST_CPU_MASK) && strlen(CONFIG_HMP_SLOW_CPU_MASK)) { + if (cpulist_parse(CONFIG_HMP_FAST_CPU_MASK, fast)) + WARN(1, "Failed to parse HMP fast cpu mask!\n"); + if (cpulist_parse(CONFIG_HMP_SLOW_CPU_MASK, slow)) + WARN(1, "Failed to parse HMP slow cpu mask!\n"); + return; + } + + /* + * Else, parse device tree for little cores. + */ + while ((cn = of_find_node_by_type(cn, "cpu"))) { + + if (cpu >= num_possible_cpus()) + break; + + if (is_little_cpu(cn)) + cpumask_set_cpu(cpu, slow); + else + cpumask_set_cpu(cpu, fast); + + cpu++; + } + + if (!cpumask_empty(fast) && !cpumask_empty(slow)) + return; + + /* + * We didn't find both big and little cores so let's call all cores + * fast as this will keep the system running, with all cores being + * treated equal. + */ + cpumask_setall(fast); + cpumask_clear(slow); +} + +#endif /* CONFIG_SCHED_HMP */ + + /* * init_cpu_topology is called at boot when only one cpu is running * which prevent simultaneous write access to cpu_topology array -- cgit v1.2.3 From 1baaccf456ece33b8fa02f8cdf3977d6a95b393c Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Sep 2012 14:38:14 +0100 Subject: ARM: sched: Setup SCHED_HMP domains SCHED_HMP requires the different cpu types to be represented by an ordered list of hmp_domains. 
Each hmp_domain represents all cpus of a particular type using a cpumask. The list is platform specific and therefore must be generated by platform code by implementing arch_get_hmp_domains(). Signed-off-by: Morten Rasmussen --- arch/arm/kernel/topology.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index e6e430a567c..627e14e44af 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -355,6 +355,28 @@ void __init arch_get_fast_and_slow_cpus(struct cpumask *fast, cpumask_clear(slow); } +void __init arch_get_hmp_domains(struct list_head *hmp_domains_list) +{ + struct cpumask hmp_fast_cpu_mask; + struct cpumask hmp_slow_cpu_mask; + struct hmp_domain *domain; + + arch_get_fast_and_slow_cpus(&hmp_fast_cpu_mask, &hmp_slow_cpu_mask); + + /* + * Initialize hmp_domains + * Must be ordered with respect to compute capacity. + * Fastest domain at head of list. + */ + domain = (struct hmp_domain *) + kmalloc(sizeof(struct hmp_domain), GFP_KERNEL); + cpumask_copy(&domain->cpus, &hmp_slow_cpu_mask); + list_add(&domain->hmp_domains, hmp_domains_list); + domain = (struct hmp_domain *) + kmalloc(sizeof(struct hmp_domain), GFP_KERNEL); + cpumask_copy(&domain->cpus, &hmp_fast_cpu_mask); + list_add(&domain->hmp_domains, hmp_domains_list); +} #endif /* CONFIG_SCHED_HMP */ -- cgit v1.2.3 From b9d3d5612899de4f8372ecfbc4c8f4ba5aa170ec Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Sep 2012 14:38:15 +0100 Subject: sched: Add ftrace events for entity load-tracking Adds ftrace events for key variables related to the entity load-tracking to help debugging scheduler behaviour. Allows tracing of load contribution and runqueue residency ratio for both entities and runqueues as well as entity CPU usage ratio. Signed-off-by: Morten Rasmussen --- include/trace/events/sched.h | 125 +++++++++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 7 +++ 2 files changed, 132 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index e5586caff67..496445daa54 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -430,6 +430,131 @@ TRACE_EVENT(sched_pi_setprio, __entry->oldprio, __entry->newprio) ); +/* + * Tracepoint for showing tracked load contribution. + */ +TRACE_EVENT(sched_task_load_contrib, + + TP_PROTO(struct task_struct *tsk, unsigned long load_contrib), + + TP_ARGS(tsk, load_contrib), + + TP_STRUCT__entry( + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __field(unsigned long, load_contrib) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->load_contrib = load_contrib; + ), + + TP_printk("comm=%s pid=%d load_contrib=%lu", + __entry->comm, __entry->pid, + __entry->load_contrib) +); + +/* + * Tracepoint for showing tracked task runnable ratio [0..1023]. + */ +TRACE_EVENT(sched_task_runnable_ratio, + + TP_PROTO(struct task_struct *tsk, unsigned long ratio), + + TP_ARGS(tsk, ratio), + + TP_STRUCT__entry( + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __field(unsigned long, ratio) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->ratio = ratio; + ), + + TP_printk("comm=%s pid=%d ratio=%lu", + __entry->comm, __entry->pid, + __entry->ratio) +); + +/* + * Tracepoint for showing tracked rq runnable ratio [0..1023]. 
+ */ +TRACE_EVENT(sched_rq_runnable_ratio, + + TP_PROTO(int cpu, unsigned long ratio), + + TP_ARGS(cpu, ratio), + + TP_STRUCT__entry( + __field(int, cpu) + __field(unsigned long, ratio) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->ratio = ratio; + ), + + TP_printk("cpu=%d ratio=%lu", + __entry->cpu, + __entry->ratio) +); + +/* + * Tracepoint for showing tracked rq runnable load. + */ +TRACE_EVENT(sched_rq_runnable_load, + + TP_PROTO(int cpu, u64 load), + + TP_ARGS(cpu, load), + + TP_STRUCT__entry( + __field(int, cpu) + __field(u64, load) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->load = load; + ), + + TP_printk("cpu=%d load=%llu", + __entry->cpu, + __entry->load) +); + +/* + * Tracepoint for showing tracked task cpu usage ratio [0..1023]. + */ +TRACE_EVENT(sched_task_usage_ratio, + + TP_PROTO(struct task_struct *tsk, unsigned long ratio), + + TP_ARGS(tsk, ratio), + + TP_STRUCT__entry( + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __field(unsigned long, ratio) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->ratio = ratio; + ), + + TP_printk("comm=%s pid=%d ratio=%lu", + __entry->comm, __entry->pid, + __entry->ratio) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a6b451d9cd3..928268df354 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1428,9 +1428,11 @@ static inline void __update_task_entity_contrib(struct sched_entity *se) contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); contrib /= (se->avg.runnable_avg_period + 1); se->avg.load_avg_contrib = scale_load(contrib); + trace_sched_task_load_contrib(task_of(se), se->avg.load_avg_contrib); contrib = se->avg.runnable_avg_sum * scale_load_down(NICE_0_LOAD); contrib /= (se->avg.runnable_avg_period + 1); se->avg.load_avg_ratio = scale_load(contrib); + trace_sched_task_runnable_ratio(task_of(se), se->avg.load_avg_ratio); } /* Compute the current contribution to load_avg by se, return any delta */ @@ -1521,9 +1523,14 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) static inline void update_rq_runnable_avg(struct rq *rq, int runnable) { + u32 contrib; __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable, runnable); __update_tg_runnable_avg(&rq->avg, &rq->cfs); + contrib = rq->avg.runnable_avg_sum * scale_load_down(1024); + contrib /= (rq->avg.runnable_avg_period + 1); + trace_sched_rq_runnable_ratio(cpu_of(rq), scale_load(contrib)); + trace_sched_rq_runnable_load(cpu_of(rq), rq->cfs.runnable_load_avg); } /* Add the load generated by se into cfs_rq's child load-average */ -- cgit v1.2.3 From 0d811e649ad31994e8f06b6b18101f249b34e912 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Sep 2012 14:38:16 +0100 Subject: sched: Add HMP task migration ftrace event Adds ftrace event for tracing task migrations using HMP optimized scheduling. Signed-off-by: Morten Rasmussen --- include/trace/events/sched.h | 28 ++++++++++++++++++++++++++++ kernel/sched/fair.c | 15 +++++++++++---- 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 496445daa54..203e8e9933b 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -555,6 +555,34 @@ TRACE_EVENT(sched_task_usage_ratio, __entry->comm, __entry->pid, __entry->ratio) ); + +/* + * Tracepoint for HMP (CONFIG_SCHED_HMP) task migrations. 
+ */ +TRACE_EVENT(sched_hmp_migrate, + + TP_PROTO(struct task_struct *tsk, int dest, int force), + + TP_ARGS(tsk, dest, force), + + TP_STRUCT__entry( + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __field(int, dest) + __field(int, force) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->dest = dest; + __entry->force = force; + ), + + TP_printk("comm=%s pid=%d dest=%d force=%d", + __entry->comm, __entry->pid, + __entry->dest, __entry->force) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 928268df354..ecbeb90adcf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3570,10 +3570,16 @@ unlock: rcu_read_unlock(); #ifdef CONFIG_SCHED_HMP - if (hmp_up_migration(prev_cpu, &p->se)) - return hmp_select_faster_cpu(p, prev_cpu); - if (hmp_down_migration(prev_cpu, &p->se)) - return hmp_select_slower_cpu(p, prev_cpu); + if (hmp_up_migration(prev_cpu, &p->se)) { + new_cpu = hmp_select_faster_cpu(p, prev_cpu); + trace_sched_hmp_migrate(p, new_cpu, 0); + return new_cpu; + } + if (hmp_down_migration(prev_cpu, &p->se)) { + new_cpu = hmp_select_slower_cpu(p, prev_cpu); + trace_sched_hmp_migrate(p, new_cpu, 0); + return new_cpu; + } /* Make sure that the task stays in its previous hmp domain */ if (!cpumask_test_cpu(new_cpu, &hmp_cpu_domain(prev_cpu)->cpus)) return prev_cpu; @@ -6074,6 +6080,7 @@ static void hmp_force_up_migration(int this_cpu) target->push_cpu = hmp_select_faster_cpu(p, cpu); target->migrate_task = p; force = 1; + trace_sched_hmp_migrate(p, target->push_cpu, 1); } } raw_spin_unlock_irqrestore(&target->lock, flags); -- cgit v1.2.3 From 76525733b4f4e0fdcc188dfe23941024ae626979 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Sep 2012 14:38:17 +0100 Subject: sched: SCHED_HMP multi-domain task migration control We need a way to prevent tasks that are migrating up and down the hmp_domains from migrating straight on through before the load has adapted to the new compute capacity of the CPU on the new hmp_domain. This patch adds a next up/down migration delay that prevents the task from doing another migration in the same direction until the delay has expired. 
Signed-off-by: Morten Rasmussen --- include/linux/sched.h | 4 ++++ kernel/sched/core.c | 4 ++++ kernel/sched/fair.c | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index cfb9a2efc21..5e903596e48 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -938,6 +938,10 @@ struct sched_avg { s64 decay_count; unsigned long load_avg_contrib; unsigned long load_avg_ratio; +#ifdef CONFIG_SCHED_HMP + u64 hmp_last_up_migration; + u64 hmp_last_down_migration; +#endif u32 usage_avg_sum; }; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e8b335016c5..e45295dc33a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1617,6 +1617,10 @@ static void __sched_fork(struct task_struct *p) #if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) p->se.avg.runnable_avg_period = 0; p->se.avg.runnable_avg_sum = 0; +#ifdef CONFIG_SCHED_HMP + p->se.avg.hmp_last_up_migration = 0; + p->se.avg.hmp_last_down_migration = 0; +#endif #endif #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ecbeb90adcf..fe2408f25e2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3401,12 +3401,16 @@ static int __init hmp_cpu_mask_setup(void) * tweaking suit particular needs. * * hmp_up_prio: Only up migrate task with high priority (cfs; + + se->avg.hmp_last_up_migration = cfs_rq_clock_task(cfs_rq); + se->avg.hmp_last_down_migration = 0; +} + +static inline void hmp_next_down_delay(struct sched_entity *se, int cpu) +{ + struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; + + se->avg.hmp_last_down_migration = cfs_rq_clock_task(cfs_rq); + se->avg.hmp_last_up_migration = 0; +} #endif /* CONFIG_SCHED_HMP */ /* @@ -3572,11 +3591,13 @@ unlock: #ifdef CONFIG_SCHED_HMP if (hmp_up_migration(prev_cpu, &p->se)) { new_cpu = hmp_select_faster_cpu(p, prev_cpu); + hmp_next_up_delay(&p->se, new_cpu); trace_sched_hmp_migrate(p, new_cpu, 0); return new_cpu; } if (hmp_down_migration(prev_cpu, &p->se)) { new_cpu = hmp_select_slower_cpu(p, prev_cpu); + hmp_next_down_delay(&p->se, new_cpu); trace_sched_hmp_migrate(p, new_cpu, 0); return new_cpu; } @@ -5859,6 +5880,8 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } static unsigned int hmp_up_migration(int cpu, struct sched_entity *se) { struct task_struct *p = task_of(se); + struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; + u64 now; if (hmp_cpu_is_fastest(cpu)) return 0; @@ -5869,6 +5892,12 @@ static unsigned int hmp_up_migration(int cpu, struct sched_entity *se) return 0; #endif + /* Let the task load settle before doing another up migration */ + now = cfs_rq_clock_task(cfs_rq); + if (((now - se->avg.hmp_last_up_migration) >> 10) + < hmp_next_up_threshold) + return 0; + if (cpumask_intersects(&hmp_faster_domain(cpu)->cpus, tsk_cpus_allowed(p)) && se->avg.load_avg_ratio > hmp_up_threshold) { @@ -5881,6 +5910,8 @@ static unsigned int hmp_up_migration(int cpu, struct sched_entity *se) static unsigned int hmp_down_migration(int cpu, struct sched_entity *se) { struct task_struct *p = task_of(se); + struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; + u64 now; if (hmp_cpu_is_slowest(cpu)) return 0; @@ -5891,6 +5922,12 @@ static unsigned int hmp_down_migration(int cpu, struct sched_entity *se) return 1; #endif + /* Let the task load settle before doing another down migration */ + now = cfs_rq_clock_task(cfs_rq); + if (((now - se->avg.hmp_last_down_migration) >> 10) + < 
hmp_next_down_threshold) + return 0; + if (cpumask_intersects(&hmp_slower_domain(cpu)->cpus, tsk_cpus_allowed(p)) && se->avg.load_avg_ratio < hmp_down_threshold) { @@ -6081,6 +6118,7 @@ static void hmp_force_up_migration(int this_cpu) target->migrate_task = p; force = 1; trace_sched_hmp_migrate(p, target->push_cpu, 1); + hmp_next_up_delay(&p->se, target->push_cpu); } } raw_spin_unlock_irqrestore(&target->lock, flags); -- cgit v1.2.3 From 1b8ae251638844173bd04a4c9e543581f3d92fbd Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 10 Oct 2012 14:51:25 +0100 Subject: sched: Enable HMP priority filter by default This updates the ARM Kconfig to enable the HMP priority filter by default. Signed-off-by: Morten Rasmussen --- arch/arm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index bc19fa61b90..975b69596e9 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1514,6 +1514,7 @@ config SCHED_HMP config SCHED_HMP_PRIO_FILTER bool "(EXPERIMENTAL) Filter HMP migrations by task priority" depends on SCHED_HMP + default y help Enables task priority based HMP migration filter. Any task with a NICE value above the threshold will always be on low-power cpus -- cgit v1.2.3 From 362036513b1dff299b2035d5b928a203742b98d7 Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Fri, 12 Oct 2012 13:45:35 +0100 Subject: ARM: sched: Avoid empty 'slow' HMP domain On homogeneous (non-heterogeneous) systems all CPUs will be declared 'fast' and the slow cpu list will be empty. In this situation we need to avoid adding an empty slow HMP domain otherwise the scheduler code will blow up when it attempts to move a task to the slow domain. Signed-off-by: Jon Medhurst --- arch/arm/kernel/topology.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 627e14e44af..0d406ff2e5b 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -368,10 +368,12 @@ void __init arch_get_hmp_domains(struct list_head *hmp_domains_list) * Must be ordered with respect to compute capacity. * Fastest domain at head of list. */ - domain = (struct hmp_domain *) - kmalloc(sizeof(struct hmp_domain), GFP_KERNEL); - cpumask_copy(&domain->cpus, &hmp_slow_cpu_mask); - list_add(&domain->hmp_domains, hmp_domains_list); + if(!cpumask_empty(&hmp_slow_cpu_mask)) { + domain = (struct hmp_domain *) + kmalloc(sizeof(struct hmp_domain), GFP_KERNEL); + cpumask_copy(&domain->cpus, &hmp_slow_cpu_mask); + list_add(&domain->hmp_domains, hmp_domains_list); + } domain = (struct hmp_domain *) kmalloc(sizeof(struct hmp_domain), GFP_KERNEL); cpumask_copy(&domain->cpus, &hmp_fast_cpu_mask); -- cgit v1.2.3 From eeebbf595c8dcd6392537c4d13b8cda78001f4e5 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 12 Oct 2012 15:25:02 +0100 Subject: sched: Only down migrate low priority tasks if allowed by affinity mask Adds an extra check intersection of the task affinity mask and the slower hmp_domain cpumask before down migrating low priority tasks. 
Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fe2408f25e2..474dc61b3af 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5918,8 +5918,11 @@ static unsigned int hmp_down_migration(int cpu, struct sched_entity *se) #ifdef CONFIG_SCHED_HMP_PRIO_FILTER /* Filter by task priority */ - if (p->prio >= hmp_up_prio) + if ((p->prio >= hmp_up_prio) && + cpumask_intersects(&hmp_slower_domain(cpu)->cpus, + tsk_cpus_allowed(p))) { return 1; + } #endif /* Let the task load settle before doing another down migration */ -- cgit v1.2.3 From a9f9bca843e44144670c660638274363f34b9847 Mon Sep 17 00:00:00 2001 From: Sudeep KarkadaNagesha Date: Mon, 24 Sep 2012 14:07:20 +0100 Subject: sched: fix arch_get_fast_and_slow_cpus to get logical cpumask correctly The patch "sched: Use device-tree to provide fast/slow CPU list for HMP" depends on the ordering of CPUs in the device tree. It fails to determine the logical mask correctly if the logical ordering of the CPUs differs from the physical ordering in the device tree. This patch fixes the logic by relying on the mpidr in the device tree and mapping that mpidr to the logical cpu. Signed-off-by: Sudeep KarkadaNagesha Signed-off-by: Liviu Dudau --- arch/arm/kernel/topology.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 0d406ff2e5b..f2ca9e03080 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -23,6 +23,7 @@ #include #include +#include #include /* @@ -310,7 +311,7 @@ void __init arch_get_fast_and_slow_cpus(struct cpumask *fast, struct cpumask *slow) { struct device_node *cn = NULL; - int cpu = 0; + int cpu; cpumask_clear(fast); cpumask_clear(slow); @@ -332,15 +333,26 @@ void __init arch_get_fast_and_slow_cpus(struct cpumask *fast, */ while ((cn = of_find_node_by_type(cn, "cpu"))) { - if (cpu >= num_possible_cpus()) + const u32 *mpidr; + int len; + + mpidr = of_get_property(cn, "reg", &len); + if (!mpidr || len != 4) { + pr_err("* %s missing reg property\n", cn->full_name); + continue; + } + + cpu = get_logical_index(be32_to_cpup(mpidr)); + if (cpu == -EINVAL) { + pr_err("couldn't get logical index for mpidr %x\n", + be32_to_cpup(mpidr)); break; + } if (is_little_cpu(cn)) cpumask_set_cpu(cpu, slow); else cpumask_set_cpu(cpu, fast); - - cpu++; } if (!cpumask_empty(fast) && !cpumask_empty(slow)) -- cgit v1.2.3 From d2c920023cbc456414f8e07ff253a89be535b41b Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Thu, 16 May 2013 17:48:01 +0100 Subject: sched: Do not ignore grouped tasks during HMP forced migration. If the entity is not a task, it is a cfs group rq. Iterate up to find the task entity. 
Change-Id: I7cab7aba0798f6f14e38ad32e566d90e5937ffbc Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 474dc61b3af..3866dcc9972 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6109,10 +6109,19 @@ static void hmp_force_up_migration(int this_cpu) target = cpu_rq(cpu); raw_spin_lock_irqsave(&target->lock, flags); curr = target->cfs.curr; - if (!curr || !entity_is_task(curr)) { + if (!curr) { raw_spin_unlock_irqrestore(&target->lock, flags); continue; } + if (!entity_is_task(curr)) { + struct cfs_rq *cfs_rq; + + cfs_rq = group_cfs_rq(curr); + while (cfs_rq) { + curr = cfs_rq->curr; + cfs_rq = group_cfs_rq(curr); + } + } p = task_of(curr); if (hmp_up_migration(cpu, curr)) { if (!target->active_balance) { -- cgit v1.2.3 From b64cc6f7e54b97536dbecc05d193b31b27feecf1 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Thu, 16 May 2013 17:48:24 +0100 Subject: sched: Ignore offline CPUs in HMP migration & load stats Previously, an offline CPU would always appear to have a zero load and this would distort the offload functionality used for balancing big and little domains. Maintain a mask of online CPUs in each domain and use this instead. Change-Id: I639b564b2f40cb659af8ceb8bd37f84b8a1fe323 Signed-off-by: Chris Redpath --- arch/arm/kernel/topology.c | 6 ++++-- include/linux/sched.h | 1 + kernel/sched/fair.c | 39 +++++++++++++++++++++++++++++++++++++-- 3 files changed, 42 insertions(+), 4 deletions(-) diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index f2ca9e03080..9047dd1c5a1 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -383,12 +383,14 @@ void __init arch_get_hmp_domains(struct list_head *hmp_domains_list) if(!cpumask_empty(&hmp_slow_cpu_mask)) { domain = (struct hmp_domain *) kmalloc(sizeof(struct hmp_domain), GFP_KERNEL); - cpumask_copy(&domain->cpus, &hmp_slow_cpu_mask); + cpumask_copy(&domain->possible_cpus, &hmp_slow_cpu_mask); + cpumask_and(&domain->cpus, cpu_online_mask, &domain->possible_cpus); list_add(&domain->hmp_domains, hmp_domains_list); } domain = (struct hmp_domain *) kmalloc(sizeof(struct hmp_domain), GFP_KERNEL); - cpumask_copy(&domain->cpus, &hmp_fast_cpu_mask); + cpumask_copy(&domain->possible_cpus, &hmp_fast_cpu_mask); + cpumask_and(&domain->cpus, cpu_online_mask, &domain->possible_cpus); list_add(&domain->hmp_domains, hmp_domains_list); } #endif /* CONFIG_SCHED_HMP */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 5e903596e48..0e2a546cdad 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -888,6 +888,7 @@ bool cpus_share_cache(int this_cpu, int that_cpu); #ifdef CONFIG_SCHED_HMP struct hmp_domain { struct cpumask cpus; + struct cpumask possible_cpus; struct list_head hmp_domains; }; #endif /* CONFIG_SCHED_HMP */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3866dcc9972..10e7dbbbf83 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3381,10 +3381,10 @@ static int __init hmp_cpu_mask_setup(void) dc = 0; list_for_each(pos, &hmp_domains) { domain = list_entry(pos, struct hmp_domain, hmp_domains); - cpulist_scnprintf(buf, 64, &domain->cpus); + cpulist_scnprintf(buf, 64, &domain->possible_cpus); pr_debug(" HMP domain %d: %s\n", dc, buf); - for_each_cpu_mask(cpu, domain->cpus) { + for_each_cpu_mask(cpu, domain->possible_cpus) { per_cpu(hmp_cpu_domain, cpu) = domain; } dc++; @@ -3393,6 +3393,35 @@ static int __init 
hmp_cpu_mask_setup(void) return 1; } +static struct hmp_domain *hmp_get_hmp_domain_for_cpu(int cpu) +{ + struct hmp_domain *domain; + struct list_head *pos; + + list_for_each(pos, &hmp_domains) { + domain = list_entry(pos, struct hmp_domain, hmp_domains); + if(cpumask_test_cpu(cpu, &domain->possible_cpus)) + return domain; + } + return NULL; +} + +static void hmp_online_cpu(int cpu) +{ + struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu); + + if(domain) + cpumask_set_cpu(cpu, &domain->cpus); +} + +static void hmp_offline_cpu(int cpu) +{ + struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu); + + if(domain) + cpumask_clear_cpu(cpu, &domain->cpus); +} + /* * Migration thresholds should be in the range [0..1023] * hmp_up_threshold: min. load required for migrating tasks to a faster cpu @@ -6190,11 +6219,17 @@ void trigger_load_balance(struct rq *rq, int cpu) static void rq_online_fair(struct rq *rq) { +#ifdef CONFIG_SCHED_HMP + hmp_online_cpu(rq->cpu); +#endif update_sysctl(); } static void rq_offline_fair(struct rq *rq) { +#ifdef CONFIG_SCHED_HMP + hmp_offline_cpu(rq->cpu); +#endif update_sysctl(); /* Ensure any throttled groups are reachable by pick_next_task */ -- cgit v1.2.3 From 0e48eed05c47aa2e00b772a519b36286e466621e Mon Sep 17 00:00:00 2001 From: Olivier Cozette Date: Wed, 17 Oct 2012 14:30:30 +0100 Subject: ARM: Change load tracking scale using sysfs These functions allow to change the load average period used in the task load average computation through /sys/kernel/hmp/load_avg_period_ms. This period is the time in ms to go from 0 to 0.5 load average while running or the time from 1 to 0.5 while sleeping. The default one used is 32 and gives the same load_avg_ratio computation than without this patch. These functions also allow to change the up and down threshold of HMP using /sys/kernel/hmp/{up,down}_threshold. Both must be between 0 and 1024. The thresholds are divided by 1024 before being compared to the load_avg_ratio. If /sys/kernel/hmp/load_avg_period_ms is 128 and /sys/kernel/hmp/up_threshold is 512, a task will be migrated to a bigger cluster after running for 128ms. Because after load_avg_period_ms the load average is 0.5 and real up_threshold us 512 / 1024 = 0.5. Signed-off-by: Olivier Cozette Signed-off-by: Chris Redpath --- arch/arm/Kconfig | 23 +++++++ kernel/sched/fair.c | 183 +++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 204 insertions(+), 2 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 975b69596e9..f0a5a0787ea 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1541,6 +1541,29 @@ config HMP_SLOW_CPU_MASK Specify the cpuids of the slow CPUs in the system as a list string, e.g. cpuid 0+1 should be specified as 0-1. +config HMP_VARIABLE_SCALE + bool "Allows changing the load tracking scale through sysfs" + depends on SCHED_HMP + help + When turned on, this option exports the thresholds and load average + period value for the load tracking patches through sysfs. + The values can be modified to change the rate of load accumulation + and the thresholds used for HMP migration. + The load_avg_period_ms is the time in ms to reach a load average of + 0.5 for an idle task of 0 load average ratio that start a busy loop. + The up_threshold and down_threshold is the value to go to a faster + CPU or to go back to a slower cpu. + The {up,down}_threshold are devided by 1024 before being compared + to the load average. 
+ For examples, with load_avg_period_ms = 128 and up_threshold = 512, + a running task with a load of 0 will be migrated to a bigger CPU after + 128ms, because after 128ms its load_avg_ratio is 0.5 and the real + up_threshold is 0.5. + This patch has the same behavior as changing the Y of the load + average computation to + (1002/1024)^(LOAD_AVG_PERIOD/load_avg_period_ms) + but it remove intermadiate overflows in computation. + config HAVE_ARM_SCU bool help diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 10e7dbbbf83..f86399bb8fe 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -31,6 +31,10 @@ #include #include +#ifdef CONFIG_HMP_VARIABLE_SCALE +#include +#include +#endif #include "sched.h" @@ -1201,8 +1205,10 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } -/* - * We can represent the historical contribution to runnable average as the +#ifdef CONFIG_HMP_VARIABLE_SCALE +static u64 hmp_variable_scale_convert(u64 delta); +#endif +/* We can represent the historical contribution to runnable average as the * coefficients of a geometric series. To do this we sub-divide our runnable * history into segments of approximately 1ms (1024us); label the segment that * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. @@ -1239,6 +1245,9 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int delta_w, decayed = 0; delta = now - sa->last_runnable_update; +#ifdef CONFIG_HMP_VARIABLE_SCALE + delta = hmp_variable_scale_convert(delta); +#endif /* * This should only happen when time goes backwards, which it * unfortunately does during sched clock init when we swap over to TSC. @@ -3517,6 +3526,176 @@ static inline void hmp_next_down_delay(struct sched_entity *se, int cpu) se->avg.hmp_last_down_migration = cfs_rq_clock_task(cfs_rq); se->avg.hmp_last_up_migration = 0; } + +#ifdef CONFIG_HMP_VARIABLE_SCALE +/* + * Heterogenous multiprocessor (HMP) optimizations + * + * These functions allow to change the growing speed of the load_avg_ratio + * by default it goes from 0 to 0.5 in LOAD_AVG_PERIOD = 32ms + * This can now be changed with /sys/kernel/hmp/load_avg_period_ms. + * + * These functions also allow to change the up and down threshold of HMP + * using /sys/kernel/hmp/{up,down}_threshold. + * Both must be between 0 and 1023. The threshold that is compared + * to the load_avg_ratio is up_threshold/1024 and down_threshold/1024. + * + * For instance, if load_avg_period = 64 and up_threshold = 512, an idle + * task with a load of 0 will reach the threshold after 64ms of busy loop. + * + * Changing load_avg_periods_ms has the same effect than changing the + * default scaling factor Y=1002/1024 in the load_avg_ratio computation to + * (1002/1024.0)^(LOAD_AVG_PERIOD/load_avg_period_ms), but the last one + * could trigger overflows. + * For instance, with Y = 1023/1024 in __update_task_entity_contrib() + * "contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);" + * could be overflowed for a weight > 2^12 even is the load_avg_contrib + * should still be a 32bits result. This would not happen by multiplicating + * delta time by 1/22 and setting load_avg_period_ms = 706. 
+ */ + +#define HMP_VARIABLE_SCALE_SHIFT 16ULL +struct hmp_global_attr { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, + struct attribute *attr, char *buf); + ssize_t (*store)(struct kobject *a, struct attribute *b, + const char *c, size_t count); + int *value; + int (*to_sysfs)(int); + int (*from_sysfs)(int); +}; + +#define HMP_DATA_SYSFS_MAX 3 + +struct hmp_data_struct { + int multiplier; /* used to scale the time delta */ + struct attribute_group attr_group; + struct attribute *attributes[HMP_DATA_SYSFS_MAX + 1]; + struct hmp_global_attr attr[HMP_DATA_SYSFS_MAX]; +} hmp_data; + +/* + * By scaling the delta time it end-up increasing or decrease the + * growing speed of the per entity load_avg_ratio + * The scale factor hmp_data.multiplier is a fixed point + * number: (32-HMP_VARIABLE_SCALE_SHIFT).HMP_VARIABLE_SCALE_SHIFT + */ +static u64 hmp_variable_scale_convert(u64 delta) +{ + u64 high = delta >> 32ULL; + u64 low = delta & 0xffffffffULL; + low *= hmp_data.multiplier; + high *= hmp_data.multiplier; + return (low >> HMP_VARIABLE_SCALE_SHIFT) + + (high << (32ULL - HMP_VARIABLE_SCALE_SHIFT)); +} + +static ssize_t hmp_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + ssize_t ret = 0; + struct hmp_global_attr *hmp_attr = + container_of(attr, struct hmp_global_attr, attr); + int temp = *(hmp_attr->value); + if (hmp_attr->to_sysfs != NULL) + temp = hmp_attr->to_sysfs(temp); + ret = sprintf(buf, "%d\n", temp); + return ret; +} + +static ssize_t hmp_store(struct kobject *a, struct attribute *attr, + const char *buf, size_t count) +{ + int temp; + ssize_t ret = count; + struct hmp_global_attr *hmp_attr = + container_of(attr, struct hmp_global_attr, attr); + char *str = vmalloc(count + 1); + if (str == NULL) + return -ENOMEM; + memcpy(str, buf, count); + str[count] = 0; + if (sscanf(str, "%d", &temp) < 1) + ret = -EINVAL; + else { + if (hmp_attr->from_sysfs != NULL) + temp = hmp_attr->from_sysfs(temp); + if (temp < 0) + ret = -EINVAL; + else + *(hmp_attr->value) = temp; + } + vfree(str); + return ret; +} + +static int hmp_period_tofrom_sysfs(int value) +{ + return (LOAD_AVG_PERIOD << HMP_VARIABLE_SCALE_SHIFT) / value; +} + +/* max value for threshold is 1024 */ +static int hmp_theshold_from_sysfs(int value) +{ + if (value > 1024) + return -1; + return value; +} + +static void hmp_attr_add( + const char *name, + int *value, + int (*to_sysfs)(int), + int (*from_sysfs)(int)) +{ + int i = 0; + while (hmp_data.attributes[i] != NULL) { + i++; + if (i >= HMP_DATA_SYSFS_MAX) + return; + } + hmp_data.attr[i].attr.mode = 0644; + hmp_data.attr[i].show = hmp_show; + hmp_data.attr[i].store = hmp_store; + hmp_data.attr[i].attr.name = name; + hmp_data.attr[i].value = value; + hmp_data.attr[i].to_sysfs = to_sysfs; + hmp_data.attr[i].from_sysfs = from_sysfs; + hmp_data.attributes[i] = &hmp_data.attr[i].attr; + hmp_data.attributes[i + 1] = NULL; +} + +static int hmp_attr_init(void) +{ + int ret; + memset(&hmp_data, sizeof(hmp_data), 0); + /* by default load_avg_period_ms == LOAD_AVG_PERIOD + * meaning no change + */ + hmp_data.multiplier = hmp_period_tofrom_sysfs(LOAD_AVG_PERIOD); + + hmp_attr_add("load_avg_period_ms", + &hmp_data.multiplier, + hmp_period_tofrom_sysfs, + hmp_period_tofrom_sysfs); + hmp_attr_add("up_threshold", + &hmp_up_threshold, + NULL, + hmp_theshold_from_sysfs); + hmp_attr_add("down_threshold", + &hmp_down_threshold, + NULL, + hmp_theshold_from_sysfs); + + hmp_data.attr_group.name = "hmp"; + hmp_data.attr_group.attrs = hmp_data.attributes; + ret = 
sysfs_create_group(kernel_kobj, + &hmp_data.attr_group); + return 0; +} +late_initcall(hmp_attr_init); +#endif /* CONFIG_HMP_VARIABLE_SCALE */ #endif /* CONFIG_SCHED_HMP */ /* -- cgit v1.2.3 From 71b5dbd6d527d5de8aaef7e1f8658df95caf28aa Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Fri, 16 Nov 2012 10:03:00 +0000 Subject: ARM: Experimental Frequency-Invariant Load Scaling Patch Evaluation Patch to investigate using load as a representation of the amount of POTENTIAL cpu compute capacity used rather than a representation of the CURRENT cpu compute capacity. If CPUFreq is enabled, scales load in accordance with frequency. Powersave/performance CPUFreq governors are detected and scaling is disabled while these governors are in use. This is because when a single-frequency governor is in use, potential CPU capacity is static. So long as the governors and CPUFreq subsystem correctly report the frequencies available, the scaling should self tune. Adds an additional file to sysfs to allow this feature to be disabled for experimentation. /sys/kernel/hmp/frequency_invariant_load_scale write 0 to disable, 1 to enable. Signed-off-by: Chris Redpath --- arch/arm/Kconfig | 15 +++ kernel/sched/fair.c | 320 +++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 305 insertions(+), 30 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index f0a5a0787ea..bc79c57d78c 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1564,6 +1564,21 @@ config HMP_VARIABLE_SCALE (1002/1024)^(LOAD_AVG_PERIOD/load_avg_period_ms) but it remove intermadiate overflows in computation. +config HMP_FREQUENCY_INVARIANT_SCALE + bool "(EXPERIMENTAL) Frequency-Invariant Tracked Load for HMP" + depends on HMP_VARIABLE_SCALE && CPU_FREQ + help + Scales the current load contribution in line with the frequency + of the CPU that the task was executed on. + In this version, we use a simple linear scale derived from the + maximum frequency reported by CPUFreq. + Restricting tracked load to be scaled by the CPU's frequency + represents the consumption of possible compute capacity + (rather than consumption of actual instantaneous capacity as + normal) and allows the HMP migration's simple threshold + migration strategy to interact more predictably with CPUFreq's + asynchronous compute capacity changes. 
+ config HAVE_ARM_SCU bool help diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f86399bb8fe..202bebe3dab 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -34,10 +34,17 @@ #ifdef CONFIG_HMP_VARIABLE_SCALE #include #include -#endif +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE +/* Include cpufreq header to add a notifier so that cpu frequency + * scaling can track the current CPU frequency + */ +#include +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ +#endif /* CONFIG_HMP_VARIABLE_SCALE */ #include "sched.h" + /* * Targeted preemption latency for CPU-bound tasks: * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) @@ -1206,8 +1213,93 @@ static u32 __compute_runnable_contrib(u64 n) } #ifdef CONFIG_HMP_VARIABLE_SCALE -static u64 hmp_variable_scale_convert(u64 delta); + +#define HMP_VARIABLE_SCALE_SHIFT 16ULL +struct hmp_global_attr { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, + struct attribute *attr, char *buf); + ssize_t (*store)(struct kobject *a, struct attribute *b, + const char *c, size_t count); + int *value; + int (*to_sysfs)(int); + int (*from_sysfs)(int); +}; + +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE +#define HMP_DATA_SYSFS_MAX 4 +#else +#define HMP_DATA_SYSFS_MAX 3 +#endif + +struct hmp_data_struct { +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + int freqinvar_load_scale_enabled; #endif + int multiplier; /* used to scale the time delta */ + struct attribute_group attr_group; + struct attribute *attributes[HMP_DATA_SYSFS_MAX + 1]; + struct hmp_global_attr attr[HMP_DATA_SYSFS_MAX]; +} hmp_data; + +static u64 hmp_variable_scale_convert(u64 delta); +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE +/* Frequency-Invariant Load Modification: + * Loads are calculated as in PJT's patch however we also scale the current + * contribution in line with the frequency of the CPU that the task was + * executed on. + * In this version, we use a simple linear scale derived from the maximum + * frequency reported by CPUFreq. As an example: + * + * Consider that we ran a task for 100% of the previous interval. + * + * Our CPU was under asynchronous frequency control through one of the + * CPUFreq governors. + * + * The CPUFreq governor reports that it is able to scale the CPU between + * 500MHz and 1GHz. + * + * During the period, the CPU was running at 1GHz. + * + * In this case, our load contribution for that period is calculated as + * 1 * (number_of_active_microseconds) + * + * This results in our task being able to accumulate maximum load as normal. + * + * + * Consider now that our CPU was executing at 500MHz. + * + * We now scale the load contribution such that it is calculated as + * 0.5 * (number_of_active_microseconds) + * + * Our task can only record 50% maximum load during this period. + * + * This represents the task consuming 50% of the CPU's *possible* compute + * capacity. However the task did consume 100% of the CPU's *available* + * compute capacity which is the value seen by the CPUFreq governor and + * user-side CPU Utilization tools. + * + * Restricting tracked load to be scaled by the CPU's frequency accurately + * represents the consumption of possible compute capacity and allows the + * HMP migration's simple threshold migration strategy to interact more + * predictably with CPUFreq's asynchronous compute capacity changes. + */ +#define SCHED_FREQSCALE_SHIFT 10 +struct cpufreq_extents { + u32 curr_scale; + u32 min; + u32 max; + u32 flags; +}; +/* Flag set when the governor in use only allows one frequency. 
+ * Disables scaling. + */ +#define SCHED_LOAD_FREQINVAR_SINGLEFREQ 0x01 + +static struct cpufreq_extents freq_scale[CONFIG_NR_CPUS]; +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ +#endif /* CONFIG_HMP_VARIABLE_SCALE */ + /* We can represent the historical contribution to runnable average as the * coefficients of a geometric series. To do this we sub-divide our runnable * history into segments of approximately 1ms (1024us); label the segment that @@ -1238,11 +1330,18 @@ static u64 hmp_variable_scale_convert(u64 delta); static __always_inline int __update_entity_runnable_avg(u64 now, struct sched_avg *sa, int runnable, - int running) + int running, + int cpu) { u64 delta, periods; u32 runnable_contrib; int delta_w, decayed = 0; +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + u64 scaled_delta; + u32 scaled_runnable_contrib; + int scaled_delta_w; + u32 curr_scale = 1024; +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ delta = now - sa->last_runnable_update; #ifdef CONFIG_HMP_VARIABLE_SCALE @@ -1266,6 +1365,12 @@ static __always_inline int __update_entity_runnable_avg(u64 now, return 0; sa->last_runnable_update = now; +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + /* retrieve scale factor for load */ + if (hmp_data.freqinvar_load_scale_enabled) + curr_scale = freq_scale[cpu].curr_scale; +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ + /* delta_w is the amount already accumulated against our next period */ delta_w = sa->runnable_avg_period % 1024; if (delta + delta_w >= 1024) { @@ -1278,10 +1383,20 @@ static __always_inline int __update_entity_runnable_avg(u64 now, * period and accrue it. */ delta_w = 1024 - delta_w; + /* scale runnable time if necessary */ +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + scaled_delta_w = (delta_w * curr_scale) + >> SCHED_FREQSCALE_SHIFT; + if (runnable) + sa->runnable_avg_sum += scaled_delta_w; + if (running) + sa->usage_avg_sum += scaled_delta_w; +#else if (runnable) sa->runnable_avg_sum += delta_w; if (running) sa->usage_avg_sum += delta_w; +#endif /* #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ sa->runnable_avg_period += delta_w; delta -= delta_w; @@ -1289,27 +1404,49 @@ static __always_inline int __update_entity_runnable_avg(u64 now, /* Figure out how many additional periods this update spans */ periods = delta / 1024; delta %= 1024; - + /* decay the load we have accumulated so far */ sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, periods + 1); sa->runnable_avg_period = decay_load(sa->runnable_avg_period, periods + 1); sa->usage_avg_sum = decay_load(sa->usage_avg_sum, periods + 1); - + /* add the contribution from this period */ /* Efficiently calculate \sum (1..n_period) 1024*y^i */ runnable_contrib = __compute_runnable_contrib(periods); + /* Apply load scaling if necessary. 
+ * Note that multiplying the whole series is same as + * multiplying all terms + */ +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + scaled_runnable_contrib = (runnable_contrib * curr_scale) + >> SCHED_FREQSCALE_SHIFT; + if (runnable) + sa->runnable_avg_sum += scaled_runnable_contrib; + if (running) + sa->usage_avg_sum += scaled_runnable_contrib; +#else if (runnable) sa->runnable_avg_sum += runnable_contrib; if (running) sa->usage_avg_sum += runnable_contrib; +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ sa->runnable_avg_period += runnable_contrib; } /* Remainder of delta accrued against u_0` */ + /* scale if necessary */ +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + scaled_delta = ((delta * curr_scale) >> SCHED_FREQSCALE_SHIFT); + if (runnable) + sa->runnable_avg_sum += scaled_delta; + if (running) + sa->usage_avg_sum += scaled_delta; +#else if (runnable) sa->runnable_avg_sum += delta; if (running) sa->usage_avg_sum += delta; +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ sa->runnable_avg_period += delta; return decayed; @@ -1488,7 +1625,7 @@ static inline void update_entity_load_avg(struct sched_entity *se, now = cfs_rq_clock_task(group_cfs_rq(se)); if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq, - cfs_rq->curr == se)) + cfs_rq->curr == se, se->cfs_rq->rq->cpu)) return; contrib_delta = __update_entity_load_avg_contrib(se); @@ -1534,7 +1671,7 @@ static inline void update_rq_runnable_avg(struct rq *rq, int runnable) { u32 contrib; __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable, - runnable); + runnable, rq->cpu); __update_tg_runnable_avg(&rq->avg, &rq->cfs); contrib = rq->avg.runnable_avg_sum * scale_load_down(1024); contrib /= (rq->avg.runnable_avg_period + 1); @@ -3554,27 +3691,6 @@ static inline void hmp_next_down_delay(struct sched_entity *se, int cpu) * delta time by 1/22 and setting load_avg_period_ms = 706. 
*/ -#define HMP_VARIABLE_SCALE_SHIFT 16ULL -struct hmp_global_attr { - struct attribute attr; - ssize_t (*show)(struct kobject *kobj, - struct attribute *attr, char *buf); - ssize_t (*store)(struct kobject *a, struct attribute *b, - const char *c, size_t count); - int *value; - int (*to_sysfs)(int); - int (*from_sysfs)(int); -}; - -#define HMP_DATA_SYSFS_MAX 3 - -struct hmp_data_struct { - int multiplier; /* used to scale the time delta */ - struct attribute_group attr_group; - struct attribute *attributes[HMP_DATA_SYSFS_MAX + 1]; - struct hmp_global_attr attr[HMP_DATA_SYSFS_MAX]; -} hmp_data; - /* * By scaling the delta time it end-up increasing or decrease the * growing speed of the per entity load_avg_ratio @@ -3642,7 +3758,15 @@ static int hmp_theshold_from_sysfs(int value) return -1; return value; } - +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE +/* freqinvar control is only 0,1 off/on */ +static int hmp_freqinvar_from_sysfs(int value) +{ + if (value < 0 || value > 1) + return -1; + return value; +} +#endif static void hmp_attr_add( const char *name, int *value, @@ -3687,7 +3811,14 @@ static int hmp_attr_init(void) &hmp_down_threshold, NULL, hmp_theshold_from_sysfs); - +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + /* default frequency-invariant scaling ON */ + hmp_data.freqinvar_load_scale_enabled = 1; + hmp_attr_add("frequency_invariant_load_scale", + &hmp_data.freqinvar_load_scale_enabled, + NULL, + hmp_freqinvar_from_sysfs); +#endif hmp_data.attr_group.name = "hmp"; hmp_data.attr_group.attrs = hmp_data.attributes; ret = sysfs_create_group(kernel_kobj, @@ -6878,3 +7009,132 @@ __init void init_sched_fair_class(void) #endif /* SMP */ } + +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE +static u32 cpufreq_calc_scale(u32 min, u32 max, u32 curr) +{ + u32 result = curr / max; + return result; +} + +/* Called when the CPU Frequency is changed. + * Once for each CPU. + */ +static int cpufreq_callback(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = data; + int cpu = freq->cpu; + struct cpufreq_extents *extents; + + if (freq->flags & CPUFREQ_CONST_LOOPS) + return NOTIFY_OK; + + if (val != CPUFREQ_POSTCHANGE) + return NOTIFY_OK; + + /* if dynamic load scale is disabled, set the load scale to 1.0 */ + if (!hmp_data.freqinvar_load_scale_enabled) { + freq_scale[cpu].curr_scale = 1024; + return NOTIFY_OK; + } + + extents = &freq_scale[cpu]; + if (extents->flags & SCHED_LOAD_FREQINVAR_SINGLEFREQ) { + /* If our governor was recognised as a single-freq governor, + * use 1.0 + */ + extents->curr_scale = 1024; + } else { + extents->curr_scale = cpufreq_calc_scale(extents->min, + extents->max, freq->new); + } + + return NOTIFY_OK; +} + +/* Called when the CPUFreq governor is changed. + * Only called for the CPUs which are actually changed by the + * userspace. + */ +static int cpufreq_policy_callback(struct notifier_block *nb, + unsigned long event, void *data) +{ + struct cpufreq_policy *policy = data; + struct cpufreq_extents *extents; + int cpu, singleFreq = 0; + static const char performance_governor[] = "performance"; + static const char powersave_governor[] = "powersave"; + + if (event == CPUFREQ_START) + return 0; + + if (event != CPUFREQ_INCOMPATIBLE) + return 0; + + /* CPUFreq governors do not accurately report the range of + * CPU Frequencies they will choose from. + * We recognise performance and powersave governors as + * single-frequency only. 
+ */ + if (!strncmp(policy->governor->name, performance_governor, + strlen(performance_governor)) || + !strncmp(policy->governor->name, powersave_governor, + strlen(powersave_governor))) + singleFreq = 1; + + /* Make sure that all CPUs impacted by this policy are + * updated since we will only get a notification when the + * user explicitly changes the policy on a CPU. + */ + for_each_cpu(cpu, policy->cpus) { + extents = &freq_scale[cpu]; + extents->max = policy->max >> SCHED_FREQSCALE_SHIFT; + extents->min = policy->min >> SCHED_FREQSCALE_SHIFT; + if (!hmp_data.freqinvar_load_scale_enabled) { + extents->curr_scale = 1024; + } else if (singleFreq) { + extents->flags |= SCHED_LOAD_FREQINVAR_SINGLEFREQ; + extents->curr_scale = 1024; + } else { + extents->flags &= ~SCHED_LOAD_FREQINVAR_SINGLEFREQ; + extents->curr_scale = cpufreq_calc_scale(extents->min, + extents->max, policy->cur); + } + } + + return 0; +} + +static struct notifier_block cpufreq_notifier = { + .notifier_call = cpufreq_callback, +}; +static struct notifier_block cpufreq_policy_notifier = { + .notifier_call = cpufreq_policy_callback, +}; + +static int __init register_sched_cpufreq_notifier(void) +{ + int ret = 0; + + /* init safe defaults since there are no policies at registration */ + for (ret = 0; ret < CONFIG_NR_CPUS; ret++) { + /* safe defaults */ + freq_scale[ret].max = 1024; + freq_scale[ret].min = 1024; + freq_scale[ret].curr_scale = 1024; + } + + pr_info("sched: registering cpufreq notifiers for scale-invariant loads\n"); + ret = cpufreq_register_notifier(&cpufreq_policy_notifier, + CPUFREQ_POLICY_NOTIFIER); + + if (ret != -EINVAL) + ret = cpufreq_register_notifier(&cpufreq_notifier, + CPUFREQ_TRANSITION_NOTIFIER); + + return ret; +} + +core_initcall(register_sched_cpufreq_notifier); +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ -- cgit v1.2.3 From ae570aeb1d40d531a498e53e2a815a52996f0749 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 20 Nov 2012 11:04:49 +0530 Subject: ARM: Fix build breakage when big.LITTLE.conf is not used. Change-Id: I8641f5e930c65b5672130bd4a18d9868bb3ca594 Signed-off-by: Chris Redpath Signed-off-by: Liviu Dudau --- kernel/sched/fair.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 202bebe3dab..3ad65bc696c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1614,7 +1614,11 @@ static inline void update_entity_load_avg(struct sched_entity *se, struct cfs_rq *cfs_rq = cfs_rq_of(se); long contrib_delta; u64 now; + int cpu = -1; /* not used in normal case */ +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + cpu = cfs_rq->rq->cpu; +#endif /* * For a group entity we need to use their owned cfs_rq_clock_task() in * case they are the parent of a throttled hierarchy. 
@@ -1625,7 +1629,7 @@ static inline void update_entity_load_avg(struct sched_entity *se, now = cfs_rq_clock_task(group_cfs_rq(se)); if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq, - cfs_rq->curr == se, se->cfs_rq->rq->cpu)) + cfs_rq->curr == se, cpu)) return; contrib_delta = __update_entity_load_avg_contrib(se); @@ -1670,8 +1674,13 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) static inline void update_rq_runnable_avg(struct rq *rq, int runnable) { u32 contrib; + int cpu = -1; /* not used in normal case */ + +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + cpu = rq->cpu; +#endif __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable, - runnable, rq->cpu); + runnable, cpu); __update_tg_runnable_avg(&rq->avg, &rq->cfs); contrib = rq->avg.runnable_avg_sum * scale_load_down(1024); contrib /= (rq->avg.runnable_avg_period + 1); -- cgit v1.2.3 From cf71912f481c7b6fc39e9b2021e8f9c058116c26 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 29 Nov 2012 15:41:50 +0000 Subject: sched: Basic global balancing support for HMP This patch introduces an extra-check at task up-migration to prevent overloading the cpus in the faster hmp_domain while the slower hmp_domain is not fully utilized. The patch also introduces a periodic balance check that can down-migrate tasks if the faster domain is oversubscribed and the slower is under-utilized. Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 97 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3ad65bc696c..e18fac530d3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3836,6 +3836,80 @@ static int hmp_attr_init(void) } late_initcall(hmp_attr_init); #endif /* CONFIG_HMP_VARIABLE_SCALE */ + +static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd, + int *min_cpu) +{ + int cpu; + int min_load = INT_MAX; + int min_cpu_temp = NR_CPUS; + + for_each_cpu_mask(cpu, hmpd->cpus) { + if (cpu_rq(cpu)->cfs.tg_load_contrib < min_load) { + min_load = cpu_rq(cpu)->cfs.tg_load_contrib; + min_cpu_temp = cpu; + } + } + + if (min_cpu) + *min_cpu = min_cpu_temp; + + return min_load; +} + +/* + * Calculate the task starvation + * This is the ratio of actually running time vs. runnable time. + * If the two are equal the task is getting the cpu time it needs or + * it is alone on the cpu and the cpu is fully utilized. + */ +static inline unsigned int hmp_task_starvation(struct sched_entity *se) +{ + u32 starvation; + + starvation = se->avg.usage_avg_sum * scale_load_down(NICE_0_LOAD); + starvation /= (se->avg.runnable_avg_sum + 1); + + return scale_load(starvation); +} + +static inline unsigned int hmp_offload_down(int cpu, struct sched_entity *se) +{ + int min_usage; + int dest_cpu = NR_CPUS; + + if (hmp_cpu_is_slowest(cpu)) + return NR_CPUS; + + /* Is the current domain fully loaded? */ + /* load < ~94% */ + min_usage = hmp_domain_min_load(hmp_cpu_domain(cpu), NULL); + if (min_usage < NICE_0_LOAD-64) + return NR_CPUS; + + /* Is the cpu oversubscribed? */ + /* load < ~194% */ + if (cpu_rq(cpu)->cfs.tg_load_contrib < 2*NICE_0_LOAD-64) + return NR_CPUS; + + /* Is the task alone on the cpu? */ + if (cpu_rq(cpu)->cfs.nr_running < 2) + return NR_CPUS; + + /* Is the task actually starving? */ + if (hmp_task_starvation(se) > 768) /* <25% waiting */ + return NR_CPUS; + + /* Does the slower domain have spare cycles? 
*/ + min_usage = hmp_domain_min_load(hmp_slower_domain(cpu), &dest_cpu); + /* load > 50% */ + if (min_usage > NICE_0_LOAD/2) + return NR_CPUS; + + if (cpumask_test_cpu(dest_cpu, &hmp_slower_domain(cpu)->cpus)) + return dest_cpu; + return NR_CPUS; +} #endif /* CONFIG_SCHED_HMP */ /* @@ -6246,10 +6320,14 @@ static unsigned int hmp_up_migration(int cpu, struct sched_entity *se) < hmp_next_up_threshold) return 0; - if (cpumask_intersects(&hmp_faster_domain(cpu)->cpus, - tsk_cpus_allowed(p)) - && se->avg.load_avg_ratio > hmp_up_threshold) { - return 1; + if (se->avg.load_avg_ratio > hmp_up_threshold) { + /* Target domain load < ~94% */ + if (hmp_domain_min_load(hmp_faster_domain(cpu), NULL) + > NICE_0_LOAD-64) + return 0; + if (cpumask_intersects(&hmp_faster_domain(cpu)->cpus, + tsk_cpus_allowed(p))) + return 1; } return 0; } @@ -6481,6 +6559,21 @@ static void hmp_force_up_migration(int this_cpu) hmp_next_up_delay(&p->se, target->push_cpu); } } + if (!force && !target->active_balance) { + /* + * For now we just check the currently running task. + * Selecting the lightest task for offloading will + * require extensive book keeping. + */ + target->push_cpu = hmp_offload_down(cpu, curr); + if (target->push_cpu < NR_CPUS) { + target->active_balance = 1; + target->migrate_task = p; + force = 1; + trace_sched_hmp_migrate(p, target->push_cpu, 2); + hmp_next_down_delay(&p->se, target->push_cpu); + } + } raw_spin_unlock_irqrestore(&target->lock, flags); if (force) stop_one_cpu_nowait(cpu_of(target), -- cgit v1.2.3 From 7e6446630039fcbabb9582ebefdcbc30de32c0e2 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Thu, 16 May 2013 17:48:41 +0100 Subject: sched: cfs.nr_running does not contain the intended metric rq->nr_running is the actual number of runnable tasks we wish to use to determine if a task is alone on a CPU. Change-Id: Icaf3022e02924ecdc94e14d4146c6fadd9580e2b Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e18fac530d3..90f61d848cb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3893,7 +3893,7 @@ static inline unsigned int hmp_offload_down(int cpu, struct sched_entity *se) return NR_CPUS; /* Is the task alone on the cpu? */ - if (cpu_rq(cpu)->cfs.nr_running < 2) + if (cpu_rq(cpu)->nr_running < 2) return NR_CPUS; /* Is the task actually starving? */ -- cgit v1.2.3 From 3231c8ce4b005f79ad27309907adc3de319e341c Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Mon, 17 Jun 2013 15:25:51 +0100 Subject: config: Disable priority filtering for HMP Scheduler Android uses threads with very low priority by default to implement AsyncTask APIs. This means that applications making use of these APIs to produce multithreaded code are penalised by not allowing use of big CPUs as necessary. 
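For illustration, a minimal stand-alone C sketch (not kernel code, and not part of this patch) of how the priority filter maps onto task priorities. The filter value of 5 and the nice value of 10 for Android background/AsyncTask worker threads are assumptions for the example; NICE_TO_PRIO() mirrors the kernel's usual MAX_RT_PRIO + nice + 20.

/*
 * Illustration only: how the HMP priority filter decides whether a task
 * may up-migrate. The filter value (5) and the Android background nice
 * value (10) are assumptions for this example.
 */
#include <stdio.h>

#define MAX_RT_PRIO		100
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)

int main(void)
{
	int filter_val = 5;				/* example filter setting */
	int hmp_up_prio = NICE_TO_PRIO(filter_val);	/* 125 */
	int async_task_prio = NICE_TO_PRIO(10);		/* assumed background nice */

	/* mirrors the "if (p->prio >= hmp_up_prio) return 0;" check */
	printf("hmp_up_prio=%d, background task prio=%d -> %s\n",
	       hmp_up_prio, async_task_prio,
	       async_task_prio >= hmp_up_prio ?
	       "filtered, never up-migrates" : "eligible for up-migration");
	return 0;
}

With these assumed values the background thread's priority (130) always exceeds the cut-off (125), which is why such tasks could never reach the big CPUs while the filter was enabled.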
Signed-off-by: Chris Redpath --- linaro/configs/big-LITTLE-MP.conf | 2 -- 1 file changed, 2 deletions(-) diff --git a/linaro/configs/big-LITTLE-MP.conf b/linaro/configs/big-LITTLE-MP.conf index 8cc2be049a4..0bbc603a13e 100644 --- a/linaro/configs/big-LITTLE-MP.conf +++ b/linaro/configs/big-LITTLE-MP.conf @@ -9,5 +9,3 @@ CONFIG_HMP_FAST_CPU_MASK="" CONFIG_HMP_SLOW_CPU_MASK="" CONFIG_HMP_VARIABLE_SCALE=y CONFIG_HMP_FREQUENCY_INVARIANT_SCALE=y -CONFIG_SCHED_HMP_PRIO_FILTER=y -CONFIG_SCHED_HMP_PRIO_FILTER_VAL=5 -- cgit v1.2.3 From 7d252cd22a3f6cb459e8b012912dfd258157f7df Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Wed, 26 Jun 2013 17:24:59 +0100 Subject: mm: make vmstat_update periodic run conditional vmstat_update runs every second from the work queue to update statistics and drain per cpu pages back into the global page allocator. This is useful in most circumstances but is wasteful if the CPU doesn't actually make any VM activity. This can happen in the situtation that the CPU is idle or running a CPU bound long term task (e.g. CPU isolation), in which case the periodic vmstate_update timer needlessly itnerrupts the CPU. This patch tries to make vmstat_update schedule itself for the next round only if there was any work for it to do in the previous run. The assumption is that if for a whole second we didn't see any VM activity it is reasnoable to assume that the CPU is not using the VM because it is idle or runs a long term single CPU bound task. A new single unbound system work queue item is scheduled periodically to monitor CPUs that have their vmstat_update work stopped and re-schedule them if VM activity is detected. Signed-off-by: Gilad Ben-Yossef CC: Thomas Gleixner CC: Tejun Heo CC: John Stultz CC: Andrew Morton CC: KOSAKI Motohiro CC: Mel Gorman CC: Mike Frysinger CC: David Rientjes CC: Hugh Dickins CC: Minchan Kim CC: Konstantin Khlebnikov CC: Christoph Lameter CC: Chris Metcalf CC: Hakan Akkan CC: Max Krasnyansky CC: Frederic Weisbecker CC: linux-kernel@vger.kernel.org CC: linux-mm@kvack.org --- include/linux/vmstat.h | 2 +- mm/vmstat.c | 95 ++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 82 insertions(+), 15 deletions(-) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index c586679b6fe..a30ab7910ff 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -198,7 +198,7 @@ extern void __inc_zone_state(struct zone *, enum zone_stat_item); extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); -void refresh_cpu_vm_stats(int); +bool refresh_cpu_vm_stats(int); void refresh_zone_stat_thresholds(void); void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); diff --git a/mm/vmstat.c b/mm/vmstat.c index f42745e6578..b916a43a6b3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -432,11 +433,12 @@ EXPORT_SYMBOL(dec_zone_page_state); * with the global counters. These could cause remote node cache line * bouncing and will have to be only done when necessary. 
*/ -void refresh_cpu_vm_stats(int cpu) +bool refresh_cpu_vm_stats(int cpu) { struct zone *zone; int i; int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; + bool vm_activity = false; for_each_populated_zone(zone) { struct per_cpu_pageset *p; @@ -483,14 +485,21 @@ void refresh_cpu_vm_stats(int cpu) if (p->expire) continue; - if (p->pcp.count) + if (p->pcp.count) { + vm_activity = true; drain_zone_pages(zone, &p->pcp); + } #endif } for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - if (global_diff[i]) + if (global_diff[i]) { atomic_long_add(global_diff[i], &vm_stat[i]); + vm_activity = true; + } + + return vm_activity; + } /* @@ -1174,22 +1183,72 @@ static const struct file_operations proc_vmstat_file_operations = { #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct delayed_work, vmstat_work); int sysctl_stat_interval __read_mostly = HZ; +static struct cpumask vmstat_off_cpus; +struct delayed_work vmstat_monitor_work; -static void vmstat_update(struct work_struct *w) +static inline bool need_vmstat(int cpu) { - refresh_cpu_vm_stats(smp_processor_id()); - schedule_delayed_work(&__get_cpu_var(vmstat_work), - round_jiffies_relative(sysctl_stat_interval)); + struct zone *zone; + int i; + + for_each_populated_zone(zone) { + struct per_cpu_pageset *p; + + p = per_cpu_ptr(zone->pageset, cpu); + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + if (p->vm_stat_diff[i]) + return true; + + if (zone_to_nid(zone) != numa_node_id() && p->pcp.count) + return true; + } + + return false; } -static void __cpuinit start_cpu_timer(int cpu) +static void vmstat_update(struct work_struct *w); + +static void start_cpu_timer(int cpu) { struct delayed_work *work = &per_cpu(vmstat_work, cpu); - INIT_DEFERRABLE_WORK(work, vmstat_update); + cpumask_clear_cpu(cpu, &vmstat_off_cpus); schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); } +static void __cpuinit setup_cpu_timer(int cpu) +{ + struct delayed_work *work = &per_cpu(vmstat_work, cpu); + + INIT_DEFERRABLE_WORK(work, vmstat_update); + start_cpu_timer(cpu); +} + +static void vmstat_update_monitor(struct work_struct *w) +{ + int cpu; + + for_each_cpu_and(cpu, &vmstat_off_cpus, cpu_online_mask) + if (need_vmstat(cpu)) + start_cpu_timer(cpu); + + queue_delayed_work(system_unbound_wq, &vmstat_monitor_work, + round_jiffies_relative(sysctl_stat_interval)); +} + + +static void vmstat_update(struct work_struct *w) +{ + int cpu = smp_processor_id(); + + if (likely(refresh_cpu_vm_stats(cpu))) + schedule_delayed_work(&__get_cpu_var(vmstat_work), + round_jiffies_relative(sysctl_stat_interval)); + else + cpumask_set_cpu(cpu, &vmstat_off_cpus); +} + /* * Use the cpu notifier to insure that the thresholds are recalculated * when necessary. 
@@ -1204,17 +1263,19 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, case CPU_ONLINE: case CPU_ONLINE_FROZEN: refresh_zone_stat_thresholds(); - start_cpu_timer(cpu); + setup_cpu_timer(cpu); node_set_state(cpu_to_node(cpu), N_CPU); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: - cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); - per_cpu(vmstat_work, cpu).work.func = NULL; + if (!cpumask_test_cpu(cpu, &vmstat_off_cpus)) { + cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); + per_cpu(vmstat_work, cpu).work.func = NULL; + } break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - start_cpu_timer(cpu); + setup_cpu_timer(cpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: @@ -1237,8 +1298,14 @@ static int __init setup_vmstat(void) register_cpu_notifier(&vmstat_notifier); + INIT_DEFERRABLE_WORK(&vmstat_monitor_work, + vmstat_update_monitor); + queue_delayed_work(system_unbound_wq, + &vmstat_monitor_work, + round_jiffies_relative(HZ)); + for_each_online_cpu(cpu) - start_cpu_timer(cpu); + setup_cpu_timer(cpu); #endif #ifdef CONFIG_PROC_FS proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); -- cgit v1.2.3 From 7362251d8a422dcba5c56408b92fc2b6ad03b10c Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Mon, 17 Jun 2013 15:22:58 +0100 Subject: Revert "sched: Enable HMP priority filter by default" This reverts commit 68315334e32932739145ddb41a46cc86b8b056b3. Having the priority filter enabled prevents proper operation on Android systems where a wider range of priorities are used by userspace to partition types of tasks. Those tasks should still be able to benefit from the use of big CPUs when required. Signed-off-by: Chris Redpath --- arch/arm/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index bc79c57d78c..2a5f5b8c385 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1514,7 +1514,6 @@ config SCHED_HMP config SCHED_HMP_PRIO_FILTER bool "(EXPERIMENTAL) Filter HMP migrations by task priority" depends on SCHED_HMP - default y help Enables task priority based HMP migration filter. Any task with a NICE value above the threshold will always be on low-power cpus -- cgit v1.2.3 From ede58a69a32b187899e6cccbbd299a04d3f50b71 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Mon, 17 Jun 2013 15:48:15 +0100 Subject: HMP: Use unweighted load for hmp migration decisions Normal task and runqueue loading is scaled according to priority to end up with a weighted load, known as the contribution. We want the CPU time to be allotted according to priority, but we also want to make big/little decisions based upon raw load. It is common, for example, for Android apps following the dev guide to end up with all their long-running or async action threads as low priority unless they override the AsyncThread constructor. All these threads are such low priority that they become invisible to the hmp_offload routine. Using unweighted load here allows us to maximise CPU usage in busy situations. 
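To see how much the weighting hides, a small stand-alone calculation (plain C, not kernel code). The weights follow the kernel's prio_to_weight[] table (nice 0 is 1024, nice 19 is 15), LOAD_AVG_MAX (47742) is the constant quoted elsewhere in this series, and 512 is the example threshold used earlier in the series.

/*
 * Stand-alone illustration (not kernel code): weighted vs. unweighted load
 * for a fully busy nice-19 task.
 */
#include <stdio.h>

#define LOAD_AVG_MAX	47742

int main(void)
{
	unsigned long long sum = LOAD_AVG_MAX;		/* runnable_avg_sum, 100% busy */
	unsigned long long period = LOAD_AVG_MAX;	/* runnable_avg_period */
	unsigned long long w_nice19 = 15;

	unsigned long long weighted = sum * w_nice19 / (period + 1);	/* ~14 */
	unsigned long long unweighted = sum * 1024 / (period + 1);	/* ~1023 */

	/*
	 * Against an up-migration threshold such as the example value of 512,
	 * the weighted contribution of a busy nice-19 task never triggers a
	 * migration, while its unweighted ratio clearly does.
	 */
	printf("weighted contrib = %llu, unweighted ratio = %llu\n",
	       weighted, unweighted);
	return 0;
}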
Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 90f61d848cb..a90a63807cf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3841,20 +3841,24 @@ static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd, int *min_cpu) { int cpu; - int min_load = INT_MAX; - int min_cpu_temp = NR_CPUS; + int min_cpu_runnable_temp = NR_CPUS; + unsigned long min_runnable_load = INT_MAX; + unsigned long contrib; for_each_cpu_mask(cpu, hmpd->cpus) { - if (cpu_rq(cpu)->cfs.tg_load_contrib < min_load) { - min_load = cpu_rq(cpu)->cfs.tg_load_contrib; - min_cpu_temp = cpu; + /* don't use the divisor in the loop, just at the end */ + contrib = cpu_rq(cpu)->avg.runnable_avg_sum * scale_load_down(1024); + if (contrib < min_runnable_load) { + min_runnable_load = contrib; + min_cpu_runnable_temp = cpu; } } if (min_cpu) - *min_cpu = min_cpu_temp; + *min_cpu = min_cpu_runnable_temp; - return min_load; + /* domain will often have at least one empty CPU */ + return min_runnable_load ? min_runnable_load / (LOAD_AVG_MAX + 1) : 0; } /* @@ -3882,22 +3886,18 @@ static inline unsigned int hmp_offload_down(int cpu, struct sched_entity *se) return NR_CPUS; /* Is the current domain fully loaded? */ - /* load < ~94% */ + /* load < ~50% */ min_usage = hmp_domain_min_load(hmp_cpu_domain(cpu), NULL); - if (min_usage < NICE_0_LOAD-64) - return NR_CPUS; - - /* Is the cpu oversubscribed? */ - /* load < ~194% */ - if (cpu_rq(cpu)->cfs.tg_load_contrib < 2*NICE_0_LOAD-64) + if (min_usage < (NICE_0_LOAD>>1)) return NR_CPUS; /* Is the task alone on the cpu? */ - if (cpu_rq(cpu)->nr_running < 2) + if (cpu_rq(cpu)->cfs.nr_running < 2) return NR_CPUS; /* Is the task actually starving? */ - if (hmp_task_starvation(se) > 768) /* <25% waiting */ + /* >=25% ratio running/runnable = starving */ + if (hmp_task_starvation(se) > 768) return NR_CPUS; /* Does the slower domain have spare cycles? */ @@ -3908,6 +3908,7 @@ static inline unsigned int hmp_offload_down(int cpu, struct sched_entity *se) if (cpumask_test_cpu(dest_cpu, &hmp_slower_domain(cpu)->cpus)) return dest_cpu; + return NR_CPUS; } #endif /* CONFIG_SCHED_HMP */ -- cgit v1.2.3 From 08d7db89a214a138516419a85e17272b09180abd Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Thu, 9 May 2013 16:21:15 +0100 Subject: HMP: Select least-loaded CPU when performing HMP Migrations The reference patch set always selects the first CPU in an HMP domain as a migration target. In busy situations, this means that the migrated thread cannot make immediate use of an idle CPU but must share a busy one until the load balancer runs across the big domain. This patch uses the hmp_domain_min_load function introduced in global balancing to figure out which of the CPUs is the least busy and selects that as a migration target - in both directions. This essentially implements a task-spread strategy and is intended to maximise performance of migrated threads but is likely to use more power than the packing strategy previously employed. 
Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a90a63807cf..d7303c5fd1e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3598,6 +3598,8 @@ unsigned int hmp_next_down_threshold = 4096; static unsigned int hmp_up_migration(int cpu, struct sched_entity *se); static unsigned int hmp_down_migration(int cpu, struct sched_entity *se); +static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd, + int *min_cpu); /* Check if cpu is in fastest hmp_domain */ static inline unsigned int hmp_cpu_is_fastest(int cpu) @@ -3642,7 +3644,16 @@ static inline struct hmp_domain *hmp_faster_domain(int cpu) static inline unsigned int hmp_select_faster_cpu(struct task_struct *tsk, int cpu) { - return cpumask_any_and(&hmp_faster_domain(cpu)->cpus, + int lowest_cpu=NR_CPUS; + __always_unused int lowest_ratio = hmp_domain_min_load(hmp_faster_domain(cpu), &lowest_cpu); + /* + * If the lowest-loaded CPU in the domain is allowed by the task affinity + * select that one, otherwise select one which is allowed + */ + if(lowest_cpu != NR_CPUS && cpumask_test_cpu(lowest_cpu,tsk_cpus_allowed(tsk))) + return lowest_cpu; + else + return cpumask_any_and(&hmp_faster_domain(cpu)->cpus, tsk_cpus_allowed(tsk)); } @@ -3653,7 +3664,16 @@ static inline unsigned int hmp_select_faster_cpu(struct task_struct *tsk, static inline unsigned int hmp_select_slower_cpu(struct task_struct *tsk, int cpu) { - return cpumask_any_and(&hmp_slower_domain(cpu)->cpus, + int lowest_cpu=NR_CPUS; + __always_unused int lowest_ratio = hmp_domain_min_load(hmp_slower_domain(cpu), &lowest_cpu); + /* + * If the lowest-loaded CPU in the domain is allowed by the task affinity + * select that one, otherwise select one which is allowed + */ + if(lowest_cpu != NR_CPUS && cpumask_test_cpu(lowest_cpu,tsk_cpus_allowed(tsk))) + return lowest_cpu; + else + return cpumask_any_and(&hmp_slower_domain(cpu)->cpus, tsk_cpus_allowed(tsk)); } -- cgit v1.2.3 From 3f3b210703f80fe60dbfa13c25b30d4effbf9f4b Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Thu, 9 May 2013 16:21:29 +0100 Subject: HMP: Avoid multiple calls to hmp_domain_min_load in fast path When evaluating a migration we make two calls to hmp_domain_min_load. This is unnecessary if we pass on the target CPU information from the hmp_up_migration path. In hmp_down_migration, we don't consider the load of the target CPUS. 
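A simplified stand-alone sketch of the pattern being applied, using plain arrays instead of the kernel's hmp_domain and cpumask structures: a single pass yields both the minimum load and the CPU it was found on, so the caller never repeats the scan.

/*
 * Simplified illustration of the refactor (not kernel code): one pass
 * returns both the minimum load and the CPU it was found on.
 */
#include <stdio.h>

static unsigned int domain_min_load(const unsigned int *load, int ncpus,
				    int *min_cpu)
{
	unsigned int min_load = ~0u;
	int i, best = ncpus;	/* ncpus doubles as "no CPU found" */

	for (i = 0; i < ncpus; i++) {
		if (load[i] < min_load) {
			min_load = load[i];
			best = i;
		}
	}
	if (min_cpu)
		*min_cpu = best;
	return min_load;
}

int main(void)
{
	unsigned int fast_domain_load[] = { 300, 120, 900, 640 };
	int target_cpu;
	unsigned int min = domain_min_load(fast_domain_load, 4, &target_cpu);

	/* the caller uses both results without calling the helper again */
	printf("least-loaded fast CPU is %d (load %u)\n", target_cpu, min);
	return 0;
}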
Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d7303c5fd1e..a38a3edb07d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3596,7 +3596,7 @@ unsigned int hmp_up_prio = NICE_TO_PRIO(CONFIG_SCHED_HMP_PRIO_FILTER_VAL); unsigned int hmp_next_up_threshold = 4096; unsigned int hmp_next_down_threshold = 4096; -static unsigned int hmp_up_migration(int cpu, struct sched_entity *se); +static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se); static unsigned int hmp_down_migration(int cpu, struct sched_entity *se); static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd, int *min_cpu); @@ -4032,8 +4032,7 @@ unlock: rcu_read_unlock(); #ifdef CONFIG_SCHED_HMP - if (hmp_up_migration(prev_cpu, &p->se)) { - new_cpu = hmp_select_faster_cpu(p, prev_cpu); + if (hmp_up_migration(prev_cpu, &new_cpu, &p->se)) { hmp_next_up_delay(&p->se, new_cpu); trace_sched_hmp_migrate(p, new_cpu, 0); return new_cpu; @@ -6320,12 +6319,15 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } #ifdef CONFIG_SCHED_HMP /* Check if task should migrate to a faster cpu */ -static unsigned int hmp_up_migration(int cpu, struct sched_entity *se) +static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se) { struct task_struct *p = task_of(se); struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; u64 now; + if (target_cpu) + *target_cpu = NR_CPUS; + if (hmp_cpu_is_fastest(cpu)) return 0; @@ -6334,6 +6336,8 @@ static unsigned int hmp_up_migration(int cpu, struct sched_entity *se) if (p->prio >= hmp_up_prio) return 0; #endif + if (se->avg.load_avg_ratio < hmp_up_threshold) + return 0; /* Let the task load settle before doing another up migration */ now = cfs_rq_clock_task(cfs_rq); @@ -6341,15 +6345,15 @@ static unsigned int hmp_up_migration(int cpu, struct sched_entity *se) < hmp_next_up_threshold) return 0; - if (se->avg.load_avg_ratio > hmp_up_threshold) { - /* Target domain load < ~94% */ - if (hmp_domain_min_load(hmp_faster_domain(cpu), NULL) - > NICE_0_LOAD-64) - return 0; - if (cpumask_intersects(&hmp_faster_domain(cpu)->cpus, - tsk_cpus_allowed(p))) - return 1; - } + /* Target domain load < 94% */ + if (hmp_domain_min_load(hmp_faster_domain(cpu), target_cpu) + > NICE_0_LOAD-64) + return 0; + + if (cpumask_intersects(&hmp_faster_domain(cpu)->cpus, + tsk_cpus_allowed(p))) + return 1; + return 0; } @@ -6542,7 +6546,7 @@ static DEFINE_SPINLOCK(hmp_force_migration); */ static void hmp_force_up_migration(int this_cpu) { - int cpu; + int cpu, target_cpu; struct sched_entity *curr; struct rq *target; unsigned long flags; @@ -6570,10 +6574,10 @@ static void hmp_force_up_migration(int this_cpu) } } p = task_of(curr); - if (hmp_up_migration(cpu, curr)) { + if (hmp_up_migration(cpu, &target_cpu, curr)) { if (!target->active_balance) { target->active_balance = 1; - target->push_cpu = hmp_select_faster_cpu(p, cpu); + target->push_cpu = target_cpu; target->migrate_task = p; force = 1; trace_sched_hmp_migrate(p, target->push_cpu, 1); -- cgit v1.2.3 From 954978dd2cff81cc15745b9e581a1709e238f8ef Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Mon, 17 Jun 2013 16:08:40 +0100 Subject: HMP: Force new non-kernel tasks onto big CPUs until load stabilises Initialise the load stats for new tasks so that they do not see the instability in early task life which makes it so hard to decide which CPU is 
appropriate. Also, change the fork balance algorithm so that the least loaded of the CPUs in the big cluster is chosen regardless of the bigness of the parent task. This is intended to help performance for applications which use many short-lived tasks. Although best practise is usually to use a thread pool, apps which do not do this should not be subject to the randomness of the early stats. We should ignore real-time threads for forking on big CPUs, but it is not possible to figure out if a new thread is real-time or not at the fork stage. Instead, we prevent kernel threads from getting the initial boost - when they later become real-time they will only be on big if their compute requirements demand it. Signed-off-by: Dietmar Eggemann Signed-off-by: Chris Redpath --- kernel/sched/core.c | 14 ++++++++++++-- kernel/sched/fair.c | 22 ++++++++++++++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e45295dc33a..4c53da3781e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1618,8 +1618,18 @@ static void __sched_fork(struct task_struct *p) p->se.avg.runnable_avg_period = 0; p->se.avg.runnable_avg_sum = 0; #ifdef CONFIG_SCHED_HMP - p->se.avg.hmp_last_up_migration = 0; - p->se.avg.hmp_last_down_migration = 0; + /* keep LOAD_AVG_MAX in sync with fair.c if load avg series is changed */ +#define LOAD_AVG_MAX 47742 + if (p->mm) { + p->se.avg.hmp_last_up_migration = 0; + p->se.avg.hmp_last_down_migration = 0; + p->se.avg.load_avg_ratio = 1023; + p->se.avg.load_avg_contrib = + (1023 * scale_load_down(p->se.load.weight)); + p->se.avg.runnable_avg_period = LOAD_AVG_MAX; + p->se.avg.runnable_avg_sum = LOAD_AVG_MAX; + p->se.avg.usage_avg_sum = LOAD_AVG_MAX; + } #endif #endif #ifdef CONFIG_SCHEDSTATS diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a38a3edb07d..1b784eea661 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3957,6 +3957,28 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) if (p->nr_cpus_allowed == 1) return prev_cpu; +#ifdef CONFIG_SCHED_HMP + /* always put non-kernel forking tasks on a big domain */ + if (p->mm && (sd_flag & SD_BALANCE_FORK)) { + if(hmp_cpu_is_fastest(prev_cpu)) { + struct hmp_domain *hmpdom = list_entry(&hmp_cpu_domain(prev_cpu)->hmp_domains, struct hmp_domain, hmp_domains); + __always_unused int lowest_ratio = hmp_domain_min_load(hmpdom, &new_cpu); + if(new_cpu != NR_CPUS && cpumask_test_cpu(new_cpu,tsk_cpus_allowed(p))) + return new_cpu; + else { + new_cpu = cpumask_any_and(&hmp_faster_domain(cpu)->cpus, + tsk_cpus_allowed(p)); + if(new_cpu < nr_cpu_ids) + return new_cpu; + } + } else { + new_cpu = hmp_select_faster_cpu(p, prev_cpu); + if (new_cpu != NR_CPUS) + return new_cpu; + } + } +#endif + if (sd_flag & SD_BALANCE_WAKE) { if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) want_affine = 1; -- cgit v1.2.3 From 6eada0087366d8aec6bc38348a68f721f538cc5c Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Mon, 17 Jun 2013 16:20:37 +0100 Subject: sched: Restrict nohz balance kicks to stay in the HMP domain There is little point in doing a nohz balance kick on a CPU from a different HMP domain, since the unset SD_LOAD_BALANCE flag on the CPU domain level prevents tasks from being balanced across clusters except through the per-task load driven hmp_migrate/hmp_offload paths. 
Further, the nohz balance kick is actively harmful to power usage if all the tasks fit into the little domain, since it causes the big domain to wake up and do a lot of calculation only to determine that there is nothing to do. A more generic solution would be to walk the sched domain tree and determine the intersection of potential idle balance cpus with visibility of tasks on the current CPU; however, HMP domains are more easily accessible. Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1b784eea661..c849d68a9b7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6028,7 +6028,11 @@ static struct { static inline int find_new_ilb(int call_cpu) { int ilb = cpumask_first(nohz.idle_cpus_mask); - +#ifdef CONFIG_SCHED_HMP + /* restrict nohz balancing to occur in the same hmp domain */ + ilb = cpumask_first_and(nohz.idle_cpus_mask, + &((struct hmp_domain *)hmp_cpu_domain(call_cpu))->cpus); +#endif if (ilb < nr_cpu_ids && idle_cpu(ilb)) return ilb; @@ -6307,6 +6311,18 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) if (time_before(now, nohz.next_balance)) return 0; +#ifdef CONFIG_SCHED_HMP + /* + * Bail out if there are no nohz CPUs in our + * HMP domain, since we will move tasks between + * domains through wakeup and force balancing + * as necessary based upon task load. + */ + if (cpumask_first_and(nohz.idle_cpus_mask, + &((struct hmp_domain *)hmp_cpu_domain(cpu))->cpus) >= nr_cpu_ids) + return 0; +#endif + if (rq->nr_running >= 2) goto need_kick; -- cgit v1.2.3 From 4ab2679351e9566a6b0822f2d841a902758ba066 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 21 Jun 2013 17:50:08 +0100 Subject: HMP: experimental: Force all rt tasks to start on little domain. This patch restricts the allowed cpu mask of rt tasks that start with a full cpu mask to the little domain. A task is marked as real time in __setscheduler(), which is eventually called for all rt tasks (kernel and user land). In this function we restrict the allowed cpu mask to the little domain. This also prevents an rt task from later being pushed to the big domain, because find_lowest_rq() only considers a task's allowed cpu mask when choosing the cpu the task will run on. Current kludges of the patch: * Since we do not have an API to get the cpu mask of the A7 cluster, hmp_slow_cpu_mask is made global in arm/kernel/topology.c for now. * The watchdog_enable() function calls sched_setscheduler() before kthread_bind() for the cpu-specific watchdog kernel threads. The order of these two calls has to be changed to make this patch work.
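The affinity restriction described above can be approximated from user space when experimenting with the policy on a running system. The sketch below assumes CPUs 0-3 form the little (A7) cluster, which is purely an assumption for the example; the kernel patch takes the real mask from the topology code. It only narrows the calling task's affinity in the same "full mask" case the patch checks for, and leaves the actual switch to an rt policy to the caller.

/*
 * User-space analogue of confining a soon-to-be rt task to the little
 * cluster: narrow the affinity mask only if the task still has the
 * default run-anywhere mask. CPUs 0-3 are assumed to be the slow cluster.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

#define ASSUMED_LITTLE_CPUS 4

int main(void)
{
	cpu_set_t cur, little;
	long nr_online = sysconf(_SC_NPROCESSORS_ONLN);
	int cpu;

	if (sched_getaffinity(0, sizeof(cur), &cur)) {
		perror("sched_getaffinity");
		return 1;
	}

	CPU_ZERO(&little);
	for (cpu = 0; cpu < ASSUMED_LITTLE_CPUS && cpu < nr_online; cpu++)
		CPU_SET(cpu, &little);

	/* mirror the kernel check: only touch tasks left with a full mask */
	if (CPU_COUNT(&cur) == nr_online) {
		if (sched_setaffinity(0, sizeof(little), &little)) {
			perror("sched_setaffinity");
			return 1;
		}
		printf("affinity narrowed to CPUs 0-%d\n", ASSUMED_LITTLE_CPUS - 1);
	} else {
		printf("task already has a restricted mask, leaving it alone\n");
	}

	/* switching to SCHED_FIFO/SCHED_RR would follow here (needs privilege) */
	return 0;
}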
Signed-off-by: Dietmar Eggemann --- arch/arm/kernel/topology.c | 3 ++- kernel/sched/core.c | 9 ++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 9047dd1c5a1..4459c0b4e91 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -367,10 +367,11 @@ void __init arch_get_fast_and_slow_cpus(struct cpumask *fast, cpumask_clear(slow); } +struct cpumask hmp_slow_cpu_mask; + void __init arch_get_hmp_domains(struct list_head *hmp_domains_list) { struct cpumask hmp_fast_cpu_mask; - struct cpumask hmp_slow_cpu_mask; struct hmp_domain *domain; arch_get_fast_and_slow_cpus(&hmp_fast_cpu_mask, &hmp_slow_cpu_mask); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4c53da3781e..50d9e9849ce 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3827,6 +3827,8 @@ static struct task_struct *find_process_by_pid(pid_t pid) return pid ? find_task_by_vpid(pid) : current; } +extern struct cpumask hmp_slow_cpu_mask; + /* Actually do priority change: must hold rq lock. */ static void __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) @@ -3836,8 +3838,13 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) p->normal_prio = normal_prio(p); /* we are holding p->pi_lock already */ p->prio = rt_mutex_getprio(p); - if (rt_prio(p->prio)) + if (rt_prio(p->prio)) { p->sched_class = &rt_sched_class; +#ifdef CONFIG_SCHED_HMP + if (cpumask_equal(&p->cpus_allowed, cpu_all_mask)) + do_set_cpus_allowed(p, &hmp_slow_cpu_mask); +#endif + } else p->sched_class = &fair_sched_class; set_load_weight(p); -- cgit v1.2.3
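The hmp_up_migration() rework earlier in this series hands back a target CPU along with the yes/no decision, and that test boils down to a handful of comparisons. The sketch below models the decision; the up threshold is an assumed placeholder because the fragment does not show the default hmp_up_threshold, the per-CPU loads are invented, and the domain-wide affinity check is collapsed onto the single chosen CPU.

/*
 * Model of the reworked hmp_up_migration(): refuse early if the task is
 * already on a big CPU, has not settled, or is not loaded enough; then
 * pick the least-loaded big CPU and accept only if that cluster has
 * headroom (load below NICE_0_LOAD - 64, i.e. roughly 94%).
 */
#include <stdio.h>

#define NR_CPUS      8
#define NICE_0_LOAD  1024
#define UP_THRESHOLD 700	/* assumed value, not taken from the patches */

struct fake_task {
	int load_avg_ratio;		/* tracked load, 0..1023 */
	unsigned long allowed_mask;	/* affinity bitmask */
	int on_fastest;			/* already on a big CPU? */
	int settled;			/* past the next-up-migration delay? */
};

/* hypothetical big-cluster loads; CPUs 4-7 are assumed to be big */
static const int big_load[NR_CPUS] = { 0, 0, 0, 0, 980, 400, 990, 1010 };

static int up_migration(const struct fake_task *t, int *target_cpu)
{
	int cpu, min_cpu = 4;

	*target_cpu = NR_CPUS;			/* "no target", as in the patch */

	if (t->on_fastest || !t->settled)
		return 0;
	if (t->load_avg_ratio < UP_THRESHOLD)	/* task itself not busy enough */
		return 0;

	for (cpu = 5; cpu < NR_CPUS; cpu++)
		if (big_load[cpu] < big_load[min_cpu])
			min_cpu = cpu;

	if (big_load[min_cpu] > NICE_0_LOAD - 64)	/* big cluster too busy */
		return 0;
	if (!(t->allowed_mask & (1UL << min_cpu)))	/* affinity forbids it */
		return 0;

	*target_cpu = min_cpu;
	return 1;
}

int main(void)
{
	struct fake_task t = { .load_avg_ratio = 900, .allowed_mask = 0xff,
			       .on_fastest = 0, .settled = 1 };
	int target;

	if (up_migration(&t, &target))
		printf("migrate up, target CPU %d\n", target);
	else
		printf("stay in the little cluster\n");
	return 0;
}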