From 9a0758156e5f7f2f609617eb342e476378ef63f2 Mon Sep 17 00:00:00 2001
From: Chris Redpath
Date: Fri, 22 Nov 2013 13:19:18 +0000
Subject: sched: hmp: Fix build breakage when not using CONFIG_SCHED_HMP

hmp_variable_scale_convert was used without guards in
__update_entity_runnable_avg. Guard it.

Signed-off-by: Chris Redpath
Signed-off-by: Mark Brown
Signed-off-by: Jon Medhurst
---
 kernel/sched/fair.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c7d808ee0a3..8a4a02740f0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1210,6 +1210,7 @@ static u32 __compute_runnable_contrib(u64 n)
 	return contrib + runnable_avg_yN_sum[n];
 }
 
+#ifdef CONFIG_SCHED_HMP
 #define HMP_VARIABLE_SCALE_SHIFT 16ULL
 struct hmp_global_attr {
 	struct attribute attr;
@@ -1291,6 +1292,7 @@ struct cpufreq_extents {
 static struct cpufreq_extents freq_scale[CONFIG_NR_CPUS];
 
 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+#endif /* CONFIG_SCHED_HMP */
 
 /* We can represent the historical contribution to runnable average as the
  * coefficients of a geometric series. To do this we sub-divide our runnable
@@ -1336,8 +1338,9 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
 
 	delta = now - sa->last_runnable_update;
-
+#ifdef CONFIG_SCHED_HMP
 	delta = hmp_variable_scale_convert(delta);
+#endif
 	/*
 	 * This should only happen when time goes backwards, which it
 	 * unfortunately does during sched clock init when we swap over to TSC.
-- cgit v1.2.3

From 42f95a9ca82e0931ba134c9ec180ab7ae8d90dcc Mon Sep 17 00:00:00 2001
From: Kamalesh Babulal
Date: Tue, 25 Jun 2013 13:33:36 +0530
Subject: sched/debug: Add load-tracking statistics to task

At present we print per-entity load-tracking statistics for cfs_rq of
cgroups/runqueues. Given that per-task statistics are maintained, they can be
used to know the contribution made by the task to its parenting cfs_rq level.

This patch adds per-task load-tracking statistics to /proc/<pid>/sched.

Signed-off-by: Kamalesh Babulal
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/20130625080336.GA20175@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar
(cherry picked from commit 939fd731eb88a0cdd9058d0b0143563172a217d7)
Signed-off-by: Jon Medhurst
---
 kernel/sched/debug.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index fbd8caa83ef..6b7b86cfaba 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -569,6 +569,12 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 		"nr_involuntary_switches", (long long)p->nivcsw);
 
 	P(se.load.weight);
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+	P(se.avg.runnable_avg_sum);
+	P(se.avg.runnable_avg_period);
+	P(se.avg.load_avg_contrib);
+	P(se.avg.decay_count);
+#endif
 	P(policy);
 	P(prio);
 #undef PN
-- cgit v1.2.3

From edecdef95ad50910270915aa0cdcf2f624301cac Mon Sep 17 00:00:00 2001
From: Jon Medhurst
Date: Tue, 21 Jan 2014 09:48:55 +0000
Subject: config: Add config fragments for big LITTLE MP

This patch adds config fragments used to enable most of the features used by
big LITTLE MP.

This patch is the result of merging the following commits from the Linaro
Stable Kernel (LSK) 3.10 tree...
Commit 313c69010ed52a7bd52095708049e16c380c8e15
Commit 83952e566d1c41b2e0a70321e9d56cb8162f9a6d
Commit c4a782d985da03016e2b15aaef90bfc5cfe426f6
Commit 2732b0467a356e7cc08bf1e0f33481330335657a
Commit 7c88e2bd50fc093c96c912090b712b682ae72641
Commit 3231c8ce4b005f79ad27309907adc3de319e341c

Signed-off-by: Jon Medhurst
---
 linaro/configs/big-LITTLE-MP.conf | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 linaro/configs/big-LITTLE-MP.conf

diff --git a/linaro/configs/big-LITTLE-MP.conf b/linaro/configs/big-LITTLE-MP.conf
new file mode 100644
index 00000000000..0bbc603a13e
--- /dev/null
+++ b/linaro/configs/big-LITTLE-MP.conf
@@ -0,0 +1,11 @@
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_FAIR_GROUP_SCHED=y
+CONFIG_NO_HZ=y
+CONFIG_SCHED_MC=y
+CONFIG_DISABLE_CPU_SCHED_DOMAIN_BALANCE=y
+CONFIG_SCHED_HMP=y
+CONFIG_HMP_FAST_CPU_MASK=""
+CONFIG_HMP_SLOW_CPU_MASK=""
+CONFIG_HMP_VARIABLE_SCALE=y
+CONFIG_HMP_FREQUENCY_INVARIANT_SCALE=y
-- cgit v1.2.3

From 6bfbe7d9f891dc145ed395cf2b8c23b2dd9c74e8 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Fri, 25 May 2012 16:59:47 +0200
Subject: genirq: Add default affinity mask command line option

If we isolate CPUs, then we don't want random device interrupts on them. Even
w/o the user space irq balancer enabled we can end up with irqs on non boot
cpus.

Allow to restrict the default irq affinity mask.

Signed-off-by: Thomas Gleixner
Signed-off-by: Jon Medhurst
---
 Documentation/kernel-parameters.txt |  9 +++++++++
 kernel/irq/irqdesc.c                | 21 +++++++++++++++++++--
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 2fe6e767b3d..a2a5f4111a3 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1240,6 +1240,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			See comment before ip2_setup() in
 			drivers/char/ip2/ip2base.c.
 
+	irqaffinity=	[SMP] Set the default irq affinity mask
+			Format:
+			<cpu number>,...,<cpu number>
+			or
+			<cpu number>-<cpu number>
+			(must be a positive range in ascending order)
+			or a mixture
+			<cpu number>,...,<cpu number>-<cpu number>
+
 	irqfixup	[HW]
 			When an interrupt is not handled search all handlers
 			for it. Intended to get systems with badly broken
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 192a302d6cf..473b2b6eccb 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -23,10 +23,27 @@ static struct lock_class_key irq_desc_lock_class;
 
 #if defined(CONFIG_SMP)
+static int __init irq_affinity_setup(char *str)
+{
+	zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
+	cpulist_parse(str, irq_default_affinity);
+	/*
+	 * Set at least the boot cpu.
We don't want to end up with + * bugreports caused by random comandline masks + */ + cpumask_set_cpu(smp_processor_id(), irq_default_affinity); + return 1; +} +__setup("irqaffinity=", irq_affinity_setup); + static void __init init_irq_default_affinity(void) { - alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); - cpumask_setall(irq_default_affinity); +#ifdef CONFIG_CPUMASK_OFFSTACK + if (!irq_default_affinity) + zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); +#endif + if (cpumask_empty(irq_default_affinity)) + cpumask_setall(irq_default_affinity); } #else static void __init init_irq_default_affinity(void) -- cgit v1.2.3 From f720a920e88f1ec79db8c9f0031f61c610e40b02 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Thu, 5 Dec 2013 15:49:32 +0000 Subject: sched: reset blocked load decay_count during synchronization If an entity happens to sleep for less than one tick duration the tracked load associated with that entity can be decayed by an unexpectedly large amount if it is later migrated to a different CPU. This can interfere with correct scheduling when entity load is used for decision making. The reason for this is that when an entity is dequeued and enqueued quickly, such that se.avg.decay_count and cfs_rq.decay_counter do not differ when that entity is enqueued again, __synchronize_entity_decay skips the calculation step and also skips clearing the decay_count. At a later time that entity may be migrated and its load will be decayed incorrectly. All users of this function expect decay_count to be zero'ed after use. Signed-off-by: Chris Redpath Signed-off-by: Jon Medhurst --- kernel/sched/fair.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8a4a02740f0..1d6394b9616 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1453,12 +1453,9 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se) u64 decays = atomic64_read(&cfs_rq->decay_counter); decays -= se->avg.decay_count; - if (!decays) - return 0; - - se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); + if (decays) + se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); se->avg.decay_count = 0; - return decays; } -- cgit v1.2.3 From 7896b1e659db571556436b99ebb2e475e54a24f5 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Thu, 5 Dec 2013 17:57:46 +0000 Subject: sched: update runqueue clock before migrations away If we migrate a sleeping task away from a CPU which has the tick stopped, then both the clock_task and decay_counter will be out of date for that CPU and we will not decay load correctly regardless of how often we update the blocked load. This is only an issue for tasks which are not on a runqueue (because otherwise that CPU would be awake) and simultaneously the CPU the task previously ran on has had the tick stopped. Signed-off-by: Chris Redpath Signed-off-by: Jon Medhurst --- kernel/sched/fair.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1d6394b9616..383cd134705 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4346,6 +4346,16 @@ unlock: * load-balance). 
*/ #ifdef CONFIG_FAIR_GROUP_SCHED + +#ifdef CONFIG_NO_HZ_COMMON +static int nohz_test_cpu(int cpu); +#else +static inline int nohz_test_cpu(int cpu) +{ + return 0; +} +#endif + /* * Called immediately before a task is migrated to a new cpu; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the @@ -4365,6 +4375,25 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu) * be negative here since on-rq tasks have decay-count == 0. */ if (se->avg.decay_count) { + /* + * If we migrate a sleeping task away from a CPU + * which has the tick stopped, then both the clock_task + * and decay_counter will be out of date for that CPU + * and we will not decay load correctly. + */ + if (!se->on_rq && nohz_test_cpu(task_cpu(p))) { + struct rq *rq = cpu_rq(task_cpu(p)); + unsigned long flags; + /* + * Current CPU cannot be holding rq->lock in this + * circumstance, but another might be. We must hold + * rq->lock before we go poking around in its clocks + */ + raw_spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); + update_cfs_rq_blocked_load(cfs_rq, 0); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } se->avg.decay_count = -__synchronize_entity_decay(se); atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); } @@ -6297,6 +6326,18 @@ static struct { unsigned long next_balance; /* in jiffy units */ } nohz ____cacheline_aligned; +/* + * nohz_test_cpu used when load tracking is enabled. FAIR_GROUP_SCHED + * dependency below may be removed when load tracking guards are + * removed. + */ +#ifdef CONFIG_FAIR_GROUP_SCHED +static int nohz_test_cpu(int cpu) +{ + return cpumask_test_cpu(cpu, nohz.idle_cpus_mask); +} +#endif + #ifdef CONFIG_SCHED_HMP_LITTLE_PACKING /* * Decide if the tasks on the busy CPUs in the -- cgit v1.2.3 From 257e5075a1433513bb354f202adcd2dea8a8dc08 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Thu, 9 Jan 2014 10:38:54 +0000 Subject: sched: hmp: Make idle balance behaviour normal when packing disabled The presence of packing permanently changed the idle balance behaviour. Do not restrict idle balance on the smallest CPUs when packing is present but disabled. Signed-off-by: Chris Redpath Signed-off-by: Jon Medhurst --- kernel/sched/fair.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 383cd134705..28debe3047b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6350,6 +6350,10 @@ static int hmp_packing_ilb_needed(int cpu) if (!hmp_cpu_is_slowest(cpu)) return 1; + /* if disabled, use normal ILB behaviour */ + if (!hmp_packing_enabled) + return 1; + hmp = hmp_cpu_domain(cpu); for_each_cpu_and(cpu, &hmp->cpus, nohz.idle_cpus_mask) { /* only idle balance if a CPU is loaded over threshold */ -- cgit v1.2.3 From ba8ed8301f5bca4a44c80e2173c66391b76898df Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Thu, 9 Jan 2014 10:40:30 +0000 Subject: sched: hmp: Change TC2 packing config to disabled default if present Since TC2 power curves don't really have a utilisation hotspot where packing makes sense, if it is present for a TC2 system at least make it default to disabled. 
Signed-off-by: Chris Redpath Signed-off-by: Jon Medhurst --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 28debe3047b..d89f4a8ddf4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3676,12 +3676,13 @@ unsigned int hmp_next_up_threshold = 4096; unsigned int hmp_next_down_threshold = 4096; #ifdef CONFIG_SCHED_HMP_LITTLE_PACKING -unsigned int hmp_packing_enabled = 1; #ifndef CONFIG_ARCH_VEXPRESS_TC2 +unsigned int hmp_packing_enabled = 1; unsigned int hmp_full_threshold = (NICE_0_LOAD * 9) / 8; #else /* TC2 has a sharp consumption curve @ around 800Mhz, so we aim to spread the load around that frequency. */ +unsigned int hmp_packing_enabled; unsigned int hmp_full_threshold = 650; /* 80% of the 800Mhz freq * NICE_0_LOAD */ #endif #endif -- cgit v1.2.3 From 5e0791511a938eaf28d9071b411ffa71a79ef8ed Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Thu, 9 Jan 2014 10:41:13 +0000 Subject: config: Make packing present on TC2 The scheduler will default packing to disabled, but this includes the feature so that we can test it more easily. Signed-off-by: Chris Redpath Signed-off-by: Jon Medhurst --- linaro/configs/big-LITTLE-MP.conf | 1 + 1 file changed, 1 insertion(+) diff --git a/linaro/configs/big-LITTLE-MP.conf b/linaro/configs/big-LITTLE-MP.conf index 0bbc603a13e..ced3cf974f1 100644 --- a/linaro/configs/big-LITTLE-MP.conf +++ b/linaro/configs/big-LITTLE-MP.conf @@ -9,3 +9,4 @@ CONFIG_HMP_FAST_CPU_MASK="" CONFIG_HMP_SLOW_CPU_MASK="" CONFIG_HMP_VARIABLE_SCALE=y CONFIG_HMP_FREQUENCY_INVARIANT_SCALE=y +CONFIG_SCHED_HMP_LITTLE_PACKING=y -- cgit v1.2.3 From b2fafaba35f490947b78e8d0d4f4264a137e64cd Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Fri, 10 Jan 2014 10:34:08 +0000 Subject: sched: hmp: Fix potential task_struct memory leak We use get_task_struct to increment the ref count on a task_struct so that even if the task dies with a pending migration we are still able to read the memory without causing a fault. In the case of non-running tasks, we forgot to decrement the ref count when we are done with the task. Signed-off-by: Chris Redpath Signed-off-by: Jon Medhurst --- kernel/sched/fair.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d89f4a8ddf4..43857fec77b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7027,13 +7027,13 @@ static void hmp_migrate_runnable_task(struct rq *rq) * with the source rq. */ if (src_rq->active_balance) - return; + goto out; if (src_rq->nr_running <= 1) - return; + goto out; if (task_rq(p) != src_rq) - return; + goto out; /* * Not sure if this applies here but one can never * be too cautious @@ -7068,6 +7068,8 @@ static void hmp_migrate_runnable_task(struct rq *rq) rcu_read_unlock(); double_unlock_balance(src_rq, dst_rq); +out: + put_task_struct(p); } static DEFINE_SPINLOCK(hmp_force_migration); -- cgit v1.2.3 From 1d462599bee9a2f5f3988aafa43feda602d5e188 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 16 Jan 2014 19:44:10 +0000 Subject: HMP: Restrict irq_default_affinity to hmp_slow_cpu_mask This patch limits the default affinity mask for all irqs to the cluster of the little cpus. 
This patch has the positive side effect that an irq thread which has its IRQTF_RUNTHREAD set inside irq_thread() -> irq_wait_for_interrupt() will not overwrite its struct task_struct->cpus_allowed with a full cpu mask of desc->irq_data.affinity in irq_thread_check_affinity() essentially reverting patch "HMP: experimental: Force all rt tasks to start on little domain." for this irq thread. Signed-off-by: Dietmar Eggemann Signed-off-by: Jon Medhurst --- kernel/irq/irqdesc.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 473b2b6eccb..20ecfb0984e 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -36,11 +36,19 @@ static int __init irq_affinity_setup(char *str) } __setup("irqaffinity=", irq_affinity_setup); +extern struct cpumask hmp_slow_cpu_mask; + static void __init init_irq_default_affinity(void) { #ifdef CONFIG_CPUMASK_OFFSTACK if (!irq_default_affinity) zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); +#endif +#ifdef CONFIG_SCHED_HMP + if (!cpumask_empty(&hmp_slow_cpu_mask)) { + cpumask_copy(irq_default_affinity, &hmp_slow_cpu_mask); + return; + } #endif if (cpumask_empty(irq_default_affinity)) cpumask_setall(irq_default_affinity); -- cgit v1.2.3 From b30814c74c184bbb231e24d6c857699af338468b Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 16 Jan 2014 11:53:14 +0000 Subject: HMP: Fix rt task allowed cpu mask restriction code on 1x1 system There is an error scenario where on a 1x1 HMP system (weight of the hmp_slow_cpu_mask is 1) the short-cut of restricting the allowed cpu mask of an rt tasks leads to triggering a kernel bug in the rt sched class set_cpus_allowed function set_cpus_allowed_rt(). In case the task is on the run-queue and the weight of the required cpu mask is 1 and this is different to the p->nr_cpus_allowed value, this back-end function interprets this in such a way that a task changed from being migratable to not migratable anymore and decrements the rt_nr_migratory counter. There is a BUG_ON(!rq->rt.rt_nr_migratory) check in this code path which triggers in this situation. To circumvent this issue, set the number of allowed cpus for a task p to the weight of the hmp_slow_cpu_mask before calling do_set_cpus_allowed() in __setscheduler(). It will be set to this value in do_set_cpus_allowed() after the call to the sched class related backend function any way. By doing this, set_cpus_allowed_rt() returns without trying to update the rt_nr_migratory counter. This patch has been tested with a test device driver requiring a threaded irq handler on a TC2 system with a reduced cpu mask (1 Cortex A15, 1 Cortex A7). Signed-off-by: Dietmar Eggemann Signed-off-by: Jon Medhurst --- kernel/sched/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fb9b7b74a83..3e326f9208f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3842,8 +3842,11 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) p->sched_class = &rt_sched_class; #ifdef CONFIG_SCHED_HMP if (!cpumask_empty(&hmp_slow_cpu_mask)) - if (cpumask_equal(&p->cpus_allowed, cpu_all_mask)) + if (cpumask_equal(&p->cpus_allowed, cpu_all_mask)) { + p->nr_cpus_allowed = + cpumask_weight(&hmp_slow_cpu_mask); do_set_cpus_allowed(p, &hmp_slow_cpu_mask); + } #endif } else -- cgit v1.2.3
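The affinity-related patches above (the irqaffinity= command line option, the irq_default_affinity restriction, and the rt-task allowed-mask fix) all aim to keep default irq affinity and rt tasks on the little cluster described by hmp_slow_cpu_mask. Below is a minimal user-space sketch for spot-checking that behaviour on a running target. It is not part of the patch series; it assumes only the standard procfs files it reads and the sched_setscheduler() syscall, and it relies on the behaviour described in the commit messages above rather than on any HMP-specific interface.

/*
 * Not part of the patch series: a minimal sketch to spot-check that the
 * default irq affinity and an rt task's allowed CPUs end up confined to
 * the little cluster on a kernel built with the patches above.
 */
#include <stdio.h>
#include <string.h>
#include <sched.h>

static void dump(const char *path, const char *key)
{
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	/* Print every line, or only those starting with 'key' if one is given. */
	while (fgets(line, sizeof(line), f))
		if (!key || !strncmp(line, key, strlen(key)))
			printf("%s: %s", path, line);
	fclose(f);
}

int main(void)
{
	struct sched_param sp = { .sched_priority = 1 };

	/* Hex mask used for newly allocated irq descriptors (kernel/irq/irqdesc.c). */
	dump("/proc/irq/default_smp_affinity", NULL);

	/*
	 * Switching to an rt policy should, with the __setscheduler() change
	 * applied and a full initial cpu mask, leave this task confined to
	 * hmp_slow_cpu_mask. Needs root; on failure the current mask is still
	 * printed for comparison.
	 */
	if (sched_setscheduler(0, SCHED_FIFO, &sp))
		perror("sched_setscheduler");
	dump("/proc/self/status", "Cpus_allowed");
	return 0;
}

Built with the target toolchain and run as root, the two printed masks can be compared against the little-cluster CPUs of the platform (for example the TC2 Cortex-A7 cores) and against any CONFIG_HMP_SLOW_CPU_MASK value set in big-LITTLE-MP.conf.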