author		Andrey Konovalov <andrey.konovalov@linaro.org>	2012-07-16 20:16:55 +0400
committer	Andrey Konovalov <andrey.konovalov@linaro.org>	2012-07-16 20:16:55 +0400
commit		c98c5ee0ecac183b83b12ea12c760640cf50cfee (patch)
tree		54f8ced8e93546d98c27051225d2d0909b477317
parent		ae5c2527853f263b639620fb80b1189ce6b15657 (diff)
parent		dc56b26b3ea05693552b85253db6bfdd62c8f763 (diff)
Merge branch 'tracking-big-LITTLE-MP-v3' into merge-linux-linaro-core-tracking (tags: tracking-llct-ll-20120716.1, llct-20120716.1)
-rw-r--r--arch/arm/Kconfig29
-rw-r--r--arch/arm/kernel/topology.c209
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/sched.c55
-rw-r--r--drivers/cpuidle/Kconfig3
-rw-r--r--drivers/cpuidle/Makefile1
-rw-r--r--drivers/cpuidle/coupled.c715
-rw-r--r--drivers/cpuidle/cpuidle.c68
-rw-r--r--drivers/cpuidle/cpuidle.h32
-rw-r--r--drivers/infiniband/hw/ehca/ehca_irq.c253
-rw-r--r--drivers/infiniband/hw/ehca/ehca_irq.h6
-rw-r--r--include/linux/cpuidle.h11
-rw-r--r--include/linux/kthread.h11
-rw-r--r--include/linux/sched.h19
-rw-r--r--include/linux/smpboot.h43
-rw-r--r--include/trace/events/sched.h151
-rw-r--r--kernel/cpu.c10
-rw-r--r--kernel/kthread.c185
-rw-r--r--kernel/rcutree.c12
-rw-r--r--kernel/rcutree.h15
-rw-r--r--kernel/rcutree_plugin.h403
-rw-r--r--kernel/rcutree_trace.c3
-rw-r--r--kernel/sched/core.c5
-rw-r--r--kernel/sched/debug.c39
-rw-r--r--kernel/sched/fair.c1105
-rw-r--r--kernel/sched/features.h2
-rw-r--r--kernel/sched/sched.h60
-rw-r--r--kernel/smpboot.c229
-rw-r--r--kernel/smpboot.h4
-rw-r--r--kernel/softirq.c107
-rw-r--r--kernel/watchdog.c263
-rw-r--r--linaro/configs/big-LITTLE-MP.conf9
32 files changed, 2973 insertions, 1086 deletions
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 1d117e639fd4..99d2f1f3e83a 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1542,6 +1542,35 @@ config SCHED_SMT
MultiThreading at a cost of slightly increased overhead in some
places. If unsure say N here.
+config DISABLE_CPU_SCHED_DOMAIN_BALANCE
+ bool "(EXPERIMENTAL) Disable CPU level scheduler load-balancing"
+ help
+ Disables scheduler load-balancing at CPU sched domain level.
+
+config SCHED_HMP
+ bool "(EXPERIMENTAL) Heterogenous multiprocessor scheduling"
+ depends on DISABLE_CPU_SCHED_DOMAIN_BALANCE && SCHED_MC && FAIR_GROUP_SCHED && !SCHED_AUTOGROUP
+ help
+ Experimental scheduler optimizations for heterogeneous platforms.
+ Attempts to introspectively select task affinity to optimize power
+ and performance. Currently supports two types of CPUs: fast
+ (high-performance) and slow (power-efficient). There is currently
+ no support for migration of task groups, hence !SCHED_AUTOGROUP.
+
+config HMP_FAST_CPU_MASK
+ string "HMP scheduler fast CPU mask"
+ depends on SCHED_HMP
+ help
+ Specifies the cpuids of the fast CPUs in the system as a list
+ string, e.g. cpuids 0 and 1 should be specified as 0-1.
+
+config HMP_SLOW_CPU_MASK
+ string "HMP scheduler slow CPU mask"
+ depends on SCHED_HMP
+ help
+ Specifies the cpuids of the slow CPUs in the system as a list
+ string, e.g. cpuids 0 and 1 should be specified as 0-1.
+
config HAVE_ARM_SCU
bool
help
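
The two mask options above are plain cpulist strings. As an illustration only (not code from this patch), such a string can be turned into a struct cpumask with the kernel's cpulist_parse() helper; the init function and mask variables below are hypothetical names and assume both options are set:

#include <linux/cpumask.h>
#include <linux/init.h>

/* Hypothetical consumer of the Kconfig strings above: parse the
 * "0-1"-style cpulists into masks for the fast and slow clusters. */
static struct cpumask hmp_fast_cpu_mask;
static struct cpumask hmp_slow_cpu_mask;

static int __init hmp_cpu_masks_setup(void)
{
	int ret;

	ret = cpulist_parse(CONFIG_HMP_FAST_CPU_MASK, &hmp_fast_cpu_mask);
	if (ret)
		return ret;

	return cpulist_parse(CONFIG_HMP_SLOW_CPU_MASK, &hmp_slow_cpu_mask);
}
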
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 8200deaa14f6..f59193cd9aaa 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -17,11 +17,160 @@
#include <linux/percpu.h>
#include <linux/node.h>
#include <linux/nodemask.h>
+#include <linux/of.h>
#include <linux/sched.h>
+#include <linux/slab.h>
#include <asm/cputype.h>
#include <asm/topology.h>
+/*
+ * cpu power scale management
+ */
+
+/*
+ * cpu power table
+ * This per cpu data structure describes the relative capacity of each core.
+ * On a heterogeneous system, cores don't have the same computation capacity
+ * and we reflect that difference in the cpu_power field so the scheduler can
+ * take this difference into account during load balance. A per cpu structure
+ * is preferred because each CPU updates its own cpu_power field during the
+ * load balance except for idle cores. One idle core is selected to run the
+ * rebalance_domains for all idle cores and the cpu_power can be updated
+ * during this sequence.
+ */
+static DEFINE_PER_CPU(unsigned long, cpu_scale);
+
+unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+ return per_cpu(cpu_scale, cpu);
+}
+
+static void set_power_scale(unsigned int cpu, unsigned long power)
+{
+ per_cpu(cpu_scale, cpu) = power;
+}
+
+#ifdef CONFIG_OF
+struct cpu_efficiency {
+ const char *compatible;
+ unsigned long efficiency;
+};
+
+/*
+ * Table of relative efficiency of each processor.
+ * The efficiency value must fit in 20 bits. The final
+ * cpu_scale value must be in the range
+ * 0 < cpu_scale < 2*SCHED_POWER_SCALE.
+ * Processors that are not defined in the table
+ * use the default SCHED_POWER_SCALE value for cpu_scale.
+ */
+struct cpu_efficiency table_efficiency[] = {
+ {"arm,cortex-a15", 3891},
+ {"arm,cortex-a7", 2048},
+ {NULL, },
+};
+
+struct cpu_capacity {
+ unsigned long hwid;
+ unsigned long capacity;
+};
+
+struct cpu_capacity *cpu_capacity;
+
+unsigned long middle_capacity = 1;
+
+static void __init parse_dt_topology(void)
+{
+ struct cpu_efficiency *cpu_eff;
+ struct device_node *cn = NULL;
+ unsigned long min_capacity = (unsigned long)(-1);
+ unsigned long max_capacity = 0;
+ unsigned long capacity = 0;
+ int alloc_size, cpu = 0;
+
+ alloc_size = nr_cpu_ids * sizeof(struct cpu_capacity);
+ cpu_capacity = (struct cpu_capacity *)kzalloc(alloc_size, GFP_NOWAIT);
+
+ while ((cn = of_find_node_by_type(cn, "cpu"))) {
+ const u32 *rate, *reg;
+ int len;
+
+ if (cpu >= num_possible_cpus())
+ break;
+
+ for (cpu_eff = table_efficiency; cpu_eff->compatible; cpu_eff++)
+ if (of_device_is_compatible(cn, cpu_eff->compatible))
+ break;
+
+ if (cpu_eff->compatible == NULL)
+ continue;
+
+ rate = of_get_property(cn, "clock-frequency", &len);
+ if (!rate || len != 4) {
+ pr_err("%s missing clock-frequency property\n",
+ cn->full_name);
+ continue;
+ }
+
+ reg = of_get_property(cn, "reg", &len);
+ if (!reg || len != 4) {
+ pr_err("%s missing reg property\n", cn->full_name);
+ continue;
+ }
+
+ capacity = ((be32_to_cpup(rate)) >> 20) * cpu_eff->efficiency;
+
+ /* Save min capacity of the system */
+ if (capacity < min_capacity)
+ min_capacity = capacity;
+
+ /* Save max capacity of the system */
+ if (capacity > max_capacity)
+ max_capacity = capacity;
+
+ cpu_capacity[cpu].capacity = capacity;
+ cpu_capacity[cpu++].hwid = be32_to_cpup(reg);
+ }
+
+ if (cpu < num_possible_cpus())
+ cpu_capacity[cpu].hwid = (unsigned long)(-1);
+
+ middle_capacity = (min_capacity + max_capacity) >> 11;
+}
+
+void update_cpu_power(unsigned int cpu, unsigned long hwid)
+{
+ unsigned int idx = 0;
+
+ /* look for the cpu's hwid in the cpu capacity table */
+ for (idx = 0; idx < num_possible_cpus(); idx++) {
+ if (cpu_capacity[idx].hwid == hwid)
+ break;
+
+ if (cpu_capacity[idx].hwid == -1)
+ return;
+ }
+
+ if (idx == num_possible_cpus())
+ return;
+
+ set_power_scale(cpu, cpu_capacity[idx].capacity / middle_capacity);
+
+ printk(KERN_INFO "CPU%u: update cpu_power %lu\n",
+ cpu, arch_scale_freq_power(NULL, cpu));
+}
+
+#else
+static inline void parse_dt_topology(void) {}
+static inline void update_cpu_power(unsigned int cpuid, unsigned int mpidr) {}
+#endif
+
+
+/*
+ * cpu topology management
+ */
+
#define MPIDR_SMP_BITMASK (0x3 << 30)
#define MPIDR_SMP_VALUE (0x2 << 30)
@@ -31,6 +180,7 @@
* These masks reflect the current use of the affinity levels.
* The affinity level can be up to 16 bits according to ARM ARM
*/
+#define MPIDR_HWID_BITMASK 0xFFFFFF
#define MPIDR_LEVEL0_MASK 0x3
#define MPIDR_LEVEL0_SHIFT 0
@@ -41,6 +191,9 @@
#define MPIDR_LEVEL2_MASK 0xFF
#define MPIDR_LEVEL2_SHIFT 16
+/*
+ * cpu topology table
+ */
struct cputopo_arm cpu_topology[NR_CPUS];
const struct cpumask *cpu_coregroup_mask(int cpu)
@@ -48,6 +201,32 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
return &cpu_topology[cpu].core_sibling;
}
+void update_siblings_masks(unsigned int cpuid)
+{
+ struct cputopo_arm *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
+ int cpu;
+
+ /* update core and thread sibling masks */
+ for_each_possible_cpu(cpu) {
+ cpu_topo = &cpu_topology[cpu];
+
+ if (cpuid_topo->socket_id != cpu_topo->socket_id)
+ continue;
+
+ cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
+ if (cpu != cpuid)
+ cpumask_set_cpu(cpu, &cpuid_topo->core_sibling);
+
+ if (cpuid_topo->core_id != cpu_topo->core_id)
+ continue;
+
+ cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling);
+ if (cpu != cpuid)
+ cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling);
+ }
+ smp_wmb();
+}
+
/*
* store_cpu_topology is called at boot when only one cpu is running
* and with the mutex cpu_hotplug.lock locked, when several cpus have booted,
@@ -57,7 +236,6 @@ void store_cpu_topology(unsigned int cpuid)
{
struct cputopo_arm *cpuid_topo = &cpu_topology[cpuid];
unsigned int mpidr;
- unsigned int cpu;
/* If the cpu topology has been already set, just return */
if (cpuid_topo->core_id != -1)
@@ -99,26 +277,9 @@ void store_cpu_topology(unsigned int cpuid)
cpuid_topo->socket_id = -1;
}
- /* update core and thread sibling masks */
- for_each_possible_cpu(cpu) {
- struct cputopo_arm *cpu_topo = &cpu_topology[cpu];
-
- if (cpuid_topo->socket_id == cpu_topo->socket_id) {
- cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
- if (cpu != cpuid)
- cpumask_set_cpu(cpu,
- &cpuid_topo->core_sibling);
-
- if (cpuid_topo->core_id == cpu_topo->core_id) {
- cpumask_set_cpu(cpuid,
- &cpu_topo->thread_sibling);
- if (cpu != cpuid)
- cpumask_set_cpu(cpu,
- &cpuid_topo->thread_sibling);
- }
- }
- }
- smp_wmb();
+ update_siblings_masks(cpuid);
+
+ update_cpu_power(cpuid, mpidr & MPIDR_HWID_BITMASK);
printk(KERN_INFO "CPU%u: thread %d, cpu %d, socket %d, mpidr %x\n",
cpuid, cpu_topology[cpuid].thread_id,
@@ -134,7 +295,7 @@ void init_cpu_topology(void)
{
unsigned int cpu;
- /* init core mask */
+ /* init core mask and power */
for_each_possible_cpu(cpu) {
struct cputopo_arm *cpu_topo = &(cpu_topology[cpu]);
@@ -143,6 +304,10 @@ void init_cpu_topology(void)
cpu_topo->socket_id = -1;
cpumask_clear(&cpu_topo->core_sibling);
cpumask_clear(&cpu_topo->thread_sibling);
+
+ set_power_scale(cpu, SCHED_POWER_SCALE);
}
smp_wmb();
+
+ parse_dt_topology();
}
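
To make the arithmetic in parse_dt_topology() and update_cpu_power() above concrete, here is a small standalone sketch; the 1 GHz clock rates are assumed example values, not numbers from any device tree. It mirrors capacity = (clock-frequency >> 20) * efficiency, middle_capacity = (min + max) >> 11, and cpu_power = capacity / middle_capacity, so relative to SCHED_POWER_SCALE (1024) a Cortex-A15 lands above 1024 and a Cortex-A7 below it:

#include <stdio.h>

int main(void)
{
	unsigned long a15_rate = 1000000000UL, a7_rate = 1000000000UL;
	unsigned long a15_eff = 3891, a7_eff = 2048;	/* from table_efficiency */

	unsigned long a15_cap = (a15_rate >> 20) * a15_eff;	/* 3708123 */
	unsigned long a7_cap  = (a7_rate >> 20) * a7_eff;	/* 1951744 */

	unsigned long middle = (a7_cap + a15_cap) >> 11;	/* min + max */

	/* Prints roughly 1342 for the A15 and 706 for the A7. */
	printf("A15 cpu_power ~%lu, A7 cpu_power ~%lu\n",
	       a15_cap / middle, a7_cap / middle);
	return 0;
}
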
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 6ab6aa2fdfdd..c5981267a60c 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,7 +14,7 @@ CFLAGS_common.o := $(nostackp)
obj-y := intel_cacheinfo.o scattered.o topology.o
obj-y += proc.o capflags.o powerflags.o common.o
-obj-y += vmware.o hypervisor.o sched.o mshyperv.o
+obj-y += vmware.o hypervisor.o mshyperv.o
obj-y += rdrand.o
obj-y += match.o
diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c
deleted file mode 100644
index a640ae5ad201..000000000000
--- a/arch/x86/kernel/cpu/sched.c
+++ /dev/null
@@ -1,55 +0,0 @@
-#include <linux/sched.h>
-#include <linux/math64.h>
-#include <linux/percpu.h>
-#include <linux/irqflags.h>
-
-#include <asm/cpufeature.h>
-#include <asm/processor.h>
-
-#ifdef CONFIG_SMP
-
-static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched);
-
-static unsigned long scale_aperfmperf(void)
-{
- struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched);
- unsigned long ratio, flags;
-
- local_irq_save(flags);
- get_aperfmperf(&val);
- local_irq_restore(flags);
-
- ratio = calc_aperfmperf_ratio(old, &val);
- *old = val;
-
- return ratio;
-}
-
-unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
-{
- /*
- * do aperf/mperf on the cpu level because it includes things
- * like turbo mode, which are relevant to full cores.
- */
- if (boot_cpu_has(X86_FEATURE_APERFMPERF))
- return scale_aperfmperf();
-
- /*
- * maybe have something cpufreq here
- */
-
- return default_scale_freq_power(sd, cpu);
-}
-
-unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu)
-{
- /*
- * aperf/mperf already includes the smt gain
- */
- if (boot_cpu_has(X86_FEATURE_APERFMPERF))
- return SCHED_LOAD_SCALE;
-
- return default_scale_smt_power(sd, cpu);
-}
-
-#endif
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
index 78a666d1e5f5..a76b689e553b 100644
--- a/drivers/cpuidle/Kconfig
+++ b/drivers/cpuidle/Kconfig
@@ -18,3 +18,6 @@ config CPU_IDLE_GOV_MENU
bool
depends on CPU_IDLE && NO_HZ
default y
+
+config ARCH_NEEDS_CPU_IDLE_COUPLED
+ def_bool n
diff --git a/drivers/cpuidle/Makefile b/drivers/cpuidle/Makefile
index 5634f88379df..38c8f69f30cf 100644
--- a/drivers/cpuidle/Makefile
+++ b/drivers/cpuidle/Makefile
@@ -3,3 +3,4 @@
#
obj-y += cpuidle.o driver.o governor.o sysfs.o governors/
+obj-$(CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED) += coupled.o
diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c
new file mode 100644
index 000000000000..fc427fa1fefb
--- /dev/null
+++ b/drivers/cpuidle/coupled.c
@@ -0,0 +1,715 @@
+/*
+ * coupled.c - helper functions to enter the same idle state on multiple cpus
+ *
+ * Copyright (c) 2011 Google, Inc.
+ *
+ * Author: Colin Cross <ccross@android.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/cpu.h>
+#include <linux/cpuidle.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include "cpuidle.h"
+
+/**
+ * DOC: Coupled cpuidle states
+ *
+ * On some ARM SMP SoCs (OMAP4460, Tegra 2, and probably more), the
+ * cpus cannot be independently powered down, either due to
+ * sequencing restrictions (on Tegra 2, cpu 0 must be the last to
+ * power down), or due to HW bugs (on OMAP4460, a cpu powering up
+ * will corrupt the gic state unless the other cpu runs a work
+ * around). Each cpu has a power state that it can enter without
+ * coordinating with the other cpu (usually Wait For Interrupt, or
+ * WFI), and one or more "coupled" power states that affect blocks
+ * shared between the cpus (L2 cache, interrupt controller, and
+ * sometimes the whole SoC). Entering a coupled power state must
+ * be tightly controlled on both cpus.
+ *
+ * This file implements a solution, where each cpu will wait in the
+ * WFI state until all cpus are ready to enter a coupled state, at
+ * which point the coupled state function will be called on all
+ * cpus at approximately the same time.
+ *
+ * Once all cpus are ready to enter idle, they are woken by an smp
+ * cross call. At this point, there is a chance that one of the
+ * cpus will find work to do, and choose not to enter idle. A
+ * final pass is needed to guarantee that all cpus will call the
+ * power state enter function at the same time. During this pass,
+ * each cpu will increment the ready counter, and continue once the
+ * ready counter matches the number of online coupled cpus. If any
+ * cpu exits idle, the other cpus will decrement their counter and
+ * retry.
+ *
+ * requested_state stores the deepest coupled idle state each cpu
+ * is ready for. It is assumed that the states are indexed from
+ * shallowest (highest power, lowest exit latency) to deepest
+ * (lowest power, highest exit latency). The requested_state
+ * variable is not locked. It is only written from the cpu that
+ * it stores (or by the on/offlining cpu if that cpu is offline),
+ * and only read after all the cpus are ready for the coupled idle
+ * state and are no longer updating it.
+ *
+ * Three atomic counters are used. alive_count tracks the number
+ * of cpus in the coupled set that are currently or soon will be
+ * online. waiting_count tracks the number of cpus that are in
+ * the waiting loop, in the ready loop, or in the coupled idle state.
+ * ready_count tracks the number of cpus that are in the ready loop
+ * or in the coupled idle state.
+ *
+ * To use coupled cpuidle states, a cpuidle driver must:
+ *
+ * Set struct cpuidle_device.coupled_cpus to the mask of all
+ * coupled cpus, usually the same as cpu_possible_mask if all cpus
+ * are part of the same cluster. The coupled_cpus mask must be
+ * set in the struct cpuidle_device for each cpu.
+ *
+ * Set struct cpuidle_device.safe_state to a state that is not a
+ * coupled state. This is usually WFI.
+ *
+ * Set CPUIDLE_FLAG_COUPLED in struct cpuidle_state.flags for each
+ * state that affects multiple cpus.
+ *
+ * Provide a struct cpuidle_state.enter function for each state
+ * that affects multiple cpus. This function is guaranteed to be
+ * called on all cpus at approximately the same time. The driver
+ * should ensure that the cpus all abort together if any cpu tries
+ * to abort once the function is called. The function should return
+ * with interrupts still disabled.
+ */
+
+/**
+ * struct cpuidle_coupled - data for set of cpus that share a coupled idle state
+ * @coupled_cpus: mask of cpus that are part of the coupled set
+ * @requested_state: array of requested states for cpus in the coupled set
+ * @ready_waiting_counts: combined count of cpus in ready or waiting loops
+ * @online_count: count of cpus that are online
+ * @refcnt: reference count of cpuidle devices that are using this struct
+ * @prevent: flag to prevent coupled idle while a cpu is hotplugging
+ */
+struct cpuidle_coupled {
+ cpumask_t coupled_cpus;
+ int requested_state[NR_CPUS];
+ atomic_t ready_waiting_counts;
+ int online_count;
+ int refcnt;
+ int prevent;
+};
+
+#define WAITING_BITS 16
+#define MAX_WAITING_CPUS (1 << WAITING_BITS)
+#define WAITING_MASK (MAX_WAITING_CPUS - 1)
+#define READY_MASK (~WAITING_MASK)
+
+#define CPUIDLE_COUPLED_NOT_IDLE (-1)
+
+static DEFINE_MUTEX(cpuidle_coupled_lock);
+static DEFINE_PER_CPU(struct call_single_data, cpuidle_coupled_poke_cb);
+
+/*
+ * The cpuidle_coupled_poked_mask mask is used to avoid calling
+ * __smp_call_function_single with the per cpu call_single_data struct already
+ * in use. This prevents a deadlock where two cpus are waiting for each other's
+ * call_single_data struct to be available.
+ */
+static cpumask_t cpuidle_coupled_poked_mask;
+
+/**
+ * cpuidle_coupled_parallel_barrier - synchronize all online coupled cpus
+ * @dev: cpuidle_device of the calling cpu
+ * @a: atomic variable to hold the barrier
+ *
+ * No caller to this function will return from this function until all online
+ * cpus in the same coupled group have called this function. Once any caller
+ * has returned from this function, the barrier is immediately available for
+ * reuse.
+ *
+ * The atomic variable a must be initialized to 0 before any cpu calls
+ * this function, and will be reset to 0 before any cpu returns from this function.
+ *
+ * Must only be called from within a coupled idle state handler
+ * (state.enter when state.flags has CPUIDLE_FLAG_COUPLED set).
+ *
+ * Provides full smp barrier semantics before and after calling.
+ */
+void cpuidle_coupled_parallel_barrier(struct cpuidle_device *dev, atomic_t *a)
+{
+ int n = dev->coupled->online_count;
+
+ smp_mb__before_atomic_inc();
+ atomic_inc(a);
+
+ while (atomic_read(a) < n)
+ cpu_relax();
+
+ if (atomic_inc_return(a) == n * 2) {
+ atomic_set(a, 0);
+ return;
+ }
+
+ while (atomic_read(a) > n)
+ cpu_relax();
+}
+
+/**
+ * cpuidle_state_is_coupled - check if a state is part of a coupled set
+ * @dev: struct cpuidle_device for the current cpu
+ * @drv: struct cpuidle_driver for the platform
+ * @state: index of the target state in drv->states
+ *
+ * Returns true if the target state is coupled with cpus besides this one
+ */
+bool cpuidle_state_is_coupled(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv, int state)
+{
+ return drv->states[state].flags & CPUIDLE_FLAG_COUPLED;
+}
+
+/**
+ * cpuidle_coupled_set_ready - mark a cpu as ready
+ * @coupled: the struct coupled that contains the current cpu
+ */
+static inline void cpuidle_coupled_set_ready(struct cpuidle_coupled *coupled)
+{
+ atomic_add(MAX_WAITING_CPUS, &coupled->ready_waiting_counts);
+}
+
+/**
+ * cpuidle_coupled_set_not_ready - mark a cpu as not ready
+ * @coupled: the struct coupled that contains the current cpu
+ *
+ * Decrements the ready counter, unless the ready (and thus the waiting) counter
+ * is equal to the number of online cpus. Prevents a race where one cpu
+ * decrements the waiting counter and then re-increments it just before another
+ * cpu has decremented its ready counter, leading to the ready counter going
+ * down from the number of online cpus without going through the coupled idle
+ * state.
+ *
+ * Returns 0 if the counter was decremented successfully, -EINVAL if the ready
+ * counter was equal to the number of online cpus.
+ */
+static
+inline int cpuidle_coupled_set_not_ready(struct cpuidle_coupled *coupled)
+{
+ int all;
+ int ret;
+
+ all = coupled->online_count | (coupled->online_count << WAITING_BITS);
+ ret = atomic_add_unless(&coupled->ready_waiting_counts,
+ -MAX_WAITING_CPUS, all);
+
+ return ret ? 0 : -EINVAL;
+}
+
+/**
+ * cpuidle_coupled_no_cpus_ready - check if no cpus in a coupled set are ready
+ * @coupled: the struct coupled that contains the current cpu
+ *
+ * Returns true if all of the cpus in a coupled set are out of the ready loop.
+ */
+static inline int cpuidle_coupled_no_cpus_ready(struct cpuidle_coupled *coupled)
+{
+ int r = atomic_read(&coupled->ready_waiting_counts) >> WAITING_BITS;
+ return r == 0;
+}
+
+/**
+ * cpuidle_coupled_cpus_ready - check if all cpus in a coupled set are ready
+ * @coupled: the struct coupled that contains the current cpu
+ *
+ * Returns true if all cpus coupled to this target state are in the ready loop
+ */
+static inline bool cpuidle_coupled_cpus_ready(struct cpuidle_coupled *coupled)
+{
+ int r = atomic_read(&coupled->ready_waiting_counts) >> WAITING_BITS;
+ return r == coupled->online_count;
+}
+
+/**
+ * cpuidle_coupled_cpus_waiting - check if all cpus in a coupled set are waiting
+ * @coupled: the struct coupled that contains the current cpu
+ *
+ * Returns true if all cpus coupled to this target state are in the wait loop
+ */
+static inline bool cpuidle_coupled_cpus_waiting(struct cpuidle_coupled *coupled)
+{
+ int w = atomic_read(&coupled->ready_waiting_counts) & WAITING_MASK;
+ return w == coupled->online_count;
+}
+
+/**
+ * cpuidle_coupled_no_cpus_waiting - check if no cpus in coupled set are waiting
+ * @coupled: the struct coupled that contains the current cpu
+ *
+ * Returns true if all of the cpus in a coupled set are out of the waiting loop.
+ */
+static inline int cpuidle_coupled_no_cpus_waiting(struct cpuidle_coupled *coupled)
+{
+ int w = atomic_read(&coupled->ready_waiting_counts) & WAITING_MASK;
+ return w == 0;
+}
+
+/**
+ * cpuidle_coupled_get_state - determine the deepest idle state
+ * @dev: struct cpuidle_device for this cpu
+ * @coupled: the struct coupled that contains the current cpu
+ *
+ * Returns the deepest idle state that all coupled cpus can enter
+ */
+static inline int cpuidle_coupled_get_state(struct cpuidle_device *dev,
+ struct cpuidle_coupled *coupled)
+{
+ int i;
+ int state = INT_MAX;
+
+ /*
+ * Read barrier ensures that read of requested_state is ordered after
+ * reads of ready_count. Matches the write barriers
+ * cpuidle_set_state_waiting.
+ */
+ smp_rmb();
+
+ for_each_cpu_mask(i, coupled->coupled_cpus)
+ if (cpu_online(i) && coupled->requested_state[i] < state)
+ state = coupled->requested_state[i];
+
+ return state;
+}
+
+static void cpuidle_coupled_poked(void *info)
+{
+ int cpu = (unsigned long)info;
+ cpumask_clear_cpu(cpu, &cpuidle_coupled_poked_mask);
+}
+
+/**
+ * cpuidle_coupled_poke - wake up a cpu that may be waiting
+ * @cpu: target cpu
+ *
+ * Ensures that the target cpu exits its waiting idle state (if it is in it)
+ * and will see updates to waiting_count before it re-enters its waiting idle
+ * state.
+ *
+ * If cpuidle_coupled_poked_mask is already set for the target cpu, that cpu
+ * either has or will soon have a pending IPI that will wake it out of idle,
+ * or it is currently processing the IPI and is not in idle.
+ */
+static void cpuidle_coupled_poke(int cpu)
+{
+ struct call_single_data *csd = &per_cpu(cpuidle_coupled_poke_cb, cpu);
+
+ if (!cpumask_test_and_set_cpu(cpu, &cpuidle_coupled_poked_mask))
+ __smp_call_function_single(cpu, csd, 0);
+}
+
+/**
+ * cpuidle_coupled_poke_others - wake up all other cpus that may be waiting
+ * @dev: struct cpuidle_device for this cpu
+ * @coupled: the struct coupled that contains the current cpu
+ *
+ * Calls cpuidle_coupled_poke on all other online cpus.
+ */
+static void cpuidle_coupled_poke_others(int this_cpu,
+ struct cpuidle_coupled *coupled)
+{
+ int cpu;
+
+ for_each_cpu_mask(cpu, coupled->coupled_cpus)
+ if (cpu != this_cpu && cpu_online(cpu))
+ cpuidle_coupled_poke(cpu);
+}
+
+/**
+ * cpuidle_coupled_set_waiting - mark this cpu as in the wait loop
+ * @dev: struct cpuidle_device for this cpu
+ * @coupled: the struct coupled that contains the current cpu
+ * @next_state: the index in drv->states of the requested state for this cpu
+ *
+ * Updates the requested idle state for the specified cpuidle device,
+ * poking all coupled cpus out of idle if necessary to let them see the new
+ * state.
+ */
+static void cpuidle_coupled_set_waiting(int cpu,
+ struct cpuidle_coupled *coupled, int next_state)
+{
+ int w;
+
+ coupled->requested_state[cpu] = next_state;
+
+ /*
+ * If this is the last cpu to enter the waiting state, poke
+ * all the other cpus out of their waiting state so they can
+ * enter a deeper state. This can race with one of the cpus
+ * exiting the waiting state due to an interrupt and
+ * decrementing waiting_count, see comment below.
+ *
+ * The atomic_inc_return provides a write barrier to order the write
+ * to requested_state with the later write that increments ready_count.
+ */
+ w = atomic_inc_return(&coupled->ready_waiting_counts) & WAITING_MASK;
+ if (w == coupled->online_count)
+ cpuidle_coupled_poke_others(cpu, coupled);
+}
+
+/**
+ * cpuidle_coupled_set_not_waiting - mark this cpu as leaving the wait loop
+ * @dev: struct cpuidle_device for this cpu
+ * @coupled: the struct coupled that contains the current cpu
+ *
+ * Removes the requested idle state for the specified cpuidle device.
+ */
+static void cpuidle_coupled_set_not_waiting(int cpu,
+ struct cpuidle_coupled *coupled)
+{
+ /*
+ * Decrementing waiting count can race with incrementing it in
+ * cpuidle_coupled_set_waiting, but that's OK. Worst case, some
+ * cpus will increment ready_count and then spin until they
+ * notice that this cpu has cleared its requested_state.
+ */
+ atomic_dec(&coupled->ready_waiting_counts);
+
+ coupled->requested_state[cpu] = CPUIDLE_COUPLED_NOT_IDLE;
+}
+
+/**
+ * cpuidle_coupled_set_done - mark this cpu as leaving the ready loop
+ * @cpu: the current cpu
+ * @coupled: the struct coupled that contains the current cpu
+ *
+ * Marks this cpu as no longer in the ready and waiting loops. Decrements
+ * the waiting count first to prevent another cpu looping back in and seeing
+ * this cpu as waiting just before it exits idle.
+ */
+static void cpuidle_coupled_set_done(int cpu, struct cpuidle_coupled *coupled)
+{
+ cpuidle_coupled_set_not_waiting(cpu, coupled);
+ atomic_sub(MAX_WAITING_CPUS, &coupled->ready_waiting_counts);
+}
+
+/**
+ * cpuidle_coupled_clear_pokes - spin until the poke interrupt is processed
+ * @cpu - this cpu
+ *
+ * Turns on interrupts and spins until any outstanding poke interrupts have
+ * been processed and the poke bit has been cleared.
+ *
+ * Other interrupts may also be processed while interrupts are enabled, so
+ * need_resched() must be tested after turning interrupts off again to make sure
+ * the interrupt didn't schedule work that should take the cpu out of idle.
+ *
+ * Returns 0 if need_resched was false, -EINTR if need_resched was true.
+ */
+static int cpuidle_coupled_clear_pokes(int cpu)
+{
+ local_irq_enable();
+ while (cpumask_test_cpu(cpu, &cpuidle_coupled_poked_mask))
+ cpu_relax();
+ local_irq_disable();
+
+ return need_resched() ? -EINTR : 0;
+}
+
+/**
+ * cpuidle_enter_state_coupled - attempt to enter a state with coupled cpus
+ * @dev: struct cpuidle_device for the current cpu
+ * @drv: struct cpuidle_driver for the platform
+ * @next_state: index of the requested state in drv->states
+ *
+ * Coordinate with coupled cpus to enter the target state. This is a two
+ * stage process. In the first stage, the cpus are operating independently,
+ * and may call into cpuidle_enter_state_coupled at completely different times.
+ * To save as much power as possible, the first cpus to call this function will
+ * go to an intermediate state (the cpuidle_device's safe state), and wait for
+ * all the other cpus to call this function. Once all coupled cpus are idle,
+ * the second stage will start. Each coupled cpu will spin until all cpus have
+ * guaranteed that they will call the target_state.
+ *
+ * This function must be called with interrupts disabled. It may enable
+ * interrupts while preparing for idle, and it will always return with
+ * interrupts enabled.
+ */
+int cpuidle_enter_state_coupled(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv, int next_state)
+{
+ int entered_state = -1;
+ struct cpuidle_coupled *coupled = dev->coupled;
+
+ if (!coupled)
+ return -EINVAL;
+
+ while (coupled->prevent) {
+ if (cpuidle_coupled_clear_pokes(dev->cpu)) {
+ local_irq_enable();
+ return entered_state;
+ }
+ entered_state = cpuidle_enter_state(dev, drv,
+ dev->safe_state_index);
+ }
+
+ /* Read barrier ensures online_count is read after prevent is cleared */
+ smp_rmb();
+
+ cpuidle_coupled_set_waiting(dev->cpu, coupled, next_state);
+
+retry:
+ /*
+ * Wait for all coupled cpus to be idle, using the deepest state
+ * allowed for a single cpu.
+ */
+ while (!cpuidle_coupled_cpus_waiting(coupled)) {
+ if (cpuidle_coupled_clear_pokes(dev->cpu)) {
+ cpuidle_coupled_set_not_waiting(dev->cpu, coupled);
+ goto out;
+ }
+
+ if (coupled->prevent) {
+ cpuidle_coupled_set_not_waiting(dev->cpu, coupled);
+ goto out;
+ }
+
+ entered_state = cpuidle_enter_state(dev, drv,
+ dev->safe_state_index);
+ }
+
+ if (cpuidle_coupled_clear_pokes(dev->cpu)) {
+ cpuidle_coupled_set_not_waiting(dev->cpu, coupled);
+ goto out;
+ }
+
+ /*
+ * All coupled cpus are probably idle. There is a small chance that
+ * one of the other cpus just became active. Increment the ready count,
+ * and spin until all coupled cpus have incremented the counter. Once a
+ * cpu has incremented the ready counter, it cannot abort idle and must
+ * spin until either all cpus have incremented the ready counter, or
+ * another cpu leaves idle and decrements the waiting counter.
+ */
+
+ cpuidle_coupled_set_ready(coupled);
+ while (!cpuidle_coupled_cpus_ready(coupled)) {
+ /* Check if any other cpus bailed out of idle. */
+ if (!cpuidle_coupled_cpus_waiting(coupled))
+ if (!cpuidle_coupled_set_not_ready(coupled))
+ goto retry;
+
+ cpu_relax();
+ }
+
+ /* all cpus have acked the coupled state */
+ next_state = cpuidle_coupled_get_state(dev, coupled);
+
+ entered_state = cpuidle_enter_state(dev, drv, next_state);
+
+ cpuidle_coupled_set_done(dev->cpu, coupled);
+
+out:
+ /*
+ * Normal cpuidle states are expected to return with irqs enabled.
+ * That leads to an inefficiency where a cpu receiving an interrupt
+ * that brings it out of idle will process that interrupt before
+ * exiting the idle enter function and decrementing ready_count. All
+ * other cpus will need to spin waiting for the cpu that is processing
+ * the interrupt. If the driver returns with interrupts disabled,
+ * all other cpus will loop back into the safe idle state instead of
+ * spinning, saving power.
+ *
+ * Calling local_irq_enable here allows coupled states to return with
+ * interrupts disabled, but won't cause problems for drivers that
+ * exit with interrupts enabled.
+ */
+ local_irq_enable();
+
+ /*
+ * Wait until all coupled cpus have exited idle. There is no risk that
+ * a cpu exits and re-enters the ready state because this cpu has
+ * already decremented its waiting_count.
+ */
+ while (!cpuidle_coupled_no_cpus_ready(coupled))
+ cpu_relax();
+
+ return entered_state;
+}
+
+static void cpuidle_coupled_update_online_cpus(struct cpuidle_coupled *coupled)
+{
+ cpumask_t cpus;
+ cpumask_and(&cpus, cpu_online_mask, &coupled->coupled_cpus);
+ coupled->online_count = cpumask_weight(&cpus);
+}
+
+/**
+ * cpuidle_coupled_register_device - register a coupled cpuidle device
+ * @dev: struct cpuidle_device for the current cpu
+ *
+ * Called from cpuidle_register_device to handle coupled idle init. Finds the
+ * cpuidle_coupled struct for this set of coupled cpus, or creates one if none
+ * exists yet.
+ */
+int cpuidle_coupled_register_device(struct cpuidle_device *dev)
+{
+ int cpu;
+ struct cpuidle_device *other_dev;
+ struct call_single_data *csd;
+ struct cpuidle_coupled *coupled;
+
+ if (cpumask_empty(&dev->coupled_cpus))
+ return 0;
+
+ for_each_cpu_mask(cpu, dev->coupled_cpus) {
+ other_dev = per_cpu(cpuidle_devices, cpu);
+ if (other_dev && other_dev->coupled) {
+ coupled = other_dev->coupled;
+ goto have_coupled;
+ }
+ }
+
+ /* No existing coupled info found, create a new one */
+ coupled = kzalloc(sizeof(struct cpuidle_coupled), GFP_KERNEL);
+ if (!coupled)
+ return -ENOMEM;
+
+ coupled->coupled_cpus = dev->coupled_cpus;
+
+have_coupled:
+ dev->coupled = coupled;
+ if (WARN_ON(!cpumask_equal(&dev->coupled_cpus, &coupled->coupled_cpus)))
+ coupled->prevent++;
+
+ cpuidle_coupled_update_online_cpus(coupled);
+
+ coupled->refcnt++;
+
+ csd = &per_cpu(cpuidle_coupled_poke_cb, dev->cpu);
+ csd->func = cpuidle_coupled_poked;
+ csd->info = (void *)(unsigned long)dev->cpu;
+
+ return 0;
+}
+
+/**
+ * cpuidle_coupled_unregister_device - unregister a coupled cpuidle device
+ * @dev: struct cpuidle_device for the current cpu
+ *
+ * Called from cpuidle_unregister_device to tear down coupled idle. Removes the
+ * cpu from the coupled idle set, and frees the cpuidle_coupled_info struct if
+ * this was the last cpu in the set.
+ */
+void cpuidle_coupled_unregister_device(struct cpuidle_device *dev)
+{
+ struct cpuidle_coupled *coupled = dev->coupled;
+
+ if (cpumask_empty(&dev->coupled_cpus))
+ return;
+
+ if (--coupled->refcnt)
+ kfree(coupled);
+ dev->coupled = NULL;
+}
+
+/**
+ * cpuidle_coupled_prevent_idle - prevent cpus from entering a coupled state
+ * @coupled: the struct coupled that contains the cpu that is changing state
+ *
+ * Disables coupled cpuidle on a coupled set of cpus. Used to ensure that
+ * cpu_online_mask doesn't change while cpus are coordinating coupled idle.
+ */
+static void cpuidle_coupled_prevent_idle(struct cpuidle_coupled *coupled)
+{
+ int cpu = get_cpu();
+
+ /* Force all cpus out of the waiting loop. */
+ coupled->prevent++;
+ cpuidle_coupled_poke_others(cpu, coupled);
+ put_cpu();
+ while (!cpuidle_coupled_no_cpus_waiting(coupled))
+ cpu_relax();
+}
+
+/**
+ * cpuidle_coupled_allow_idle - allows cpus to enter a coupled state
+ * @coupled: the struct coupled that contains the cpu that is changing state
+ *
+ * Enables coupled cpuidle on a coupled set of cpus. Used to ensure that
+ * cpu_online_mask doesn't change while cpus are coordinating coupled idle.
+ */
+static void cpuidle_coupled_allow_idle(struct cpuidle_coupled *coupled)
+{
+ int cpu = get_cpu();
+
+ /*
+ * Write barrier ensures readers see the new online_count when they
+ * see prevent == false.
+ */
+ smp_wmb();
+ coupled->prevent--;
+ /* Force cpus out of the prevent loop. */
+ cpuidle_coupled_poke_others(cpu, coupled);
+ put_cpu();
+}
+
+/**
+ * cpuidle_coupled_cpu_notify - notifier called during hotplug transitions
+ * @nb: notifier block
+ * @action: hotplug transition
+ * @hcpu: target cpu number
+ *
+ * Called when a cpu is brought online or taken offline using hotplug. Updates the
+ * coupled cpu set appropriately
+ */
+static int cpuidle_coupled_cpu_notify(struct notifier_block *nb,
+ unsigned long action, void *hcpu)
+{
+ int cpu = (unsigned long)hcpu;
+ struct cpuidle_device *dev;
+
+ mutex_lock(&cpuidle_lock);
+
+ dev = per_cpu(cpuidle_devices, cpu);
+ if (!dev->coupled)
+ goto out;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_UP_PREPARE:
+ case CPU_DOWN_PREPARE:
+ cpuidle_coupled_prevent_idle(dev->coupled);
+ break;
+ case CPU_ONLINE:
+ case CPU_DEAD:
+ cpuidle_coupled_update_online_cpus(dev->coupled);
+ /* Fall through */
+ case CPU_UP_CANCELED:
+ case CPU_DOWN_FAILED:
+ cpuidle_coupled_allow_idle(dev->coupled);
+ break;
+ }
+
+out:
+ mutex_unlock(&cpuidle_lock);
+ return NOTIFY_OK;
+}
+
+static struct notifier_block cpuidle_coupled_cpu_notifier = {
+ .notifier_call = cpuidle_coupled_cpu_notify,
+};
+
+static int __init cpuidle_coupled_init(void)
+{
+ return register_cpu_notifier(&cpuidle_coupled_cpu_notifier);
+}
+core_initcall(cpuidle_coupled_init);
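
As a usage illustration of the requirements listed in the DOC block above (coupled_cpus mask, a safe per-cpu state, CPUIDLE_FLAG_COUPLED, and an enter function called on all cpus together), here is a minimal hypothetical driver sketch. The mysoc_* names and the whole skeleton are placeholders under those assumptions, not code from this patch:

#include <linux/atomic.h>
#include <linux/cpuidle.h>
#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/percpu.h>
#include <asm/proc-fns.h>

static DEFINE_PER_CPU(struct cpuidle_device, mysoc_idle_dev);
static atomic_t mysoc_barrier = ATOMIC_INIT(0);

/* SoC-specific placeholders. */
static void mysoc_prepare_cluster_off(int cpu) { }
static void mysoc_cluster_off(void) { }

/* Safe per-cpu state: plain WFI. */
static int mysoc_enter_wfi(struct cpuidle_device *dev,
			   struct cpuidle_driver *drv, int index)
{
	cpu_do_idle();
	return index;
}

/* Coupled state: called on all cpus at approximately the same time. */
static int mysoc_enter_coupled(struct cpuidle_device *dev,
			       struct cpuidle_driver *drv, int index)
{
	mysoc_prepare_cluster_off(dev->cpu);
	/* Optional extra rendezvous before touching shared hardware. */
	cpuidle_coupled_parallel_barrier(dev, &mysoc_barrier);
	mysoc_cluster_off();
	return index;		/* may return with interrupts still disabled */
}

static struct cpuidle_driver mysoc_idle_driver = {
	.name		= "mysoc_idle",
	.state_count	= 2,
	.states[0]	= {
		.name	= "WFI",
		.flags	= CPUIDLE_FLAG_TIME_VALID,
		.enter	= mysoc_enter_wfi,
	},
	.states[1]	= {
		.name	= "CLUSTER-OFF",
		.flags	= CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_COUPLED,
		.enter	= mysoc_enter_coupled,
	},
};

static int __init mysoc_idle_init(void)
{
	int cpu, ret;

	ret = cpuidle_register_driver(&mysoc_idle_driver);
	if (ret)
		return ret;

	for_each_possible_cpu(cpu) {
		struct cpuidle_device *dev = &per_cpu(mysoc_idle_dev, cpu);

		dev->cpu = cpu;
		dev->safe_state_index = 0;
		cpumask_copy(&dev->coupled_cpus, cpu_possible_mask);
		ret = cpuidle_register_device(dev);
		if (ret)
			return ret;
	}
	return 0;
}
device_initcall(mysoc_idle_init);
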
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index d90519cec880..ed1dd859299b 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -92,6 +92,34 @@ int cpuidle_play_dead(void)
}
/**
+ * cpuidle_enter_state - enter the state and update stats
+ * @dev: cpuidle device for this cpu
+ * @drv: cpuidle driver for this cpu
+ * @next_state: index into drv->states of the state to enter
+ */
+int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
+ int next_state)
+{
+ int entered_state;
+
+ entered_state = cpuidle_enter_ops(dev, drv, next_state);
+
+ if (entered_state >= 0) {
+ /* Update cpuidle counters */
+ /* This can be moved to within driver enter routine
+ * but that results in multiple copies of same code.
+ */
+ dev->states_usage[entered_state].time +=
+ (unsigned long long)dev->last_residency;
+ dev->states_usage[entered_state].usage++;
+ } else {
+ dev->last_residency = 0;
+ }
+
+ return entered_state;
+}
+
+/**
* cpuidle_idle_call - the main idle loop
*
* NOTE: no locks or semaphores should be used here
@@ -132,23 +160,15 @@ int cpuidle_idle_call(void)
trace_power_start_rcuidle(POWER_CSTATE, next_state, dev->cpu);
trace_cpu_idle_rcuidle(next_state, dev->cpu);
- entered_state = cpuidle_enter_ops(dev, drv, next_state);
+ if (cpuidle_state_is_coupled(dev, drv, next_state))
+ entered_state = cpuidle_enter_state_coupled(dev, drv,
+ next_state);
+ else
+ entered_state = cpuidle_enter_state(dev, drv, next_state);
trace_power_end_rcuidle(dev->cpu);
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
- if (entered_state >= 0) {
- /* Update cpuidle counters */
- /* This can be moved to within driver enter routine
- * but that results in multiple copies of same code.
- */
- dev->states_usage[entered_state].time +=
- (unsigned long long)dev->last_residency;
- dev->states_usage[entered_state].usage++;
- } else {
- dev->last_residency = 0;
- }
-
/* give the governor an opportunity to reflect on the outcome */
if (cpuidle_curr_governor->reflect)
cpuidle_curr_governor->reflect(dev, entered_state);
@@ -376,13 +396,25 @@ static int __cpuidle_register_device(struct cpuidle_device *dev)
per_cpu(cpuidle_devices, dev->cpu) = dev;
list_add(&dev->device_list, &cpuidle_detected_devices);
- if ((ret = cpuidle_add_sysfs(cpu_dev))) {
- module_put(cpuidle_driver->owner);
- return ret;
- }
+ ret = cpuidle_add_sysfs(cpu_dev);
+ if (ret)
+ goto err_sysfs;
+
+ ret = cpuidle_coupled_register_device(dev);
+ if (ret)
+ goto err_coupled;
dev->registered = 1;
return 0;
+
+err_coupled:
+ cpuidle_remove_sysfs(cpu_dev);
+ wait_for_completion(&dev->kobj_unregister);
+err_sysfs:
+ list_del(&dev->device_list);
+ per_cpu(cpuidle_devices, dev->cpu) = NULL;
+ module_put(cpuidle_driver->owner);
+ return ret;
}
/**
@@ -432,6 +464,8 @@ void cpuidle_unregister_device(struct cpuidle_device *dev)
wait_for_completion(&dev->kobj_unregister);
per_cpu(cpuidle_devices, dev->cpu) = NULL;
+ cpuidle_coupled_unregister_device(dev);
+
cpuidle_resume_and_unlock();
module_put(cpuidle_driver->owner);
diff --git a/drivers/cpuidle/cpuidle.h b/drivers/cpuidle/cpuidle.h
index 7db186685c27..76e7f696ad8c 100644
--- a/drivers/cpuidle/cpuidle.h
+++ b/drivers/cpuidle/cpuidle.h
@@ -14,6 +14,8 @@ extern struct list_head cpuidle_detected_devices;
extern struct mutex cpuidle_lock;
extern spinlock_t cpuidle_driver_lock;
extern int cpuidle_disabled(void);
+extern int cpuidle_enter_state(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv, int next_state);
/* idle loop */
extern void cpuidle_install_idle_handler(void);
@@ -30,4 +32,34 @@ extern void cpuidle_remove_state_sysfs(struct cpuidle_device *device);
extern int cpuidle_add_sysfs(struct device *dev);
extern void cpuidle_remove_sysfs(struct device *dev);
+#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
+bool cpuidle_state_is_coupled(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv, int state);
+int cpuidle_enter_state_coupled(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv, int next_state);
+int cpuidle_coupled_register_device(struct cpuidle_device *dev);
+void cpuidle_coupled_unregister_device(struct cpuidle_device *dev);
+#else
+static inline bool cpuidle_state_is_coupled(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv, int state)
+{
+ return false;
+}
+
+static inline int cpuidle_enter_state_coupled(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv, int next_state)
+{
+ return -1;
+}
+
+static inline int cpuidle_coupled_register_device(struct cpuidle_device *dev)
+{
+ return 0;
+}
+
+static inline void cpuidle_coupled_unregister_device(struct cpuidle_device *dev)
+{
+}
+#endif
+
#endif /* __DRIVER_CPUIDLE_H */
diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c
index 53589000fd07..a4c9f9d45528 100644
--- a/drivers/infiniband/hw/ehca/ehca_irq.c
+++ b/drivers/infiniband/hw/ehca/ehca_irq.c
@@ -42,6 +42,7 @@
*/
#include <linux/slab.h>
+#include <linux/smpboot.h>
#include "ehca_classes.h"
#include "ehca_irq.h"
@@ -652,7 +653,7 @@ void ehca_tasklet_eq(unsigned long data)
ehca_process_eq((struct ehca_shca*)data, 1);
}
-static inline int find_next_online_cpu(struct ehca_comp_pool *pool)
+static int find_next_online_cpu(struct ehca_comp_pool *pool)
{
int cpu;
unsigned long flags;
@@ -662,17 +663,23 @@ static inline int find_next_online_cpu(struct ehca_comp_pool *pool)
ehca_dmp(cpu_online_mask, cpumask_size(), "");
spin_lock_irqsave(&pool->last_cpu_lock, flags);
- cpu = cpumask_next(pool->last_cpu, cpu_online_mask);
- if (cpu >= nr_cpu_ids)
- cpu = cpumask_first(cpu_online_mask);
- pool->last_cpu = cpu;
+ while (1) {
+ cpu = cpumask_next(pool->last_cpu, cpu_online_mask);
+ if (cpu >= nr_cpu_ids)
+ cpu = cpumask_first(cpu_online_mask);
+ pool->last_cpu = cpu;
+ /* Might be on the way out */
+ if (per_cpu_ptr(pool->cpu_comp_tasks, cpu)->active)
+ break;
+ }
spin_unlock_irqrestore(&pool->last_cpu_lock, flags);
return cpu;
}
static void __queue_comp_task(struct ehca_cq *__cq,
- struct ehca_cpu_comp_task *cct)
+ struct ehca_cpu_comp_task *cct,
+ struct task_struct *thread)
{
unsigned long flags;
@@ -683,7 +690,7 @@ static void __queue_comp_task(struct ehca_cq *__cq,
__cq->nr_callbacks++;
list_add_tail(&__cq->entry, &cct->cq_list);
cct->cq_jobs++;
- wake_up(&cct->wait_queue);
+ wake_up_process(thread);
} else
__cq->nr_callbacks++;
@@ -695,6 +702,7 @@ static void queue_comp_task(struct ehca_cq *__cq)
{
int cpu_id;
struct ehca_cpu_comp_task *cct;
+ struct task_struct *thread;
int cq_jobs;
unsigned long flags;
@@ -702,7 +710,8 @@ static void queue_comp_task(struct ehca_cq *__cq)
BUG_ON(!cpu_online(cpu_id));
cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
- BUG_ON(!cct);
+ thread = per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
+ BUG_ON(!cct || !thread);
spin_lock_irqsave(&cct->task_lock, flags);
cq_jobs = cct->cq_jobs;
@@ -710,28 +719,25 @@ static void queue_comp_task(struct ehca_cq *__cq)
if (cq_jobs > 0) {
cpu_id = find_next_online_cpu(pool);
cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
- BUG_ON(!cct);
+ thread = per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
+ BUG_ON(!cct || !thread);
}
-
- __queue_comp_task(__cq, cct);
+ __queue_comp_task(__cq, cct, thread);
}
static void run_comp_task(struct ehca_cpu_comp_task *cct)
{
struct ehca_cq *cq;
- unsigned long flags;
-
- spin_lock_irqsave(&cct->task_lock, flags);
while (!list_empty(&cct->cq_list)) {
cq = list_entry(cct->cq_list.next, struct ehca_cq, entry);
- spin_unlock_irqrestore(&cct->task_lock, flags);
+ spin_unlock_irq(&cct->task_lock);
comp_event_callback(cq);
if (atomic_dec_and_test(&cq->nr_events))
wake_up(&cq->wait_completion);
- spin_lock_irqsave(&cct->task_lock, flags);
+ spin_lock_irq(&cct->task_lock);
spin_lock(&cq->task_lock);
cq->nr_callbacks--;
if (!cq->nr_callbacks) {
@@ -740,159 +746,76 @@ static void run_comp_task(struct ehca_cpu_comp_task *cct)
}
spin_unlock(&cq->task_lock);
}
-
- spin_unlock_irqrestore(&cct->task_lock, flags);
}
-static int comp_task(void *__cct)
+static void comp_task_park(unsigned int cpu)
{
- struct ehca_cpu_comp_task *cct = __cct;
- int cql_empty;
- DECLARE_WAITQUEUE(wait, current);
-
- set_current_state(TASK_INTERRUPTIBLE);
- while (!kthread_should_stop()) {
- add_wait_queue(&cct->wait_queue, &wait);
-
- spin_lock_irq(&cct->task_lock);
- cql_empty = list_empty(&cct->cq_list);
- spin_unlock_irq(&cct->task_lock);
- if (cql_empty)
- schedule();
- else
- __set_current_state(TASK_RUNNING);
-
- remove_wait_queue(&cct->wait_queue, &wait);
+ struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+ struct ehca_cpu_comp_task *target;
+ struct task_struct *thread;
+ struct ehca_cq *cq, *tmp;
+ LIST_HEAD(list);
- spin_lock_irq(&cct->task_lock);
- cql_empty = list_empty(&cct->cq_list);
- spin_unlock_irq(&cct->task_lock);
- if (!cql_empty)
- run_comp_task(__cct);
+ spin_lock_irq(&cct->task_lock);
+ cct->cq_jobs = 0;
+ cct->active = 0;
+ list_splice_init(&cct->cq_list, &list);
+ spin_unlock_irq(&cct->task_lock);
- set_current_state(TASK_INTERRUPTIBLE);
+ cpu = find_next_online_cpu(pool);
+ target = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+ thread = per_cpu_ptr(pool->cpu_comp_threads, cpu);
+ spin_lock_irq(&target->task_lock);
+ list_for_each_entry_safe(cq, tmp, &list, entry) {
+ list_del(&cq->entry);
+ __queue_comp_task(cq, target, thread);
}
- __set_current_state(TASK_RUNNING);
-
- return 0;
-}
-
-static struct task_struct *create_comp_task(struct ehca_comp_pool *pool,
- int cpu)
-{
- struct ehca_cpu_comp_task *cct;
-
- cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
- spin_lock_init(&cct->task_lock);
- INIT_LIST_HEAD(&cct->cq_list);
- init_waitqueue_head(&cct->wait_queue);
- cct->task = kthread_create_on_node(comp_task, cct, cpu_to_node(cpu),
- "ehca_comp/%d", cpu);
-
- return cct->task;
+ spin_unlock_irq(&target->task_lock);
}
-static void destroy_comp_task(struct ehca_comp_pool *pool,
- int cpu)
+static void comp_task_stop(unsigned int cpu, bool online)
{
- struct ehca_cpu_comp_task *cct;
- struct task_struct *task;
- unsigned long flags_cct;
-
- cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
-
- spin_lock_irqsave(&cct->task_lock, flags_cct);
+ struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
- task = cct->task;
- cct->task = NULL;
+ spin_lock_irq(&cct->task_lock);
cct->cq_jobs = 0;
-
- spin_unlock_irqrestore(&cct->task_lock, flags_cct);
-
- if (task)
- kthread_stop(task);
+ cct->active = 0;
+ WARN_ON(!list_empty(&cct->cq_list));
+ spin_unlock_irq(&cct->task_lock);
}
-static void __cpuinit take_over_work(struct ehca_comp_pool *pool, int cpu)
+static int comp_task_should_run(unsigned int cpu)
{
struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
- LIST_HEAD(list);
- struct ehca_cq *cq;
- unsigned long flags_cct;
-
- spin_lock_irqsave(&cct->task_lock, flags_cct);
-
- list_splice_init(&cct->cq_list, &list);
-
- while (!list_empty(&list)) {
- cq = list_entry(cct->cq_list.next, struct ehca_cq, entry);
-
- list_del(&cq->entry);
- __queue_comp_task(cq, this_cpu_ptr(pool->cpu_comp_tasks));
- }
-
- spin_unlock_irqrestore(&cct->task_lock, flags_cct);
+ return cct->cq_jobs;
}
-static int __cpuinit comp_pool_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+static void comp_task(unsigned int cpu)
{
- unsigned int cpu = (unsigned long)hcpu;
- struct ehca_cpu_comp_task *cct;
+ struct ehca_cpu_comp_task *cct = this_cpu_ptr(pool->cpu_comp_tasks);
+ int cql_empty;
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- ehca_gen_dbg("CPU: %x (CPU_PREPARE)", cpu);
- if (!create_comp_task(pool, cpu)) {
- ehca_gen_err("Can't create comp_task for cpu: %x", cpu);
- return notifier_from_errno(-ENOMEM);
- }
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- ehca_gen_dbg("CPU: %x (CPU_CANCELED)", cpu);
- cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
- kthread_bind(cct->task, cpumask_any(cpu_online_mask));
- destroy_comp_task(pool, cpu);
- break;
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- ehca_gen_dbg("CPU: %x (CPU_ONLINE)", cpu);
- cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
- kthread_bind(cct->task, cpu);
- wake_up_process(cct->task);
- break;
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- ehca_gen_dbg("CPU: %x (CPU_DOWN_PREPARE)", cpu);
- break;
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- ehca_gen_dbg("CPU: %x (CPU_DOWN_FAILED)", cpu);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- ehca_gen_dbg("CPU: %x (CPU_DEAD)", cpu);
- destroy_comp_task(pool, cpu);
- take_over_work(pool, cpu);
- break;
+ spin_lock_irq(&cct->task_lock);
+ cql_empty = list_empty(&cct->cq_list);
+ if (!cql_empty) {
+ __set_current_state(TASK_RUNNING);
+ run_comp_task(cct);
}
-
- return NOTIFY_OK;
+ spin_unlock_irq(&cct->task_lock);
}
-static struct notifier_block comp_pool_callback_nb __cpuinitdata = {
- .notifier_call = comp_pool_callback,
- .priority = 0,
+static struct smp_hotplug_thread comp_pool_threads = {
+ .thread_should_run = comp_task_should_run,
+ .thread_fn = comp_task,
+ .thread_comm = "ehca_comp/%u",
+ .cleanup = comp_task_stop,
+ .park = comp_task_park,
};
int ehca_create_comp_pool(void)
{
- int cpu;
- struct task_struct *task;
+ int cpu, ret = -ENOMEM;
if (!ehca_scaling_code)
return 0;
@@ -905,38 +828,46 @@ int ehca_create_comp_pool(void)
pool->last_cpu = cpumask_any(cpu_online_mask);
pool->cpu_comp_tasks = alloc_percpu(struct ehca_cpu_comp_task);
- if (pool->cpu_comp_tasks == NULL) {
- kfree(pool);
- return -EINVAL;
- }
+ if (!pool->cpu_comp_tasks)
+ goto out_pool;
- for_each_online_cpu(cpu) {
- task = create_comp_task(pool, cpu);
- if (task) {
- kthread_bind(task, cpu);
- wake_up_process(task);
- }
+ pool->cpu_comp_threads = alloc_percpu(struct task_struct *);
+ if (!pool->cpu_comp_threads)
+ goto out_tasks;
+
+ for_each_present_cpu(cpu) {
+ struct ehca_cpu_comp_task *cct;
+
+ cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+ spin_lock_init(&cct->task_lock);
+ INIT_LIST_HEAD(&cct->cq_list);
}
- register_hotcpu_notifier(&comp_pool_callback_nb);
+ comp_pool_threads.store = pool->cpu_comp_threads;
+ ret = smpboot_register_percpu_thread(&comp_pool_threads);
+ if (ret)
+ goto out_threads;
- printk(KERN_INFO "eHCA scaling code enabled\n");
+ pr_info("eHCA scaling code enabled\n");
+ return ret;
- return 0;
+out_threads:
+ free_percpu(pool->cpu_comp_threads);
+out_tasks:
+ free_percpu(pool->cpu_comp_tasks);
+out_pool:
+ kfree(pool);
+ return ret;
}
void ehca_destroy_comp_pool(void)
{
- int i;
-
if (!ehca_scaling_code)
return;
- unregister_hotcpu_notifier(&comp_pool_callback_nb);
-
- for_each_online_cpu(i)
- destroy_comp_task(pool, i);
+ smpboot_unregister_percpu_thread(&comp_pool_threads);
+ free_percpu(pool->cpu_comp_threads);
free_percpu(pool->cpu_comp_tasks);
kfree(pool);
}
diff --git a/drivers/infiniband/hw/ehca/ehca_irq.h b/drivers/infiniband/hw/ehca/ehca_irq.h
index 3346cb06cea6..5370199f08c7 100644
--- a/drivers/infiniband/hw/ehca/ehca_irq.h
+++ b/drivers/infiniband/hw/ehca/ehca_irq.h
@@ -58,15 +58,15 @@ void ehca_tasklet_eq(unsigned long data);
void ehca_process_eq(struct ehca_shca *shca, int is_irq);
struct ehca_cpu_comp_task {
- wait_queue_head_t wait_queue;
struct list_head cq_list;
- struct task_struct *task;
spinlock_t task_lock;
int cq_jobs;
+ int active;
};
struct ehca_comp_pool {
- struct ehca_cpu_comp_task *cpu_comp_tasks;
+ struct ehca_cpu_comp_task __percpu *cpu_comp_tasks;
+ struct task_struct * __percpu *cpu_comp_threads;
int last_cpu;
spinlock_t last_cpu_lock;
};
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 6c26a3da0e03..5ab7183313ce 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -57,6 +57,7 @@ struct cpuidle_state {
/* Idle State Flags */
#define CPUIDLE_FLAG_TIME_VALID (0x01) /* is residency time measurable? */
+#define CPUIDLE_FLAG_COUPLED (0x02) /* state applies to multiple cpus */
#define CPUIDLE_DRIVER_FLAGS_MASK (0xFFFF0000)
@@ -100,6 +101,12 @@ struct cpuidle_device {
struct list_head device_list;
struct kobject kobj;
struct completion kobj_unregister;
+
+#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
+ int safe_state_index;
+ cpumask_t coupled_cpus;
+ struct cpuidle_coupled *coupled;
+#endif
};
DECLARE_PER_CPU(struct cpuidle_device *, cpuidle_devices);
@@ -176,6 +183,10 @@ static inline int cpuidle_play_dead(void) {return -ENODEV; }
#endif
+#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
+void cpuidle_coupled_parallel_barrier(struct cpuidle_device *dev, atomic_t *a);
+#endif
+
/******************************
* CPUIDLE GOVERNOR INTERFACE *
******************************/
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 0714b24c0e45..5365347a0dc9 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -14,6 +14,11 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
kthread_create_on_node(threadfn, data, -1, namefmt, ##arg)
+struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
+ void *data,
+ unsigned int cpu,
+ const char *namefmt);
+
/**
* kthread_run - create and wake a thread.
* @threadfn: the function to run until signal_pending(current).
@@ -34,9 +39,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
void kthread_bind(struct task_struct *k, unsigned int cpu);
int kthread_stop(struct task_struct *k);
-int kthread_should_stop(void);
+bool kthread_should_stop(void);
+bool kthread_should_park(void);
bool kthread_freezable_should_stop(bool *was_frozen);
void *kthread_data(struct task_struct *k);
+int kthread_park(struct task_struct *k);
+void kthread_unpark(struct task_struct *k);
+void kthread_parkme(void);
int kthreadd(void *unused);
extern struct task_struct *kthreadd_task;
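
The park/unpark calls added above are the primitives the new smpboot code further down in this merge is built on. As a rough illustration, a hypothetical per-cpu worker written directly against this API might look like the following sketch; the function name and the work placeholder are made up:

#include <linux/kthread.h>
#include <linux/sched.h>

static int my_percpu_worker(void *data)
{
	for (;;) {
		if (kthread_should_stop())
			break;

		if (kthread_should_park())
			kthread_parkme();	/* sleeps here until kthread_unpark() */

		/* ... handle any pending per-cpu work ... */

		set_current_state(TASK_INTERRUPTIBLE);
		if (!kthread_should_stop() && !kthread_should_park())
			schedule();
		__set_current_state(TASK_RUNNING);
	}
	return 0;
}
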
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 71d51c1c3f59..2b5fba1e9f6a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1100,6 +1100,7 @@ struct sched_class {
#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
+ void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
void (*post_schedule) (struct rq *this_rq);
@@ -1134,6 +1135,15 @@ struct load_weight {
unsigned long weight, inv_weight;
};
+struct sched_avg {
+ u32 runnable_avg_sum, runnable_avg_period;
+ u64 last_runnable_update;
+ s64 decay_count;
+ unsigned long load_avg_contrib;
+ unsigned long load_avg_ratio;
+ u32 usage_avg_sum;
+};
+
#ifdef CONFIG_SCHEDSTATS
struct sched_statistics {
u64 wait_start;
@@ -1194,6 +1204,15 @@ struct sched_entity {
/* rq "owned" by this entity/group: */
struct cfs_rq *my_q;
#endif
+/*
+ * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
+ * removed when useful for applications beyond shares distribution (e.g.
+ * load-balance).
+ */
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+ /* Per-entity load-tracking */
+ struct sched_avg avg;
+#endif
};
struct sched_rt_entity {
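
For orientation, the runnable_avg_sum / runnable_avg_period pair added above is what the 0..1023 ratios reported by the tracepoints later in this merge are derived from; the actual decayed accounting lives in kernel/sched/fair.c, which is not shown in full here. A rough, illustrative calculation under the assumption that the ratio is simply the runnable fraction scaled to 1024:

#include <stdio.h>

int main(void)
{
	/* Assumed sample values: runnable roughly half of the tracked period. */
	unsigned long runnable_avg_sum = 23500;
	unsigned long runnable_avg_period = 47000;

	unsigned long ratio = runnable_avg_sum * 1024 /
			      (runnable_avg_period + 1);

	printf("load_avg_ratio ~%lu of 1023\n", ratio);	/* prints ~511 */
	return 0;
}
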
diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
new file mode 100644
index 000000000000..e0106d8581d3
--- /dev/null
+++ b/include/linux/smpboot.h
@@ -0,0 +1,43 @@
+#ifndef _LINUX_SMPBOOT_H
+#define _LINUX_SMPBOOT_H
+
+#include <linux/types.h>
+
+struct task_struct;
+/* Cookie handed to the thread_fn */
+struct smpboot_thread_data;
+
+/**
+ * struct smp_hotplug_thread - CPU hotplug related thread descriptor
+ * @store: Pointer to per cpu storage for the task pointers
+ * @list: List head for core management
+ * @thread_should_run: Check whether the thread should run or not. Called with
+ * preemption disabled.
+ * @thread_fn: The associated thread function
+ * @setup: Optional setup function, called when the thread becomes
+ * operational for the first time
+ * @cleanup: Optional cleanup function, called when the thread
+ * should stop (module exit)
+ * @park: Optional park function, called when the thread is
+ * parked (cpu offline)
+ * @unpark: Optional unpark function, called when the thread is
+ * unparked (cpu online)
+ * @thread_comm: The base name of the thread
+ */
+struct smp_hotplug_thread {
+ struct task_struct __percpu **store;
+ struct list_head list;
+ int (*thread_should_run)(unsigned int cpu);
+ void (*thread_fn)(unsigned int cpu);
+ void (*setup)(unsigned int cpu);
+ void (*cleanup)(unsigned int cpu, bool online);
+ void (*park)(unsigned int cpu);
+ void (*unpark)(unsigned int cpu);
+ const char *thread_comm;
+};
+
+int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread);
+void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
+int smpboot_thread_schedule(void);
+
+#endif
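For orientation, a minimal consumer of this new interface might look like the sketch below; all names are hypothetical and the usual percpu/smpboot includes are assumed. The real conversion in this merge is the rcu_cpu_thread_spec descriptor added to kernel/rcutree_plugin.h further down.

/* Hypothetical smpboot client: one thread per cpu, parked across hotplug. */
static DEFINE_PER_CPU(struct task_struct *, example_task);
static DEFINE_PER_CPU(int, example_has_work);

static int example_should_run(unsigned int cpu)
{
	/* called with preemption disabled on @cpu */
	return __this_cpu_read(example_has_work);
}

static void example_thread_fn(unsigned int cpu)
{
	__this_cpu_write(example_has_work, 0);
	/* per-cpu work goes here; return when done, the core loop re-calls us */
}

static struct smp_hotplug_thread example_threads = {
	.store			= &example_task,
	.thread_should_run	= example_should_run,
	.thread_fn		= example_thread_fn,
	.thread_comm		= "example/%u",
};

/* registration, e.g. from an __init function:
 *	BUG_ON(smpboot_register_percpu_thread(&example_threads));
 */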
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index ea7a2035456d..2c50a06fbedd 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -426,6 +426,157 @@ TRACE_EVENT(sched_pi_setprio,
__entry->oldprio, __entry->newprio)
);
+/*
+ * Tracepoint for showing tracked load contribution.
+ */
+TRACE_EVENT(sched_task_load_contrib,
+
+ TP_PROTO(struct task_struct *tsk, unsigned long load_contrib),
+
+ TP_ARGS(tsk, load_contrib),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(unsigned long, load_contrib)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->load_contrib = load_contrib;
+ ),
+
+ TP_printk("comm=%s pid=%d load_contrib=%lu",
+ __entry->comm, __entry->pid,
+ __entry->load_contrib)
+);
+
+/*
+ * Tracepoint for showing tracked task runnable ratio [0..1023].
+ */
+TRACE_EVENT(sched_task_runnable_ratio,
+
+ TP_PROTO(struct task_struct *tsk, unsigned long ratio),
+
+ TP_ARGS(tsk, ratio),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(unsigned long, ratio)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->ratio = ratio;
+ ),
+
+ TP_printk("comm=%s pid=%d ratio=%lu",
+ __entry->comm, __entry->pid,
+ __entry->ratio)
+);
+
+/*
+ * Tracepoint for showing tracked rq runnable ratio [0..1023].
+ */
+TRACE_EVENT(sched_rq_runnable_ratio,
+
+ TP_PROTO(int cpu, unsigned long ratio),
+
+ TP_ARGS(cpu, ratio),
+
+ TP_STRUCT__entry(
+ __field(int, cpu)
+ __field(unsigned long, ratio)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->ratio = ratio;
+ ),
+
+ TP_printk("cpu=%d ratio=%lu",
+ __entry->cpu,
+ __entry->ratio)
+);
+
+/*
+ * Tracepoint for showing tracked rq runnable load.
+ */
+TRACE_EVENT(sched_rq_runnable_load,
+
+ TP_PROTO(int cpu, u64 load),
+
+ TP_ARGS(cpu, load),
+
+ TP_STRUCT__entry(
+ __field(int, cpu)
+ __field(u64, load)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->load = load;
+ ),
+
+ TP_printk("cpu=%d load=%llu",
+ __entry->cpu,
+ __entry->load)
+);
+
+/*
+ * Tracepoint for showing tracked task cpu usage ratio [0..1023].
+ */
+TRACE_EVENT(sched_task_usage_ratio,
+
+ TP_PROTO(struct task_struct *tsk, unsigned long ratio),
+
+ TP_ARGS(tsk, ratio),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(unsigned long, ratio)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->ratio = ratio;
+ ),
+
+ TP_printk("comm=%s pid=%d ratio=%lu",
+ __entry->comm, __entry->pid,
+ __entry->ratio)
+);
+
+/*
+ * Tracepoint for HMP (CONFIG_SCHED_HMP) task migrations.
+ */
+TRACE_EVENT(sched_hmp_migrate,
+
+ TP_PROTO(struct task_struct *tsk, int val),
+
+ TP_ARGS(tsk, val),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(int, val)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->val = val;
+ ),
+
+ TP_printk("comm=%s pid=%d val=%d",
+ __entry->comm, __entry->pid,
+ __entry->val)
+);
#endif /* _TRACE_SCHED_H */
/* This part must be outside protection */
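These tracepoints are emitted from the per-entity load-tracking code added to kernel/sched/fair.c below; for example, the task-contribution event is raised right after the contribution is recomputed. This is the call site from that hunk, repeated here only for orientation:

	trace_sched_task_load_contrib(task_of(se), se->avg.load_avg_contrib);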
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8e0da8c0ab5a..a792c9bf345a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -280,12 +280,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
__func__, cpu);
goto out_release;
}
+ smpboot_park_threads(cpu);
err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
if (err) {
/* CPU didn't die: tell everyone. Can't complain. */
+ smpboot_unpark_threads(cpu);
cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
-
goto out_release;
}
BUG_ON(cpu_online(cpu));
@@ -354,6 +355,10 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
goto out;
}
+ ret = smpboot_create_threads(cpu);
+ if (ret)
+ goto out;
+
ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
if (ret) {
nr_calls--;
@@ -368,6 +373,9 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
goto out_notify;
BUG_ON(!cpu_online(cpu));
+ /* Wake the per cpu threads */
+ smpboot_unpark_threads(cpu);
+
/* Now call notifier in preparation. */
cpu_notify(CPU_ONLINE | mod, hcpu);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3d3de633702e..571fa798323b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -37,11 +37,20 @@ struct kthread_create_info
};
struct kthread {
- int should_stop;
+ unsigned long flags;
+ unsigned int cpu;
void *data;
+ struct completion parked;
struct completion exited;
};
+enum KTHREAD_BITS {
+ KTHREAD_IS_PER_CPU = 0,
+ KTHREAD_SHOULD_STOP,
+ KTHREAD_SHOULD_PARK,
+ KTHREAD_IS_PARKED,
+};
+
#define to_kthread(tsk) \
container_of((tsk)->vfork_done, struct kthread, exited)
@@ -52,13 +61,29 @@ struct kthread {
* and this will return true. You should then return, and your return
* value will be passed through to kthread_stop().
*/
-int kthread_should_stop(void)
+bool kthread_should_stop(void)
{
- return to_kthread(current)->should_stop;
+ return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags);
}
EXPORT_SYMBOL(kthread_should_stop);
/**
+ * kthread_should_park - should this kthread park now?
+ *
+ * When someone calls kthread_park() on your kthread, it will be woken
+ * and this will return true. You should then do the necessary
+ * cleanup and call kthread_parkme().
+ *
+ * Similar to kthread_should_stop(), but this keeps the thread alive
+ * and in a park position. kthread_unpark() "restarts" the thread and
+ * calls the thread function again.
+ */
+bool kthread_should_park(void)
+{
+ return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags);
+}
+
+/**
* kthread_freezable_should_stop - should this freezable kthread return now?
* @was_frozen: optional out parameter, indicates whether %current was frozen
*
@@ -96,6 +121,24 @@ void *kthread_data(struct task_struct *task)
return to_kthread(task)->data;
}
+static void __kthread_parkme(struct kthread *self)
+{
+ __set_current_state(TASK_INTERRUPTIBLE);
+ while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
+ if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
+ complete(&self->parked);
+ schedule();
+ __set_current_state(TASK_INTERRUPTIBLE);
+ }
+ clear_bit(KTHREAD_IS_PARKED, &self->flags);
+ __set_current_state(TASK_RUNNING);
+}
+
+void kthread_parkme(void)
+{
+ __kthread_parkme(to_kthread(current));
+}
+
static int kthread(void *_create)
{
/* Copy data: it's on kthread's stack */
@@ -105,9 +148,10 @@ static int kthread(void *_create)
struct kthread self;
int ret;
- self.should_stop = 0;
+ self.flags = 0;
self.data = data;
init_completion(&self.exited);
+ init_completion(&self.parked);
current->vfork_done = &self.exited;
/* OK, tell user we're spawned, wait for stop or wakeup */
@@ -117,9 +161,11 @@ static int kthread(void *_create)
schedule();
ret = -EINTR;
- if (!self.should_stop)
- ret = threadfn(data);
+ if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) {
+ __kthread_parkme(&self);
+ ret = threadfn(data);
+ }
/* we can't just return, we must preserve "self" on stack */
do_exit(ret);
}
@@ -172,8 +218,7 @@ static void create_kthread(struct kthread_create_info *create)
* Returns a task_struct or ERR_PTR(-ENOMEM).
*/
struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
- void *data,
- int node,
+ void *data, int node,
const char namefmt[],
...)
{
@@ -210,6 +255,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
}
EXPORT_SYMBOL(kthread_create_on_node);
+static void __kthread_bind(struct task_struct *p, unsigned int cpu)
+{
+ /* It's safe because the task is inactive. */
+ do_set_cpus_allowed(p, cpumask_of(cpu));
+ p->flags |= PF_THREAD_BOUND;
+}
+
/**
* kthread_bind - bind a just-created kthread to a cpu.
* @p: thread created by kthread_create().
@@ -226,14 +278,112 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
WARN_ON(1);
return;
}
-
- /* It's safe because the task is inactive. */
- do_set_cpus_allowed(p, cpumask_of(cpu));
- p->flags |= PF_THREAD_BOUND;
+ __kthread_bind(p, cpu);
}
EXPORT_SYMBOL(kthread_bind);
/**
+ * kthread_create_on_cpu - Create a cpu bound kthread
+ * @threadfn: the function to run until signal_pending(current).
+ * @data: data ptr for @threadfn.
+ * @cpu: The cpu to which the thread should be bound.
+ * @namefmt: printf-style name for the thread. Format is restricted
+ * to "name.*%u". Code fills in cpu number.
+ *
+ * Description: This helper function creates and names a kernel thread.
+ * The thread will be woken and put into park mode.
+ */
+struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
+ void *data, unsigned int cpu,
+ const char *namefmt)
+{
+ struct task_struct *p;
+
+ p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt,
+ cpu);
+ if (IS_ERR(p))
+ return p;
+ set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
+ to_kthread(p)->cpu = cpu;
+ /* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */
+ kthread_park(p);
+ return p;
+}
+
+static struct kthread *task_get_live_kthread(struct task_struct *k)
+{
+ struct kthread *kthread;
+
+ get_task_struct(k);
+ kthread = to_kthread(k);
+ /* It might have exited */
+ barrier();
+ if (k->vfork_done != NULL)
+ return kthread;
+ return NULL;
+}
+
+/**
+ * kthread_unpark - unpark a thread created by kthread_create().
+ * @k: thread created by kthread_create().
+ *
+ * Sets kthread_should_park() for @k to return false and wakes it. If the
+ * thread is marked percpu then it is bound to the cpu again.
+ */
+void kthread_unpark(struct task_struct *k)
+{
+ struct kthread *kthread = task_get_live_kthread(k);
+
+ if (kthread) {
+ clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+ /*
+ * We clear the IS_PARKED bit here as we don't wait
+ * until the task has left the park code. So if we'd
+ * park before that happens we'd see the IS_PARKED bit
+ * which might be about to be cleared.
+ */
+ if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
+ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
+ __kthread_bind(k, kthread->cpu);
+ wake_up_process(k);
+ }
+ }
+ put_task_struct(k);
+}
+
+/**
+ * kthread_park - park a thread created by kthread_create().
+ * @k: thread created by kthread_create().
+ *
+ * Sets kthread_should_park() for @k to return true, wakes it, and
+ * waits for it to return. This can also be called after kthread_create()
+ * instead of calling wake_up_process(): the thread will park without
+ * calling threadfn().
+ *
+ * Returns 0 if the thread is parked, -ENOSYS if the thread exited.
+ * If called by the kthread itself just the park bit is set.
+ */
+int kthread_park(struct task_struct *k)
+{
+ struct kthread *kthread = task_get_live_kthread(k);
+ int ret = -ENOSYS;
+
+ if (kthread) {
+ if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
+ set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+ if (k != current) {
+ wake_up_process(k);
+ wait_for_completion(&kthread->parked);
+ }
+ }
+ ret = 0;
+ }
+ put_task_struct(k);
+ return ret;
+}
+
+/**
* kthread_stop - stop a thread created by kthread_create().
* @k: thread created by kthread_create().
*
@@ -250,16 +400,13 @@ EXPORT_SYMBOL(kthread_bind);
*/
int kthread_stop(struct task_struct *k)
{
- struct kthread *kthread;
+ struct kthread *kthread = task_get_live_kthread(k);
int ret;
trace_sched_kthread_stop(k);
- get_task_struct(k);
-
- kthread = to_kthread(k);
- barrier(); /* it might have exited */
- if (k->vfork_done != NULL) {
- kthread->should_stop = 1;
+ if (kthread) {
+ set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
+ clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
wake_up_process(k);
wait_for_completion(&kthread->exited);
}
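The park/unpark interface above is what the smpboot code elsewhere in this merge builds on. A minimal, hypothetical thread function honouring both the stop and park requests could be shaped as follows (sketch only; the loop structure is an editor's assumption, not lifted from the patch):

static int example_parkable_thread(void *data)
{
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (kthread_should_stop()) {
			__set_current_state(TASK_RUNNING);
			break;
		}
		if (kthread_should_park()) {
			__set_current_state(TASK_RUNNING);
			kthread_parkme();	/* sleeps until kthread_unpark() */
			continue;
		}
		schedule();			/* wait to be woken with work */
		__set_current_state(TASK_RUNNING);
		/* do the actual work here */
	}
	return 0;
}

/* lifecycle: created parked by kthread_create_on_cpu(), then
 *	kthread_unpark(t);	... cpu goes offline ...	kthread_park(t);
 *	kthread_stop(t);	once the thread is no longer needed
 */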
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 4b97bba7396e..a8ba38369efa 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -125,13 +125,12 @@ static int rcu_scheduler_fully_active __read_mostly;
*/
static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
-DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
DEFINE_PER_CPU(char, rcu_cpu_has_work);
#endif /* #ifdef CONFIG_RCU_BOOST */
-static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
static void invoke_rcu_core(void);
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
@@ -1461,8 +1460,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
/* Adjust any no-longer-needed kthreads. */
- rcu_stop_cpu_kthread(cpu);
- rcu_node_kthread_setaffinity(rnp, -1);
+ rcu_boost_kthread_setaffinity(rnp, -1);
/* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
@@ -2516,12 +2514,10 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
break;
case CPU_ONLINE:
case CPU_DOWN_FAILED:
- rcu_node_kthread_setaffinity(rnp, -1);
- rcu_cpu_kthread_setrt(cpu, 1);
+ rcu_boost_kthread_setaffinity(rnp, -1);
break;
case CPU_DOWN_PREPARE:
- rcu_node_kthread_setaffinity(rnp, cpu);
- rcu_cpu_kthread_setrt(cpu, 0);
+ rcu_boost_kthread_setaffinity(rnp, cpu);
break;
case CPU_DYING:
case CPU_DYING_FROZEN:
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 19b61ac1079f..1989438c68fb 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -192,12 +192,6 @@ struct rcu_node {
/* Refused to boost: not sure why, though. */
/* This can happen due to race conditions. */
#endif /* #ifdef CONFIG_RCU_BOOST */
- struct task_struct *node_kthread_task;
- /* kthread that takes care of this rcu_node */
- /* structure, for example, awakening the */
- /* per-CPU kthreads as needed. */
- unsigned int node_kthread_status;
- /* State of node_kthread_task for tracing. */
} ____cacheline_internodealigned_in_smp;
/*
@@ -449,7 +443,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
unsigned long flags);
-static void rcu_stop_cpu_kthread(int cpu);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
static void rcu_print_detail_task_stall(struct rcu_state *rsp);
static int rcu_print_task_stall(struct rcu_node *rnp);
@@ -479,15 +472,9 @@ static void invoke_rcu_callbacks_kthread(void);
static bool rcu_is_callbacks_kthread(void);
#ifdef CONFIG_RCU_BOOST
static void rcu_preempt_do_callbacks(void);
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
- cpumask_var_t cm);
static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
- struct rcu_node *rnp,
- int rnp_index);
-static void invoke_rcu_node_kthread(struct rcu_node *rnp);
-static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
+ struct rcu_node *rnp);
#endif /* #ifdef CONFIG_RCU_BOOST */
-static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
static void __cpuinit rcu_prepare_kthreads(int cpu);
static void rcu_prepare_for_idle_init(int cpu);
static void rcu_cleanup_after_idle(int cpu);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 3e4899459f3d..a0871b3cb3de 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,7 @@
*/
#include <linux/delay.h>
+#include <linux/smpboot.h>
#define RCU_KTHREAD_PRIO 1
@@ -1225,6 +1226,16 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
#endif /* #else #ifdef CONFIG_RCU_TRACE */
+static void rcu_wake_cond(struct task_struct *t, int status)
+{
+ /*
+ * If the thread is yielding, only wake it when this
+ * is invoked from idle
+ */
+ if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
+ wake_up_process(t);
+}
+
/*
* Carry out RCU priority boosting on the task indicated by ->exp_tasks
* or ->boost_tasks, advancing the pointer to the next task in the
@@ -1297,17 +1308,6 @@ static int rcu_boost(struct rcu_node *rnp)
}
/*
- * Timer handler to initiate waking up of boost kthreads that
- * have yielded the CPU due to excessive numbers of tasks to
- * boost. We wake up the per-rcu_node kthread, which in turn
- * will wake up the booster kthread.
- */
-static void rcu_boost_kthread_timer(unsigned long arg)
-{
- invoke_rcu_node_kthread((struct rcu_node *)arg);
-}
-
-/*
* Priority-boosting kthread. One per leaf rcu_node and one for the
* root rcu_node.
*/
@@ -1330,8 +1330,9 @@ static int rcu_boost_kthread(void *arg)
else
spincnt = 0;
if (spincnt > 10) {
+ rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
trace_rcu_utilization("End boost kthread@rcu_yield");
- rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
+ schedule_timeout_interruptible(2);
trace_rcu_utilization("Start boost kthread@rcu_yield");
spincnt = 0;
}
@@ -1369,8 +1370,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
rnp->boost_tasks = rnp->gp_tasks;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
t = rnp->boost_kthread_task;
- if (t != NULL)
- wake_up_process(t);
+ if (t)
+ rcu_wake_cond(t, rnp->boost_kthread_status);
} else {
rcu_initiate_boost_trace(rnp);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1387,8 +1388,10 @@ static void invoke_rcu_callbacks_kthread(void)
local_irq_save(flags);
__this_cpu_write(rcu_cpu_has_work, 1);
if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
- current != __this_cpu_read(rcu_cpu_kthread_task))
- wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
+ current != __this_cpu_read(rcu_cpu_kthread_task)) {
+ rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
+ __this_cpu_read(rcu_cpu_kthread_status));
+ }
local_irq_restore(flags);
}
@@ -1401,21 +1404,6 @@ static bool rcu_is_callbacks_kthread(void)
return __get_cpu_var(rcu_cpu_kthread_task) == current;
}
-/*
- * Set the affinity of the boost kthread. The CPU-hotplug locks are
- * held, so no one should be messing with the existence of the boost
- * kthread.
- */
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
- cpumask_var_t cm)
-{
- struct task_struct *t;
-
- t = rnp->boost_kthread_task;
- if (t != NULL)
- set_cpus_allowed_ptr(rnp->boost_kthread_task, cm);
-}
-
#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
/*
@@ -1432,15 +1420,19 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
* Returns zero if all is well, a negated errno otherwise.
*/
static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
- struct rcu_node *rnp,
- int rnp_index)
+ struct rcu_node *rnp)
{
+ int rnp_index = rnp - &rsp->node[0];
unsigned long flags;
struct sched_param sp;
struct task_struct *t;
if (&rcu_preempt_state != rsp)
return 0;
+
+ if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0)
+ return 0;
+
rsp->boost = 1;
if (rnp->boost_kthread_task != NULL)
return 0;
@@ -1457,25 +1449,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
return 0;
}
-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Stop the RCU's per-CPU kthread when its CPU goes offline,.
- */
-static void rcu_stop_cpu_kthread(int cpu)
-{
- struct task_struct *t;
-
- /* Stop the CPU's kthread. */
- t = per_cpu(rcu_cpu_kthread_task, cpu);
- if (t != NULL) {
- per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
- kthread_stop(t);
- }
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
static void rcu_kthread_do_work(void)
{
rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
@@ -1483,112 +1456,22 @@ static void rcu_kthread_do_work(void)
rcu_preempt_do_callbacks();
}
-/*
- * Wake up the specified per-rcu_node-structure kthread.
- * Because the per-rcu_node kthreads are immortal, we don't need
- * to do anything to keep them alive.
- */
-static void invoke_rcu_node_kthread(struct rcu_node *rnp)
+static void rcu_cpu_kthread_setup(unsigned int cpu)
{
- struct task_struct *t;
-
- t = rnp->node_kthread_task;
- if (t != NULL)
- wake_up_process(t);
-}
-
-/*
- * Set the specified CPU's kthread to run RT or not, as specified by
- * the to_rt argument. The CPU-hotplug locks are held, so the task
- * is not going away.
- */
-static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
-{
- int policy;
struct sched_param sp;
- struct task_struct *t;
-
- t = per_cpu(rcu_cpu_kthread_task, cpu);
- if (t == NULL)
- return;
- if (to_rt) {
- policy = SCHED_FIFO;
- sp.sched_priority = RCU_KTHREAD_PRIO;
- } else {
- policy = SCHED_NORMAL;
- sp.sched_priority = 0;
- }
- sched_setscheduler_nocheck(t, policy, &sp);
-}
-
-/*
- * Timer handler to initiate the waking up of per-CPU kthreads that
- * have yielded the CPU due to excess numbers of RCU callbacks.
- * We wake up the per-rcu_node kthread, which in turn will wake up
- * the booster kthread.
- */
-static void rcu_cpu_kthread_timer(unsigned long arg)
-{
- struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
- struct rcu_node *rnp = rdp->mynode;
- atomic_or(rdp->grpmask, &rnp->wakemask);
- invoke_rcu_node_kthread(rnp);
+ sp.sched_priority = RCU_KTHREAD_PRIO;
+ sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
}
-/*
- * Drop to non-real-time priority and yield, but only after posting a
- * timer that will cause us to regain our real-time priority if we
- * remain preempted. Either way, we restore our real-time priority
- * before returning.
- */
-static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
+static void rcu_cpu_kthread_park(unsigned int cpu)
{
- struct sched_param sp;
- struct timer_list yield_timer;
- int prio = current->rt_priority;
-
- setup_timer_on_stack(&yield_timer, f, arg);
- mod_timer(&yield_timer, jiffies + 2);
- sp.sched_priority = 0;
- sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
- set_user_nice(current, 19);
- schedule();
- set_user_nice(current, 0);
- sp.sched_priority = prio;
- sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
- del_timer(&yield_timer);
+ per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
}
-/*
- * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
- * This can happen while the corresponding CPU is either coming online
- * or going offline. We cannot wait until the CPU is fully online
- * before starting the kthread, because the various notifier functions
- * can wait for RCU grace periods. So we park rcu_cpu_kthread() until
- * the corresponding CPU is online.
- *
- * Return 1 if the kthread needs to stop, 0 otherwise.
- *
- * Caller must disable bh. This function can momentarily enable it.
- */
-static int rcu_cpu_kthread_should_stop(int cpu)
+static int rcu_cpu_kthread_should_run(unsigned int cpu)
{
- while (cpu_is_offline(cpu) ||
- !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
- smp_processor_id() != cpu) {
- if (kthread_should_stop())
- return 1;
- per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
- per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
- local_bh_enable();
- schedule_timeout_uninterruptible(1);
- if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
- set_cpus_allowed_ptr(current, cpumask_of(cpu));
- local_bh_disable();
- }
- per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
- return 0;
+ return __get_cpu_var(rcu_cpu_has_work);
}
/*
@@ -1596,138 +1479,35 @@ static int rcu_cpu_kthread_should_stop(int cpu)
* RCU softirq used in flavors and configurations of RCU that do not
* support RCU priority boosting.
*/
-static int rcu_cpu_kthread(void *arg)
+static void rcu_cpu_kthread(unsigned int cpu)
{
- int cpu = (int)(long)arg;
- unsigned long flags;
- int spincnt = 0;
- unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
- char work;
- char *workp = &per_cpu(rcu_cpu_has_work, cpu);
+ unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status);
+ char work, *workp = &__get_cpu_var(rcu_cpu_has_work);
+ int spincnt;
- trace_rcu_utilization("Start CPU kthread@init");
- for (;;) {
- *statusp = RCU_KTHREAD_WAITING;
- trace_rcu_utilization("End CPU kthread@rcu_wait");
- rcu_wait(*workp != 0 || kthread_should_stop());
+ for (spincnt = 0; spincnt < 10; spincnt++) {
trace_rcu_utilization("Start CPU kthread@rcu_wait");
local_bh_disable();
- if (rcu_cpu_kthread_should_stop(cpu)) {
- local_bh_enable();
- break;
- }
*statusp = RCU_KTHREAD_RUNNING;
- per_cpu(rcu_cpu_kthread_loops, cpu)++;
- local_irq_save(flags);
+ this_cpu_inc(rcu_cpu_kthread_loops);
+ local_irq_disable();
work = *workp;
*workp = 0;
- local_irq_restore(flags);
+ local_irq_enable();
if (work)
rcu_kthread_do_work();
local_bh_enable();
- if (*workp != 0)
- spincnt++;
- else
- spincnt = 0;
- if (spincnt > 10) {
- *statusp = RCU_KTHREAD_YIELDING;
- trace_rcu_utilization("End CPU kthread@rcu_yield");
- rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
- trace_rcu_utilization("Start CPU kthread@rcu_yield");
- spincnt = 0;
- }
- }
- *statusp = RCU_KTHREAD_STOPPED;
- trace_rcu_utilization("End CPU kthread@term");
- return 0;
-}
-
-/*
- * Spawn a per-CPU kthread, setting up affinity and priority.
- * Because the CPU hotplug lock is held, no other CPU will be attempting
- * to manipulate rcu_cpu_kthread_task. There might be another CPU
- * attempting to access it during boot, but the locking in kthread_bind()
- * will enforce sufficient ordering.
- *
- * Please note that we cannot simply refuse to wake up the per-CPU
- * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state,
- * which can result in softlockup complaints if the task ends up being
- * idle for more than a couple of minutes.
- *
- * However, please note also that we cannot bind the per-CPU kthread to its
- * CPU until that CPU is fully online. We also cannot wait until the
- * CPU is fully online before we create its per-CPU kthread, as this would
- * deadlock the system when CPU notifiers tried waiting for grace
- * periods. So we bind the per-CPU kthread to its CPU only if the CPU
- * is online. If its CPU is not yet fully online, then the code in
- * rcu_cpu_kthread() will wait until it is fully online, and then do
- * the binding.
- */
-static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
-{
- struct sched_param sp;
- struct task_struct *t;
-
- if (!rcu_scheduler_fully_active ||
- per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
- return 0;
- t = kthread_create_on_node(rcu_cpu_kthread,
- (void *)(long)cpu,
- cpu_to_node(cpu),
- "rcuc/%d", cpu);
- if (IS_ERR(t))
- return PTR_ERR(t);
- if (cpu_online(cpu))
- kthread_bind(t, cpu);
- per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
- WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
- sp.sched_priority = RCU_KTHREAD_PRIO;
- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
- per_cpu(rcu_cpu_kthread_task, cpu) = t;
- wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */
- return 0;
-}
-
-/*
- * Per-rcu_node kthread, which is in charge of waking up the per-CPU
- * kthreads when needed. We ignore requests to wake up kthreads
- * for offline CPUs, which is OK because force_quiescent_state()
- * takes care of this case.
- */
-static int rcu_node_kthread(void *arg)
-{
- int cpu;
- unsigned long flags;
- unsigned long mask;
- struct rcu_node *rnp = (struct rcu_node *)arg;
- struct sched_param sp;
- struct task_struct *t;
-
- for (;;) {
- rnp->node_kthread_status = RCU_KTHREAD_WAITING;
- rcu_wait(atomic_read(&rnp->wakemask) != 0);
- rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- mask = atomic_xchg(&rnp->wakemask, 0);
- rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
- for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
- if ((mask & 0x1) == 0)
- continue;
- preempt_disable();
- t = per_cpu(rcu_cpu_kthread_task, cpu);
- if (!cpu_online(cpu) || t == NULL) {
- preempt_enable();
- continue;
- }
- per_cpu(rcu_cpu_has_work, cpu) = 1;
- sp.sched_priority = RCU_KTHREAD_PRIO;
- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
- preempt_enable();
+ if (*workp == 0) {
+ trace_rcu_utilization("End CPU kthread@rcu_wait");
+ *statusp = RCU_KTHREAD_WAITING;
+ return;
}
}
- /* NOTREACHED */
- rnp->node_kthread_status = RCU_KTHREAD_STOPPED;
- return 0;
+ *statusp = RCU_KTHREAD_YIELDING;
+ trace_rcu_utilization("Start CPU kthread@rcu_yield");
+ schedule_timeout_interruptible(2);
+ trace_rcu_utilization("End CPU kthread@rcu_yield");
+ *statusp = RCU_KTHREAD_WAITING;
}
/*
@@ -1739,17 +1519,17 @@ static int rcu_node_kthread(void *arg)
* no outgoing CPU. If there are no CPUs left in the affinity set,
* this function allows the kthread to execute on any CPU.
*/
-static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
+ struct task_struct *t = rnp->boost_kthread_task;
+ unsigned long mask = rnp->qsmaskinit;
cpumask_var_t cm;
int cpu;
- unsigned long mask = rnp->qsmaskinit;
- if (rnp->node_kthread_task == NULL)
+ if (!t)
return;
- if (!alloc_cpumask_var(&cm, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
return;
- cpumask_clear(cm);
for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
if ((mask & 0x1) && cpu != outgoingcpu)
cpumask_set_cpu(cpu, cm);
@@ -1759,62 +1539,36 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
cpumask_clear_cpu(cpu, cm);
WARN_ON_ONCE(cpumask_weight(cm) == 0);
}
- set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
- rcu_boost_kthread_setaffinity(rnp, cm);
+ set_cpus_allowed_ptr(t, cm);
free_cpumask_var(cm);
}
-/*
- * Spawn a per-rcu_node kthread, setting priority and affinity.
- * Called during boot before online/offline can happen, or, if
- * during runtime, with the main CPU-hotplug locks held. So only
- * one of these can be executing at a time.
- */
-static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
- struct rcu_node *rnp)
-{
- unsigned long flags;
- int rnp_index = rnp - &rsp->node[0];
- struct sched_param sp;
- struct task_struct *t;
-
- if (!rcu_scheduler_fully_active ||
- rnp->qsmaskinit == 0)
- return 0;
- if (rnp->node_kthread_task == NULL) {
- t = kthread_create(rcu_node_kthread, (void *)rnp,
- "rcun/%d", rnp_index);
- if (IS_ERR(t))
- return PTR_ERR(t);
- raw_spin_lock_irqsave(&rnp->lock, flags);
- rnp->node_kthread_task = t;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- sp.sched_priority = 99;
- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
- wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
- }
- return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
-}
+static struct smp_hotplug_thread rcu_cpu_thread_spec = {
+ .store = &rcu_cpu_kthread_task,
+ .thread_should_run = rcu_cpu_kthread_should_run,
+ .thread_fn = rcu_cpu_kthread,
+ .thread_comm = "rcuc/%u",
+ .setup = rcu_cpu_kthread_setup,
+ .park = rcu_cpu_kthread_park,
+};
/*
* Spawn all kthreads -- called as soon as the scheduler is running.
*/
static int __init rcu_spawn_kthreads(void)
{
- int cpu;
struct rcu_node *rnp;
+ int cpu;
rcu_scheduler_fully_active = 1;
- for_each_possible_cpu(cpu) {
+ for_each_possible_cpu(cpu)
per_cpu(rcu_cpu_has_work, cpu) = 0;
- if (cpu_online(cpu))
- (void)rcu_spawn_one_cpu_kthread(cpu);
- }
+ BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
rnp = rcu_get_root(rcu_state);
- (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
+ (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
if (NUM_RCU_NODES > 1) {
rcu_for_each_leaf_node(rcu_state, rnp)
- (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
+ (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
}
return 0;
}
@@ -1826,11 +1580,8 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
struct rcu_node *rnp = rdp->mynode;
/* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
- if (rcu_scheduler_fully_active) {
- (void)rcu_spawn_one_cpu_kthread(cpu);
- if (rnp->node_kthread_task == NULL)
- (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
- }
+ if (rcu_scheduler_fully_active)
+ (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
}
#else /* #ifdef CONFIG_RCU_BOOST */
@@ -1854,19 +1605,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
}
-#ifdef CONFIG_HOTPLUG_CPU
-
-static void rcu_stop_cpu_kthread(int cpu)
-{
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
-static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
-{
-}
-
-static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
}
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d4bc16ddd1d4..6b4c76ba3529 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -83,11 +83,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
rdp->nxttail[RCU_WAIT_TAIL]],
".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
#ifdef CONFIG_RCU_BOOST
- seq_printf(m, " kt=%d/%c/%d ktl=%x",
+ seq_printf(m, " kt=%d/%c ktl=%x",
per_cpu(rcu_cpu_has_work, rdp->cpu),
convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
rdp->cpu)),
- per_cpu(rcu_cpu_kthread_cpu, rdp->cpu),
per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
#endif /* #ifdef CONFIG_RCU_BOOST */
seq_printf(m, " b=%ld", rdp->blimit);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c812a63aa783..ad9c3c3fb04d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1109,6 +1109,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
trace_sched_migrate_task(p, new_cpu);
if (task_cpu(p) != new_cpu) {
+ if (p->sched_class->migrate_task_rq)
+ p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
}
@@ -1713,6 +1715,9 @@ static void __sched_fork(struct task_struct *p)
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+ p->se.avg.decay_count = 0;
+#endif
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 6f79596e0ea9..b9d54d0d7bb0 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -61,14 +61,20 @@ static unsigned long nsec_low(unsigned long long nsec)
static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
{
struct sched_entity *se = tg->se[cpu];
- if (!se)
- return;
#define P(F) \
SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
#define PN(F) \
SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
+ if (!se) {
+ struct sched_avg *avg = &cpu_rq(cpu)->avg;
+ P(avg->runnable_avg_sum);
+ P(avg->runnable_avg_period);
+ return;
+ }
+
+
PN(se->exec_start);
PN(se->vruntime);
PN(se->sum_exec_runtime);
@@ -85,6 +91,13 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
P(se->statistics.wait_count);
#endif
P(se->load.weight);
+#ifdef CONFIG_SMP
+ P(se->avg.runnable_avg_sum);
+ P(se->avg.runnable_avg_period);
+ P(se->avg.usage_avg_sum);
+ P(se->avg.load_avg_contrib);
+ P(se->avg.decay_count);
+#endif
#undef PN
#undef P
}
@@ -206,14 +219,20 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
- SPLIT_NS(cfs_rq->load_avg));
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
- SPLIT_NS(cfs_rq->load_period));
- SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
- cfs_rq->load_contribution);
- SEQ_printf(m, " .%-30s: %d\n", "load_tg",
- atomic_read(&cfs_rq->tg->load_weight));
+ SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg",
+ cfs_rq->runnable_load_avg);
+ SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg",
+ cfs_rq->blocked_load_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
+ atomic64_read(&cfs_rq->tg->load_avg));
+ SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib",
+ cfs_rq->tg_load_contrib);
+ SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
+ cfs_rq->tg_runnable_contrib);
+ SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
+ atomic_read(&cfs_rq->tg->runnable_avg));
+ SEQ_printf(m, " .%-30s: %d\n", "tg->usage_avg",
+ atomic_read(&cfs_rq->tg->usage_avg));
#endif
print_cfs_group_stats(m, cpu, cfs_rq->tg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c099cc6eebe3..f705a87ac7b5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -653,9 +653,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
-static void update_cfs_shares(struct cfs_rq *cfs_rq);
-
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
@@ -675,10 +672,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
curr->vruntime += delta_exec_weighted;
update_min_vruntime(cfs_rq);
-
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
- cfs_rq->load_unacc_exec_time += delta_exec;
-#endif
}
static void update_curr(struct cfs_rq *cfs_rq)
@@ -801,72 +794,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-/* we need this in update_cfs_load and load-balance functions below */
-static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
# ifdef CONFIG_SMP
-static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
- int global_update)
-{
- struct task_group *tg = cfs_rq->tg;
- long load_avg;
-
- load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
- load_avg -= cfs_rq->load_contribution;
-
- if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
- atomic_add(load_avg, &tg->load_weight);
- cfs_rq->load_contribution += load_avg;
- }
-}
-
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
- u64 period = sysctl_sched_shares_window;
- u64 now, delta;
- unsigned long load = cfs_rq->load.weight;
-
- if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
- return;
-
- now = rq_of(cfs_rq)->clock_task;
- delta = now - cfs_rq->load_stamp;
-
- /* truncate load history at 4 idle periods */
- if (cfs_rq->load_stamp > cfs_rq->load_last &&
- now - cfs_rq->load_last > 4 * period) {
- cfs_rq->load_period = 0;
- cfs_rq->load_avg = 0;
- delta = period - 1;
- }
-
- cfs_rq->load_stamp = now;
- cfs_rq->load_unacc_exec_time = 0;
- cfs_rq->load_period += delta;
- if (load) {
- cfs_rq->load_last = now;
- cfs_rq->load_avg += delta * load;
- }
-
- /* consider updating load contribution on each fold or truncate */
- if (global_update || cfs_rq->load_period > period
- || !cfs_rq->load_period)
- update_cfs_rq_load_contribution(cfs_rq, global_update);
-
- while (cfs_rq->load_period > period) {
- /*
- * Inline assembly required to prevent the compiler
- * optimising this loop into a divmod call.
- * See __iter_div_u64_rem() for another example of this.
- */
- asm("" : "+rm" (cfs_rq->load_period));
- cfs_rq->load_period /= 2;
- cfs_rq->load_avg /= 2;
- }
-
- if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
- list_del_leaf_cfs_rq(cfs_rq);
-}
-
static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
{
long tg_weight;
@@ -876,8 +804,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
* to gain a more accurate current total weight. See
* update_cfs_rq_load_contribution().
*/
- tg_weight = atomic_read(&tg->load_weight);
- tg_weight -= cfs_rq->load_contribution;
+ tg_weight = atomic64_read(&tg->load_avg);
+ tg_weight -= cfs_rq->tg_load_contrib;
tg_weight += cfs_rq->load.weight;
return tg_weight;
@@ -901,27 +829,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
return shares;
}
-
-static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
- if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
- update_cfs_load(cfs_rq, 0);
- update_cfs_shares(cfs_rq);
- }
-}
# else /* CONFIG_SMP */
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
-}
-
static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
return tg->shares;
}
-
-static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
-}
# endif /* CONFIG_SMP */
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
@@ -939,6 +851,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
account_entity_enqueue(cfs_rq, se);
}
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
+
static void update_cfs_shares(struct cfs_rq *cfs_rq)
{
struct task_group *tg;
@@ -958,18 +872,491 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
reweight_entity(cfs_rq_of(se), se, shares);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
{
}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+/*
+ * We choose a half-life close to 1 scheduling period.
+ * Note: The tables below are dependent on this value.
+ */
+#define LOAD_AVG_PERIOD 32
+#define LOAD_AVG_MAX 46742 /* maximum possible load avg */
+#define LOAD_AVG_MAX_N 516 /* number of full periods it takes to produce max */
+
+/* Precomputed fixed inverse multiplies for multiplication by y^n */
+static const u32 runnable_avg_yN_inv[] = {
+ 0xffffffff, 0xfa83b2db, 0xf5257d15, 0xefe4b99b, 0xeac0c6e7, 0xe5b906e7,
+ 0xe0ccdeec, 0xdbfbb797, 0xd744fcca, 0xd2a81d91, 0xce248c15, 0xc9b9bd86,
+ 0xc5672a11, 0xc12c4cca, 0xbd08a39f, 0xb8fbaf47, 0xb504f333, 0xb123f581,
+ 0xad583eea, 0xa9a15ab4, 0xa5fed6a9, 0xa2704303, 0x9ef53260, 0x9b8d39b9,
+ 0x9837f051, 0x94f4efa8, 0x91c3d373, 0x8ea4398b, 0x8b95c1e3, 0x88980e80,
+ 0x85aac367, 0x82cd8698,
+};
+
+/* Precomputed \Sum y^k { 1<=k<=n } */
+static const u32 runnable_avg_yN_sum[] = {
+ 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
+ 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
+ 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
+};
+
+/*
+ * Approximate:
+ * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
+ */
+static __always_inline u64 decay_load(u64 val, u64 n)
+{
+ int local_n;
+ if (!n)
+ return val;
+ else if (unlikely(n > LOAD_AVG_PERIOD * 63))
+ return 0;
+
+ /* after the bounds check above, n fits in 32 bits */
+ local_n = n;
+
+ /*
+ * As y^PERIOD = 1/2, we can combine
+ * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
+ * With a look-up table which covers y^n (n<PERIOD)
+ *
+ * To achieve constant time decay_load.
+ */
+ if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
+ val >>= local_n / LOAD_AVG_PERIOD;
+ local_n %= LOAD_AVG_PERIOD;
+ }
+
+ val *= runnable_avg_yN_inv[local_n];
+ return SRR(val, 32);
+}
+
+/*
+ * For updates fully spanning n periods, the contribution to runnable
+ * average will be: \Sum 1024*y^n
+ *
+ * We can compute this reasonably efficiently by combining:
+ * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
+ */
+static u32 __compute_runnable_contrib(int n)
{
+ u32 contrib = 0;
+
+ if (likely(n <= LOAD_AVG_PERIOD))
+ return runnable_avg_yN_sum[n];
+ else if (unlikely(n >= LOAD_AVG_MAX_N))
+ return LOAD_AVG_MAX;
+
+ /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
+ do {
+ contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
+ contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
+
+ n -= LOAD_AVG_PERIOD;
+ } while (n > LOAD_AVG_PERIOD);
+
+ contrib = decay_load(contrib, n);
+ return contrib + runnable_avg_yN_sum[n];
}
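/*
 * Worked example (editor's illustration, not part of the patch): with the
 * half-life fixed at LOAD_AVG_PERIOD = 32, decay_load(1024, 32) shifts the
 * value right by one period (1024 >> 1 = 512) and then scales by
 * runnable_avg_yN_inv[0] ~= 2^32, leaving ~512.  decay_load(1024, 16) scales
 * by runnable_avg_yN_inv[16] = 0xb504f333 ~= 0.7071 * 2^32, giving ~724,
 * consistent with y^16 = sqrt(1/2).  Likewise __compute_runnable_contrib(32)
 * returns runnable_avg_yN_sum[32] = 23371, i.e. a task runnable for 32 full
 * 1024us periods has accrued exactly half of LOAD_AVG_MAX (46742), as
 * expected since y^32 = 1/2.
 */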
-static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+/* We can represent the historical contribution to runnable average as the
+ * coefficients of a geometric series. To do this we sub-divide our runnable
+ * history into segments of approximately 1ms (1024us); label the segment that
+ * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
+ *
+ * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
+ * p0 p1 p2
+ * (now) (~1ms ago) (~2ms ago)
+ *
+ * Let u_i denote the fraction of p_i that the entity was runnable.
+ *
+ * We then designate the fractions u_i as our coefficients, yielding the
+ * following representation of historical load:
+ * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
+ *
+ * We choose y based on the width of a reasonable scheduling period, fixing:
+ * y^32 = 0.5
+ *
+ * This means that the contribution to load ~32ms ago (u_32) will be weighted
+ * approximately half as much as the contribution to load within the last ms
+ * (u_0).
+ *
+ * When a period "rolls over" and we have new u_0`, multiplying the previous
+ * sum again by y is sufficient to update:
+ * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
+ * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
+ */
+static __always_inline int __update_entity_runnable_avg(u64 now,
+ struct sched_avg *sa,
+ int runnable,
+ int running)
{
+ u64 delta, periods;
+ u32 runnable_contrib;
+ int delta_w, decayed = 0;
+
+ delta = now - sa->last_runnable_update;
+ /*
+ * This should only happen when time goes backwards, which it
+ * unfortunately does during sched clock init when we swap over to TSC.
+ */
+ if ((s64)delta < 0) {
+ sa->last_runnable_update = now;
+ return 0;
+ }
+
+ /*
+ * Use 1024ns as the unit of measurement since it's a reasonable
+ * approximation of 1us and fast to compute.
+ */
+ delta >>= 10;
+ if (!delta)
+ return 0;
+ sa->last_runnable_update = now;
+
+ /* delta_w is the amount already accumulated against our next period */
+ delta_w = sa->runnable_avg_period % 1024;
+ if (delta + delta_w >= 1024) {
+ /* period roll-over */
+ decayed = 1;
+
+ /*
+ * Now that we know we're crossing a period boundary, figure
+ * out how much from delta we need to complete the current
+ * period and accrue it.
+ */
+ delta_w = 1024 - delta_w;
+ if (runnable)
+ sa->runnable_avg_sum += delta_w;
+ if (running)
+ sa->usage_avg_sum += delta_w;
+ sa->runnable_avg_period += delta_w;
+
+ delta -= delta_w;
+
+ /* Figure out how many additional periods this update spans */
+ periods = delta / 1024;
+ delta %= 1024;
+
+ sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
+ periods + 1);
+ sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
+ periods + 1);
+ sa->usage_avg_sum = decay_load(sa->usage_avg_sum, periods + 1);
+
+ /* Efficiently calculate \sum (1..n_period) 1024*y^i */
+ runnable_contrib = __compute_runnable_contrib(periods);
+ if (runnable)
+ sa->runnable_avg_sum += runnable_contrib;
+ if (running)
+ sa->usage_avg_sum += runnable_contrib;
+ sa->runnable_avg_period += runnable_contrib;
+ }
+
+ /* Remainder of delta accrued against u_0` */
+ if (runnable)
+ sa->runnable_avg_sum += delta;
+ if (running)
+ sa->usage_avg_sum += delta;
+ sa->runnable_avg_period += delta;
+
+ return decayed;
}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+/* Synchronize an entity's decay with its parenting cfs_rq. */
+static inline u64 __synchronize_entity_decay(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 decays = atomic64_read(&cfs_rq->decay_counter);
+
+ decays -= se->avg.decay_count;
+ if (!decays)
+ return 0;
+
+ se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
+ se->avg.decay_count += decays;
+
+ return decays;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
+ int force_update)
+{
+ struct task_group *tg = cfs_rq->tg;
+ s64 tg_contrib;
+
+ tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
+ tg_contrib -= cfs_rq->tg_load_contrib;
+
+ if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
+ atomic64_add(tg_contrib, &tg->load_avg);
+ cfs_rq->tg_load_contrib += tg_contrib;
+ }
+}
+
+/*
+ * Aggregate cfs_rq runnable averages into an equivalent task_group
+ * representation for computing load contributions.
+ */
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+ struct cfs_rq *cfs_rq)
+{
+ struct task_group *tg = cfs_rq->tg;
+ long contrib, usage_contrib;
+
+ contrib = div_u64(sa->runnable_avg_sum << 12,
+ sa->runnable_avg_period + 1);
+ contrib -= cfs_rq->tg_runnable_contrib;
+
+ usage_contrib = div_u64(sa->usage_avg_sum << 12,
+ sa->runnable_avg_period + 1);
+ usage_contrib -= cfs_rq->tg_usage_contrib;
+
+ /*
+ * contrib/usage at this point represent deltas, only update if they
+ * are substantive.
+ */
+ if ((abs(contrib) > cfs_rq->tg_runnable_contrib / 64) ||
+ (abs(usage_contrib) > cfs_rq->tg_usage_contrib / 64)) {
+ atomic_add(contrib, &tg->runnable_avg);
+ cfs_rq->tg_runnable_contrib += contrib;
+
+ atomic_add(usage_contrib, &tg->usage_avg);
+ cfs_rq->tg_usage_contrib += usage_contrib;
+ }
+}
+
+static inline void __update_group_entity_contrib(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
+ struct task_group *tg = cfs_rq->tg;
+ int runnable_avg;
+
+ u64 contrib;
+
+ contrib = cfs_rq->tg_load_contrib * tg->shares;
+ se->avg.load_avg_contrib = div64_u64(contrib,
+ atomic64_read(&tg->load_avg) + 1);
+
+ /*
+ * Unlike a task-entity, a group entity may be using >=1 cpu globally.
+ * However, in the case that it's using <1 cpu we need to form a
+ * correction term so that we contribute the same load as a task of
+ * equal weight. (Global runnable time is taken as a fraction over
+ * 2^12.)
+ */
+ runnable_avg = atomic_read(&tg->runnable_avg);
+ if (runnable_avg < (1<<12)) {
+ se->avg.load_avg_contrib *= runnable_avg;
+ se->avg.load_avg_contrib /= (1<<12);
+ }
+}
+#else
+static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
+ int force_update) {}
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+ struct cfs_rq *cfs_rq) {}
+static inline void __update_group_entity_contrib(struct sched_entity *se) {}
+#endif
+
+static inline void __update_task_entity_contrib(struct sched_entity *se)
+{
+ u32 contrib;
+
+ /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
+ contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
+ contrib /= (se->avg.runnable_avg_period + 1);
+ se->avg.load_avg_contrib = scale_load(contrib);
+ trace_sched_task_load_contrib(task_of(se), se->avg.load_avg_contrib);
+ contrib = se->avg.runnable_avg_sum * scale_load_down(1024);
+ contrib /= (se->avg.runnable_avg_period + 1);
+ se->avg.load_avg_ratio = scale_load(contrib);
+ trace_sched_task_runnable_ratio(task_of(se), se->avg.load_avg_ratio);
+ contrib = se->avg.usage_avg_sum * scale_load_down(1024);
+ contrib /= (se->avg.runnable_avg_period + 1);
+ trace_sched_task_usage_ratio(task_of(se), scale_load(contrib));
+}
+
+/* Compute the current contribution to load_avg by se, return any delta */
+static long __update_entity_load_avg_contrib(struct sched_entity *se)
+{
+ long old_contrib = se->avg.load_avg_contrib;
+
+ if (entity_is_task(se)) {
+ __update_task_entity_contrib(se);
+ } else {
+ __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
+ __update_group_entity_contrib(se);
+ }
+
+ return se->avg.load_avg_contrib - old_contrib;
+}
+
+static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
+ long load_contrib)
+{
+ if (likely(load_contrib < cfs_rq->blocked_load_avg))
+ cfs_rq->blocked_load_avg -= load_contrib;
+ else
+ cfs_rq->blocked_load_avg = 0;
+}
+
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+
+/* Update a sched_entity's runnable average */
+static inline void update_entity_load_avg(struct sched_entity *se,
+ int update_cfs_rq)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ long contrib_delta;
+ u64 now;
+
+ /*
+ * For a group entity we need to use its owned cfs_rq's cfs_rq_clock_task() in
+ * case it is the parent of a throttled hierarchy.
+ */
+ if (entity_is_task(se))
+ now = cfs_rq_clock_task(cfs_rq);
+ else
+ now = cfs_rq_clock_task(group_cfs_rq(se));
+
+ if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq,
+ cfs_rq->curr == se))
+ return;
+
+ contrib_delta = __update_entity_load_avg_contrib(se);
+
+ if (!update_cfs_rq)
+ return;
+
+ if (se->on_rq)
+ cfs_rq->runnable_load_avg += contrib_delta;
+ else
+ subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+}
+
+/*
+ * Decay the load contributed by all blocked children and account this so that
+ * their contribution may be appropriately discounted when they wake up.
+ */
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
+{
+ u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
+ u64 decays;
+
+ decays = now - cfs_rq->last_decay;
+ if (!decays && !force_update)
+ return;
+
+ if (atomic64_read(&cfs_rq->removed_load)) {
+ u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
+ subtract_blocked_load_contrib(cfs_rq, removed_load);
+ }
+
+ if (decays) {
+ cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
+ decays);
+ atomic64_add(decays, &cfs_rq->decay_counter);
+ cfs_rq->last_decay = now;
+ }
+
+ __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
+ update_cfs_shares(cfs_rq);
+}
+
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
+{
+ u32 contrib;
+ __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable,
+ runnable);
+ __update_tg_runnable_avg(&rq->avg, &rq->cfs);
+ contrib = rq->avg.runnable_avg_sum * scale_load_down(1024);
+ contrib /= (rq->avg.runnable_avg_period + 1);
+ trace_sched_rq_runnable_ratio(cpu_of(rq), scale_load(contrib));
+ trace_sched_rq_runnable_load(cpu_of(rq), rq->cfs.runnable_load_avg);
+}
+
+/* Add the load generated by se into cfs_rq's child load-average */
+static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int wakeup)
+{
+ /*
+ * We track migrations using entity decay_count <= 0; on a wake-up
+ * migration we use a negative decay count to track the remote decays
+ * accumulated while sleeping.
+ */
+ if (unlikely(se->avg.decay_count <= 0)) {
+ se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
+ if (se->avg.decay_count) {
+ /*
+ * In a wake-up migration we have to approximate the
+ * time sleeping. This is because we can't synchronize
+ * clock_task between the two cpus, and it is not
+ * guaranteed to be read-safe. Instead, we can
+ * approximate this using our carried decays, which are
+ * explicitly atomically readable.
+ */
+ se->avg.last_runnable_update -= (-se->avg.decay_count)
+ << 20;
+ update_entity_load_avg(se, 0);
+ }
+ se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+ wakeup = 0;
+ } else {
+ __synchronize_entity_decay(se);
+ }
+
+ /* migrated tasks did not contribute to our blocked load */
+ if (wakeup) {
+ subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
+ update_entity_load_avg(se, 0);
+ }
+
+ cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+ /* we force update consideration on load-balancer moves */
+ update_cfs_rq_blocked_load(cfs_rq, !wakeup);
+}
+
+/*
+ * Remove se's load from this cfs_rq child load-average; if the entity is
+ * transitioning to a blocked state we track its projected decay using
+ * blocked_load_avg.
+ */
+static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int sleep)
+{
+ update_entity_load_avg(se, 1);
+ /* we force update consideration on load-balancer moves */
+ update_cfs_rq_blocked_load(cfs_rq, !sleep);
+
+ cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
+ if (sleep) {
+ cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
+ se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+ } else {
+ se->avg.decay_count = 0;
+ }
+}
+#else
+static inline void update_entity_load_avg(struct sched_entity *se,
+ int update_cfs_rq) {}
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
+static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int wakeup) {}
+static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int sleep) {}
+static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
+ int force_update) {}
+#endif
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -1096,9 +1483,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
- update_cfs_load(cfs_rq, 0);
account_entity_enqueue(cfs_rq, se);
- update_cfs_shares(cfs_rq);
+ enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
@@ -1190,9 +1576,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
- se->on_rq = 0;
- update_cfs_load(cfs_rq, 0);
account_entity_dequeue(cfs_rq, se);
+ dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
/*
* Normalize the entity after updating the min_vruntime because the
@@ -1206,7 +1591,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
return_cfs_rq_runtime(cfs_rq);
update_min_vruntime(cfs_rq);
- update_cfs_shares(cfs_rq);
+ se->on_rq = 0;
}
/*
@@ -1261,6 +1646,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
+ update_entity_load_avg(se, 1);
}
update_stats_curr_start(cfs_rq, se);
@@ -1340,6 +1726,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
+ /* in !on_rq case, update occurred at dequeue */
+ update_entity_load_avg(prev, 1);
}
cfs_rq->curr = NULL;
}
@@ -1353,9 +1741,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
update_curr(cfs_rq);
/*
- * Update share accounting for long-running entities.
+ * Ensure that runnable average is periodically updated.
*/
- update_entity_shares_tick(cfs_rq);
+ update_entity_load_avg(curr, 1);
+ update_cfs_rq_blocked_load(cfs_rq, 1);
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -1448,6 +1837,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
return &tg->cfs_bandwidth;
}
+/* rq->clock_task normalized against any time this cfs_rq has spent throttled */
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
+{
+ if (unlikely(cfs_rq->throttle_count))
+ return cfs_rq->throttled_clock_task;
+
+ return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
+}
+
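Together with the tg_throttle_down()/tg_unthrottle_up() changes further down, the helper above gives load tracking a clock that simply stops while a group is throttled: the start of each throttle window is recorded, and the accumulated throttled span is subtracted afterwards. A standalone model of that behaviour, with invented names:

#include <stdint.h>
#include <stdio.h>

/* Models throttled_clock_task / throttled_clock_task_time. */
struct throttle_clock {
	int throttled;		/* non-zero while the group is throttled */
	uint64_t frozen_at;	/* clock value when throttling started   */
	uint64_t stopped_time;	/* total time spent throttled so far     */
};

static void model_throttle(struct throttle_clock *tc, uint64_t now)
{
	if (!tc->throttled)
		tc->frozen_at = now;
	tc->throttled++;
}

static void model_unthrottle(struct throttle_clock *tc, uint64_t now)
{
	if (!--tc->throttled)
		tc->stopped_time += now - tc->frozen_at;
}

/* Mirrors cfs_rq_clock_task(): the clock does not advance while throttled. */
static uint64_t model_clock(const struct throttle_clock *tc, uint64_t now)
{
	if (tc->throttled)
		return tc->frozen_at;
	return now - tc->stopped_time;
}

int main(void)
{
	struct throttle_clock tc = { 0 };

	model_throttle(&tc, 100);
	model_unthrottle(&tc, 150);	/* 50 time units disappear */
	printf("clock at 200: %llu\n",
	       (unsigned long long)model_clock(&tc, 200));	/* 150 */
	return 0;
}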
/* returns 0 on failure to allocate runtime */
static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
@@ -1592,14 +1990,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
cfs_rq->throttle_count--;
#ifdef CONFIG_SMP
if (!cfs_rq->throttle_count) {
- u64 delta = rq->clock_task - cfs_rq->load_stamp;
-
- /* leaving throttled state, advance shares averaging windows */
- cfs_rq->load_stamp += delta;
- cfs_rq->load_last += delta;
-
- /* update entity weight now that we are on_rq again */
- update_cfs_shares(cfs_rq);
+ /* adjust cfs_rq_clock_task() */
+ cfs_rq->throttled_clock_task_time += rq->clock_task -
+ cfs_rq->throttled_clock_task;
}
#endif
@@ -1611,9 +2004,9 @@ static int tg_throttle_down(struct task_group *tg, void *data)
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
- /* group is entering throttled state, record last load */
+ /* group is entering throttled state, stop time */
if (!cfs_rq->throttle_count)
- update_cfs_load(cfs_rq, 0);
+ cfs_rq->throttled_clock_task = rq->clock_task;
cfs_rq->throttle_count++;
return 0;
@@ -1628,7 +2021,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
- /* account load preceding throttle */
+ /* freeze hierarchy runnable averages while throttled */
rcu_read_lock();
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
@@ -1652,7 +2045,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
rq->nr_running -= task_delta;
cfs_rq->throttled = 1;
- cfs_rq->throttled_timestamp = rq->clock;
+ cfs_rq->throttled_clock = rq->clock;
raw_spin_lock(&cfs_b->lock);
list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
raw_spin_unlock(&cfs_b->lock);
@@ -1670,10 +2063,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->throttled = 0;
raw_spin_lock(&cfs_b->lock);
- cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
+ cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
list_del_rcu(&cfs_rq->throttled_list);
raw_spin_unlock(&cfs_b->lock);
- cfs_rq->throttled_timestamp = 0;
update_rq_clock(rq);
/* update hierarchical throttle state */
@@ -2073,8 +2465,13 @@ void unthrottle_offline_cfs_rqs(struct rq *rq)
}
#else /* CONFIG_CFS_BANDWIDTH */
-static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {}
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
+{
+ return rq_of(cfs_rq)->clock_task;
+}
+
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+ unsigned long delta_exec) {}
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -2207,12 +2604,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_cfs_load(cfs_rq, 0);
- update_cfs_shares(cfs_rq);
+ update_entity_load_avg(se, 1);
+ update_cfs_rq_blocked_load(cfs_rq, 0);
}
- if (!se)
+ if (!se) {
+ update_rq_runnable_avg(rq, rq->nr_running);
inc_nr_running(rq);
+ }
hrtick_update(rq);
}
@@ -2266,12 +2665,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_cfs_load(cfs_rq, 0);
- update_cfs_shares(cfs_rq);
+ update_entity_load_avg(se, 1);
+ update_cfs_rq_blocked_load(cfs_rq, 0);
}
- if (!se)
+ if (!se) {
dec_nr_running(rq);
+ update_rq_runnable_avg(rq, 1);
+ }
hrtick_update(rq);
}
@@ -2681,6 +3082,73 @@ done:
return target;
}
+#ifdef CONFIG_SCHED_HMP
+/* Heterogeneous multiprocessor (HMP) optimizations
+ * We need to know which cpus are fast and which are slow. Ideally, this
+ * information would be provided by the platform in some way. For now it is
+ * set in the kernel config. */
+static struct cpumask hmp_fast_cpu_mask;
+static struct cpumask hmp_slow_cpu_mask;
+
+/* Set up the fast and slow cpumasks.
+ * This should eventually be derived from the device tree. */
+static int __init hmp_cpu_mask_setup(void)
+{
+ char buf[64];
+
+ cpumask_clear(&hmp_fast_cpu_mask);
+ cpumask_clear(&hmp_slow_cpu_mask);
+
+ if (cpulist_parse(CONFIG_HMP_FAST_CPU_MASK, &hmp_fast_cpu_mask))
+ WARN(1, "Failed to parse HMP fast cpu mask!\n");
+ if (cpulist_parse(CONFIG_HMP_SLOW_CPU_MASK, &hmp_slow_cpu_mask))
+ WARN(1, "Failed to parse HMP slow cpu mask!\n");
+
+ printk(KERN_DEBUG "Initializing HMP scheduler:\n");
+ cpulist_scnprintf(buf, 64, &hmp_fast_cpu_mask);
+ printk(KERN_DEBUG " fast cpus: %s\n", buf);
+ cpulist_scnprintf(buf, 64, &hmp_slow_cpu_mask);
+ printk(KERN_DEBUG " slow cpus: %s\n", buf);
+
+ return 1;
+}
+early_initcall(hmp_cpu_mask_setup);
+
+/* Migration thresholds should be in the range [0..1023].
+ * hmp_up_threshold: min. load required for migrating tasks to a fast cpu
+ * hmp_down_threshold: max. load allowed for tasks migrating to a slow cpu
+ * hmp_up_prio: only tasks with a prio value below this (i.e. of higher
+ * priority) are eligible for migration to faster cpus */
+unsigned int hmp_up_threshold = 512;
+unsigned int hmp_down_threshold = 256;
+unsigned int hmp_up_prio = 125;
+static unsigned int hmp_up_migration(int cpu, struct sched_entity *se);
+static unsigned int hmp_down_migration(int cpu, struct sched_entity *se);
+
+static unsigned int hmp_cpu_is_fast(int cpu)
+{
+ return cpumask_test_cpu(cpu, &hmp_fast_cpu_mask);
+}
+
+static unsigned int hmp_cpu_is_slow(int cpu)
+{
+ return cpumask_test_cpu(cpu, &hmp_slow_cpu_mask);
+}
+
+/* Select target cpu for HMP migration to fast cpu
+ * returns target >= nr_cpu_ids if no fast cpus in affinity mask */
+static inline unsigned int hmp_select_fast_cpu(struct task_struct *tsk)
+{
+ return cpumask_any_and(&hmp_fast_cpu_mask, tsk_cpus_allowed(tsk));
+}
+
+/* Select target cpu for HMP migration to slow cpu
+ * returns target >= nr_cpu_ids if no slow cpus in affinity mask */
+static inline unsigned int hmp_select_slow_cpu(struct task_struct *tsk)
+{
+ return cpumask_any_and(&hmp_slow_cpu_mask, tsk_cpus_allowed(tsk));
+}
+#endif /* CONFIG_SCHED_HMP */
+
/*
* sched_balance_self: balance the current task (running on cpu) in domains
* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
@@ -2807,8 +3275,52 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
unlock:
rcu_read_unlock();
+#ifdef CONFIG_SCHED_HMP
+ if (hmp_up_migration(new_cpu, &p->se)) {
+ cpu = hmp_select_fast_cpu(p);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+ }
+ if (hmp_down_migration(new_cpu, &p->se)) {
+ cpu = hmp_select_slow_cpu(p);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+ }
+#endif
+
return new_cpu;
}
+
+/*
+ * Load-tracking only depends on SMP; the FAIR_GROUP_SCHED dependency below
+ * may be removed once load-tracking is useful for purposes beyond shares
+ * distribution (e.g. load-balance).
+ */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
+ * cfs_rq_of(p) references at time of call are still valid and identify the
+ * previous cpu. However, the caller only guarantees p->pi_lock is held; no
+ * other assumptions, including rq->lock state, should be made.
+ */
+static void
+migrate_task_rq_fair(struct task_struct *p, int next_cpu)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ /*
+ * Load tracking: accumulate removed load so that it can be processed
+ * when we next update owning cfs_rq under rq->lock. Tasks contribute
+ * to blocked load iff they have a non-zero decay-count.
+ */
+ if (se->avg.decay_count) {
+ se->avg.decay_count = -__synchronize_entity_decay(se);
+ atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
+ }
+}
+#endif
+
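migrate_task_rq_fair() above is the producer half of the removed_load accumulator that update_cfs_rq_blocked_load() drains earlier in this patch: the departing task's contribution is added atomically without the runqueue lock, and subtracted from blocked_load_avg later under rq->lock. A minimal userspace model of that hand-off, using C11 atomics and invented names:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t removed_load;	/* models cfs_rq->removed_load */
static uint64_t blocked_load_avg;	/* only touched under the owner's lock */

/* Migration path: may run without the owning runqueue lock. */
static void model_migrate_out(uint64_t load_avg_contrib)
{
	atomic_fetch_add(&removed_load, load_avg_contrib);
}

/* Owner path: runs with the runqueue lock held and drains the accumulator. */
static void model_update_blocked_load(void)
{
	uint64_t removed = atomic_exchange(&removed_load, 0);

	if (removed > blocked_load_avg)
		removed = blocked_load_avg;	/* avoid underflow */
	blocked_load_avg -= removed;
}

int main(void)
{
	blocked_load_avg = 2048;
	model_migrate_out(512);
	model_migrate_out(256);
	model_update_blocked_load();
	printf("blocked_load_avg = %llu\n",
	       (unsigned long long)blocked_load_avg);	/* 1280 */
	return 0;
}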
#endif /* CONFIG_SMP */
static unsigned long
@@ -3300,51 +3812,65 @@ next:
/*
* update tg->load_weight by folding this cpu's load_avg
*/
-static int update_shares_cpu(struct task_group *tg, int cpu)
+static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
{
- struct cfs_rq *cfs_rq;
- unsigned long flags;
- struct rq *rq;
-
- if (!tg->se[cpu])
- return 0;
-
- rq = cpu_rq(cpu);
- cfs_rq = tg->cfs_rq[cpu];
-
- raw_spin_lock_irqsave(&rq->lock, flags);
-
- update_rq_clock(rq);
- update_cfs_load(cfs_rq, 1);
+ struct sched_entity *se = tg->se[cpu];
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
- /*
- * We need to update shares after updating tg->load_weight in
- * order to adjust the weight of groups with long running tasks.
- */
- update_cfs_shares(cfs_rq);
+ /* throttled entities do not contribute to load */
+ if (throttled_hierarchy(cfs_rq))
+ return;
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ update_cfs_rq_blocked_load(cfs_rq, 1);
+ if (se)
+ update_entity_load_avg(se, 1);
+ else
+ update_rq_runnable_avg(rq_of(cfs_rq), 1);
- return 0;
+ if (se) {
+ /*
+ * We can pivot on the runnable average decaying to zero for
+ * list removal since the parent average will always be >=
+ * child.
+ */
+ if (se->avg.runnable_avg_sum)
+ update_cfs_shares(cfs_rq);
+ else
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
}
-static void update_shares(int cpu)
+static void update_blocked_averages(int cpu)
{
- struct cfs_rq *cfs_rq;
struct rq *rq = cpu_rq(cpu);
+ struct cfs_rq *cfs_rq;
+
+ unsigned long flags;
+ int num_updates = 0;
rcu_read_lock();
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ update_rq_clock(rq);
/*
* Iterates the task_group tree in a bottom up fashion, see
* list_add_leaf_cfs_rq() for details.
*/
for_each_leaf_cfs_rq(rq, cfs_rq) {
- /* throttled entities do not contribute to load */
- if (throttled_hierarchy(cfs_rq))
- continue;
+ __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
- update_shares_cpu(cfs_rq->tg, cpu);
+ /*
+ * Periodically release the lock so that a cfs_rq with many
+ * children cannot hold it for an arbitrary period of time.
+ */
+ if (num_updates++ % 20 == 0) {
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ cpu_relax();
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ update_rq_clock(rq);
+ }
}
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
rcu_read_unlock();
}
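The lock-break in update_blocked_averages() above (drop rq->lock every 20 cfs_rqs, give waiters a chance, then re-take it and refresh the clock) is a general pattern for bounding lock hold time over a long list. A userspace analogue, sketched with pthreads and an invented helper name:

#include <pthread.h>
#include <sched.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

/* Walk a long list without monopolizing the lock: drop it every 20 items. */
static void walk_with_lock_breaks(int nr_items)
{
	int i;

	pthread_mutex_lock(&list_lock);
	for (i = 0; i < nr_items; i++) {
		/* ... process item i under the lock ... */

		if (i && i % 20 == 0) {
			pthread_mutex_unlock(&list_lock);
			sched_yield();		/* let waiters in */
			pthread_mutex_lock(&list_lock);
		}
	}
	pthread_mutex_unlock(&list_lock);
}

int main(void)
{
	walk_with_lock_breaks(100);
	return 0;
}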
@@ -3389,7 +3915,7 @@ static unsigned long task_h_load(struct task_struct *p)
return load;
}
#else
-static inline void update_shares(int cpu)
+static inline void update_blocked_averages(int cpu)
{
}
@@ -4409,12 +4935,14 @@ void idle_balance(int this_cpu, struct rq *this_rq)
if (this_rq->avg_idle < sysctl_sched_migration_cost)
return;
+ update_rq_runnable_avg(this_rq, 1);
+
/*
* Drop the rq->lock, but keep IRQ/preempt disabled.
*/
raw_spin_unlock(&this_rq->lock);
- update_shares(this_cpu);
+ update_blocked_averages(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
unsigned long interval;
@@ -4674,7 +5202,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
int update_next_balance = 0;
int need_serialize;
- update_shares(cpu);
+ update_blocked_averages(cpu);
rcu_read_lock();
for_each_domain(cpu, sd) {
@@ -4842,6 +5370,223 @@ need_kick:
static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
#endif
+#ifdef CONFIG_SCHED_HMP
+/* Check if task should migrate to a faster core */
+static unsigned int hmp_up_migration(int cpu, struct sched_entity *se)
+{
+ struct task_struct *p = task_of(se);
+ if (p->prio < hmp_up_prio && p->prio > 100
+ && hmp_cpu_is_slow(cpu)
+ && se->avg.load_avg_ratio > hmp_up_threshold) {
+ return 1;
+ }
+ return 0;
+}
+
+/* Check if task should migrate to a slower core */
+static unsigned int hmp_down_migration(int cpu, struct sched_entity *se)
+{
+ struct task_struct *p = task_of(se);
+ if (p->prio >= hmp_up_prio || (hmp_cpu_is_fast(cpu)
+ && se->avg.load_avg_ratio < hmp_down_threshold)) {
+ return 1;
+ }
+ return 0;
+}
+
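hmp_up_migration() and hmp_down_migration() above reduce to two threshold comparisons on the tracked load ratio plus the prio cut-off. The sketch below models that decision in plain C with the default values from this patch; it deliberately omits the cpumask checks and the prio > 100 guard, and all names are invented:

#include <stdbool.h>
#include <stdio.h>

#define HMP_UP_THRESHOLD	512	/* min. load to move to a fast cpu */
#define HMP_DOWN_THRESHOLD	256	/* max. load to move to a slow cpu */
#define HMP_UP_PRIO		125	/* prio values >= this never go fast */

struct model_task {
	int prio;			/* lower value == higher priority */
	unsigned int load_ratio;	/* tracked load in [0..1023] */
	bool on_slow_cpu;
};

static bool model_up_migration(const struct model_task *t)
{
	return t->prio < HMP_UP_PRIO && t->on_slow_cpu &&
	       t->load_ratio > HMP_UP_THRESHOLD;
}

static bool model_down_migration(const struct model_task *t)
{
	return t->prio >= HMP_UP_PRIO ||
	       (!t->on_slow_cpu && t->load_ratio < HMP_DOWN_THRESHOLD);
}

int main(void)
{
	struct model_task busy  = { 120, 700, true };	/* slow cpu, heavy */
	struct model_task light = { 120, 100, false };	/* fast cpu, light */

	printf("busy  -> fast cpu? %d\n", model_up_migration(&busy));	/* 1 */
	printf("light -> slow cpu? %d\n", model_down_migration(&light));/* 1 */
	return 0;
}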
+/*
+ * hmp_can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
+ * Ideally this function should be merged with can_migrate_task() to avoid
+ * redundant code.
+ */
+static int hmp_can_migrate_task(struct task_struct *p, struct lb_env *env)
+{
+ int tsk_cache_hot = 0;
+ /*
+ * We do not migrate tasks that are:
+	 * 1) not allowed onto the destination CPU due to cpus_allowed, or
+	 * 2) currently running (obviously)
+ */
+ if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+ schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+ return 0;
+ }
+ env->flags &= ~LBF_ALL_PINNED;
+
+ if (task_running(env->src_rq, p)) {
+ schedstat_inc(p, se.statistics.nr_failed_migrations_running);
+ return 0;
+ }
+
+ /*
+ * Aggressive migration if:
+ * 1) task is cache cold, or
+ * 2) too many balance attempts have failed.
+ */
+
+ tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
+ if (!tsk_cache_hot ||
+ env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
+#ifdef CONFIG_SCHEDSTATS
+ if (tsk_cache_hot) {
+ schedstat_inc(env->sd, lb_hot_gained[env->idle]);
+ schedstat_inc(p, se.statistics.nr_forced_migrations);
+ }
+#endif
+ return 1;
+ }
+
+ return 1;
+}
+
+/*
+ * move_specific_task tries to move a specific task.
+ * Returns 1 if successful and 0 otherwise.
+ * Called with both runqueues locked.
+ */
+static int move_specific_task(struct lb_env *env, struct task_struct *pm)
+{
+ struct task_struct *p, *n;
+
+ list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+ if (throttled_lb_pair(task_group(p), env->src_rq->cpu,
+ env->dst_cpu))
+ continue;
+
+ if (!hmp_can_migrate_task(p, env))
+ continue;
+ /* Check if we found the right task */
+ if (p != pm)
+ continue;
+
+ move_task(p, env);
+ /*
+ * Right now, this is only the third place move_task()
+ * is called, so we can safely collect move_task()
+ * stats here rather than inside move_task().
+ */
+ schedstat_inc(env->sd, lb_gained[env->idle]);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * hmp_active_task_migration_cpu_stop is run by the cpu stopper and is used to
+ * migrate a specific task from one runqueue to another.
+ * hmp_force_up_migration uses this to push a currently running task
+ * off a runqueue.
+ * Based on active_load_balance_cpu_stop and can potentially be merged.
+ */
+static int hmp_active_task_migration_cpu_stop(void *data)
+{
+ struct rq *busiest_rq = data;
+ struct task_struct *p = busiest_rq->migrate_task;
+ int busiest_cpu = cpu_of(busiest_rq);
+ int target_cpu = busiest_rq->push_cpu;
+ struct rq *target_rq = cpu_rq(target_cpu);
+ struct sched_domain *sd;
+
+ raw_spin_lock_irq(&busiest_rq->lock);
+ /* make sure the requested cpu hasn't gone down in the meantime */
+ if (unlikely(busiest_cpu != smp_processor_id() ||
+ !busiest_rq->active_balance)) {
+ goto out_unlock;
+ }
+ /* Is there any task to move? */
+ if (busiest_rq->nr_running <= 1)
+ goto out_unlock;
+ /* Task has migrated meanwhile, abort forced migration */
+ if (task_rq(p) != busiest_rq)
+ goto out_unlock;
+ /*
+ * This condition is "impossible", if it occurs
+ * we need to fix it. Originally reported by
+ * Bjorn Helgaas on a 128-cpu setup.
+ */
+ BUG_ON(busiest_rq == target_rq);
+
+ /* move a task from busiest_rq to target_rq */
+ double_lock_balance(busiest_rq, target_rq);
+
+ /* Search for an sd spanning us and the target CPU. */
+ rcu_read_lock();
+ for_each_domain(target_cpu, sd) {
+ if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
+ break;
+ }
+
+ if (likely(sd)) {
+ struct lb_env env = {
+ .sd = sd,
+ .dst_cpu = target_cpu,
+ .dst_rq = target_rq,
+ .src_cpu = busiest_rq->cpu,
+ .src_rq = busiest_rq,
+ .idle = CPU_IDLE,
+ };
+
+ schedstat_inc(sd, alb_count);
+
+ if (move_specific_task(&env, p))
+ schedstat_inc(sd, alb_pushed);
+ else
+ schedstat_inc(sd, alb_failed);
+ }
+ rcu_read_unlock();
+ double_unlock_balance(busiest_rq, target_rq);
+out_unlock:
+ busiest_rq->active_balance = 0;
+ raw_spin_unlock_irq(&busiest_rq->lock);
+ return 0;
+}
+
+static DEFINE_SPINLOCK(hmp_force_migration);
+
+/* hmp_force_up_migration checks runqueues for tasks that need to
+ * be actively migrated to a faster cpu. */
+static void hmp_force_up_migration(int this_cpu)
+{
+ int i;
+ struct sched_entity *curr;
+ struct rq *target;
+ unsigned long flags;
+ unsigned int force;
+ struct task_struct *p;
+
+ if (!spin_trylock(&hmp_force_migration))
+ return;
+ for_each_cpu(i, &hmp_slow_cpu_mask) {
+ force = 0;
+ target = cpu_rq(i);
+ raw_spin_lock_irqsave(&target->lock, flags);
+ curr = target->cfs.curr;
+ if (!curr || !entity_is_task(curr)) {
+ raw_spin_unlock_irqrestore(&target->lock, flags);
+ continue;
+ }
+ p = task_of(curr);
+ if (hmp_up_migration(i, curr)) {
+ if (!target->active_balance) {
+ target->active_balance = 1;
+ target->push_cpu = hmp_select_fast_cpu(p);
+ target->migrate_task = p;
+ force = 1;
+ trace_sched_hmp_migrate(p, 1);
+ }
+ }
+ raw_spin_unlock_irqrestore(&target->lock, flags);
+ if (force)
+ stop_one_cpu_nowait(cpu_of(target),
+ hmp_active_task_migration_cpu_stop,
+ target, &target->active_balance_work);
+ }
+ spin_unlock(&hmp_force_migration);
+}
+#else
+static void hmp_force_up_migration(int this_cpu) { }
+#endif /* CONFIG_SCHED_HMP */
+
/*
* run_rebalance_domains is triggered when needed from the scheduler tick.
* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
@@ -4853,6 +5598,8 @@ static void run_rebalance_domains(struct softirq_action *h)
enum cpu_idle_type idle = this_rq->idle_balance ?
CPU_IDLE : CPU_NOT_IDLE;
+ hmp_force_up_migration(this_cpu);
+
rebalance_domains(this_cpu, idle);
/*
@@ -4907,6 +5654,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
cfs_rq = cfs_rq_of(se);
entity_tick(cfs_rq, se, queued);
}
+
+ update_rq_runnable_avg(rq, 1);
}
/*
@@ -4999,6 +5748,21 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
place_entity(cfs_rq, se, 0);
se->vruntime -= cfs_rq->min_vruntime;
}
+
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+ /*
+ * Remove our load from contribution when we leave sched_fair
+ * and ensure we don't carry in an old decay_count if we
+ * switch back.
+ */
+ if (p->se.avg.decay_count) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
+ __synchronize_entity_decay(&p->se);
+ subtract_blocked_load_contrib(cfs_rq,
+ p->se.avg.load_avg_contrib);
+ p->se.avg.decay_count = 0;
+ }
+#endif
}
/*
@@ -5045,11 +5809,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
#ifndef CONFIG_64BIT
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+ atomic64_set(&cfs_rq->decay_counter, 1);
+ atomic64_set(&cfs_rq->removed_load, 0);
+#endif
}
#ifdef CONFIG_FAIR_GROUP_SCHED
static void task_move_group_fair(struct task_struct *p, int on_rq)
{
+ struct cfs_rq *cfs_rq;
/*
* If the task was not on the rq at the time of this cgroup movement
* it must have been asleep, sleeping tasks keep their ->vruntime
@@ -5081,8 +5850,19 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
if (!on_rq)
p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
set_task_rq(p, task_cpu(p));
- if (!on_rq)
- p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
+ if (!on_rq) {
+ cfs_rq = cfs_rq_of(&p->se);
+ p->se.vruntime += cfs_rq->min_vruntime;
+#ifdef CONFIG_SMP
+ /*
+	 * set_task_rq() will have removed our previous contribution,
+ * but we must synchronize explicitly against further decay
+ * here.
+ */
+ p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+ cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
+#endif
+ }
}
void free_fair_sched_group(struct task_group *tg)
@@ -5167,10 +5947,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
cfs_rq->tg = tg;
cfs_rq->rq = rq;
-#ifdef CONFIG_SMP
- /* allow initial update_cfs_load() to truncate */
- cfs_rq->load_stamp = 1;
-#endif
init_cfs_rq_runtime(cfs_rq);
tg->cfs_rq[cpu] = cfs_rq;
@@ -5217,8 +5993,11 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
se = tg->se[i];
/* Propagate contribution to hierarchy */
raw_spin_lock_irqsave(&rq->lock, flags);
- for_each_sched_entity(se)
+ for_each_sched_entity(se) {
update_cfs_shares(group_cfs_rq(se));
+ /* update contribution to parent */
+ update_entity_load_avg(se, 1);
+ }
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -5272,7 +6051,9 @@ const struct sched_class fair_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_fair,
-
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ .migrate_task_rq = migrate_task_rq_fair,
+#endif
.rq_online = rq_online_fair,
.rq_offline = rq_offline_fair,
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index de00a486c5c6..d98ae909e321 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -42,7 +42,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
/*
* Use arch dependent cpu power functions
*/
-SCHED_FEAT(ARCH_POWER, false)
+SCHED_FEAT(ARCH_POWER, true)
SCHED_FEAT(HRTICK, false)
SCHED_FEAT(DOUBLE_TICK, false)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 55844f24435a..0dc9bd108f57 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -112,6 +112,8 @@ struct task_group {
unsigned long shares;
atomic_t load_weight;
+ atomic64_t load_avg;
+ atomic_t runnable_avg, usage_avg;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
@@ -222,22 +224,29 @@ struct cfs_rq {
unsigned int nr_spread_over;
#endif
+#ifdef CONFIG_SMP
+/*
+ * Load-tracking only depends on SMP; the FAIR_GROUP_SCHED dependency below
+ * may be removed once load-tracking is useful for purposes beyond shares
+ * distribution (e.g. load-balance).
+ */
#ifdef CONFIG_FAIR_GROUP_SCHED
- struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
-
/*
- * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
- * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
- * (like users, containers etc.)
- *
- * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
- * list is used during load balance.
+ * CFS Load tracking
+ * Under CFS, load is tracked on a per-entity basis and aggregated up.
+ * This allows for the description of both thread and group usage (in
+ * the FAIR_GROUP_SCHED case).
*/
- int on_list;
- struct list_head leaf_cfs_rq_list;
- struct task_group *tg; /* group that "owns" this runqueue */
+ u64 runnable_load_avg, blocked_load_avg;
+ atomic64_t decay_counter, removed_load;
+ u64 last_decay;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+/* These always depend on CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ u32 tg_runnable_contrib, tg_usage_contrib;
+ u64 tg_load_contrib;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
-#ifdef CONFIG_SMP
/*
* h_load = weight * f(tg)
*
@@ -245,26 +254,30 @@ struct cfs_rq {
* this group.
*/
unsigned long h_load;
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
/*
- * Maintaining per-cpu shares distribution for group scheduling
+ * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
+ * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
+ * (like users, containers etc.)
*
- * load_stamp is the last time we updated the load average
- * load_last is the last time we updated the load average and saw load
- * load_unacc_exec_time is currently unaccounted execution time
+ * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
+ * list is used during load balance.
*/
- u64 load_avg;
- u64 load_period;
- u64 load_stamp, load_last, load_unacc_exec_time;
+ int on_list;
+ struct list_head leaf_cfs_rq_list;
+ struct task_group *tg; /* group that "owns" this runqueue */
- unsigned long load_contribution;
-#endif /* CONFIG_SMP */
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
u64 runtime_expires;
s64 runtime_remaining;
- u64 throttled_timestamp;
+ u64 throttled_clock, throttled_clock_task;
+ u64 throttled_clock_task_time;
int throttled, throttle_count;
struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
@@ -408,6 +421,7 @@ struct rq {
int active_balance;
int push_cpu;
struct cpu_stop_work active_balance_work;
+ struct task_struct *migrate_task;
/* cpu of this runqueue: */
int cpu;
int online;
@@ -463,6 +477,8 @@ struct rq {
#ifdef CONFIG_SMP
struct llist_head wake_list;
#endif
+
+ struct sched_avg avg;
};
static inline int cpu_of(struct rq *rq)
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 98f60c5caa1b..1c1458f28e2a 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -1,11 +1,17 @@
/*
* Common SMP CPU bringup/teardown functions
*/
+#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/smp.h>
#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/export.h>
#include <linux/percpu.h>
+#include <linux/kthread.h>
+#include <linux/smpboot.h>
#include "smpboot.h"
@@ -65,3 +71,226 @@ void __init idle_threads_init(void)
}
}
#endif
+
+static LIST_HEAD(hotplug_threads);
+static DEFINE_MUTEX(smpboot_threads_lock);
+
+struct smpboot_thread_data {
+ unsigned int cpu;
+ unsigned int status;
+ struct smp_hotplug_thread *ht;
+};
+
+enum {
+ HP_THREAD_NONE = 0,
+ HP_THREAD_ACTIVE,
+ HP_THREAD_PARKED,
+};
+
+/**
+ * smpboot_thread_fn - percpu hotplug thread loop function
+ * @data: thread data pointer
+ *
+ * Checks for thread stop and park conditions. Calls the necessary
+ * setup, cleanup, park and unpark functions for the registered
+ * thread.
+ *
+ * Returns 0 once the thread has been told to stop; otherwise it never returns.
+ */
+static int smpboot_thread_fn(void *data)
+{
+ struct smpboot_thread_data *td = data;
+ struct smp_hotplug_thread *ht = td->ht;
+
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ preempt_disable();
+ if (kthread_should_stop()) {
+ set_current_state(TASK_RUNNING);
+ preempt_enable();
+ if (ht->cleanup)
+ ht->cleanup(td->cpu, cpu_online(td->cpu));
+ kfree(td);
+ return 0;
+ }
+
+ if (kthread_should_park()) {
+ __set_current_state(TASK_RUNNING);
+ preempt_enable();
+ if (ht->park && td->status == HP_THREAD_ACTIVE) {
+ BUG_ON(td->cpu != smp_processor_id());
+ ht->park(td->cpu);
+ td->status = HP_THREAD_PARKED;
+ }
+ kthread_parkme();
+ /* We might have been woken for stop */
+ continue;
+ }
+
+ BUG_ON(td->cpu != smp_processor_id());
+
+ /* Check for state change setup */
+ switch (td->status) {
+ case HP_THREAD_NONE:
+ preempt_enable();
+ if (ht->setup)
+ ht->setup(td->cpu);
+ td->status = HP_THREAD_ACTIVE;
+ preempt_disable();
+ break;
+ case HP_THREAD_PARKED:
+ preempt_enable();
+ if (ht->unpark)
+ ht->unpark(td->cpu);
+ td->status = HP_THREAD_ACTIVE;
+ preempt_disable();
+ break;
+ }
+
+ if (!ht->thread_should_run(td->cpu)) {
+ schedule_preempt_disabled();
+ } else {
+ set_current_state(TASK_RUNNING);
+ preempt_enable();
+ ht->thread_fn(td->cpu);
+ preempt_disable();
+ }
+ }
+}
+
+static int
+__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
+{
+ struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+ struct smpboot_thread_data *td;
+
+ if (tsk)
+ return 0;
+
+ td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
+ if (!td)
+ return -ENOMEM;
+ td->cpu = cpu;
+ td->ht = ht;
+
+ tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu,
+ ht->thread_comm);
+ if (IS_ERR(tsk)) {
+ kfree(td);
+ return PTR_ERR(tsk);
+ }
+
+ get_task_struct(tsk);
+ *per_cpu_ptr(ht->store, cpu) = tsk;
+ return 0;
+}
+
+int smpboot_create_threads(unsigned int cpu)
+{
+ struct smp_hotplug_thread *cur;
+ int ret = 0;
+
+ mutex_lock(&smpboot_threads_lock);
+ list_for_each_entry(cur, &hotplug_threads, list) {
+ ret = __smpboot_create_thread(cur, cpu);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&smpboot_threads_lock);
+ return ret;
+}
+
+static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
+{
+ struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+
+ kthread_unpark(tsk);
+}
+
+void smpboot_unpark_threads(unsigned int cpu)
+{
+ struct smp_hotplug_thread *cur;
+
+ mutex_lock(&smpboot_threads_lock);
+ list_for_each_entry(cur, &hotplug_threads, list)
+ smpboot_unpark_thread(cur, cpu);
+ mutex_unlock(&smpboot_threads_lock);
+}
+
+static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
+{
+ struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+
+ if (tsk)
+ kthread_park(tsk);
+}
+
+void smpboot_park_threads(unsigned int cpu)
+{
+ struct smp_hotplug_thread *cur;
+
+ mutex_lock(&smpboot_threads_lock);
+ list_for_each_entry_reverse(cur, &hotplug_threads, list)
+ smpboot_park_thread(cur, cpu);
+ mutex_unlock(&smpboot_threads_lock);
+}
+
+static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
+{
+ unsigned int cpu;
+
+	/* We also need to destroy the parked threads of offline cpus */
+ for_each_possible_cpu(cpu) {
+ struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+
+ if (tsk) {
+ kthread_stop(tsk);
+ put_task_struct(tsk);
+ *per_cpu_ptr(ht->store, cpu) = NULL;
+ }
+ }
+}
+
+/**
+ * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug
+ * @plug_thread: Hotplug thread descriptor
+ *
+ * Creates and starts the threads on all online cpus.
+ */
+int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
+{
+ unsigned int cpu;
+ int ret = 0;
+
+ mutex_lock(&smpboot_threads_lock);
+ for_each_online_cpu(cpu) {
+ ret = __smpboot_create_thread(plug_thread, cpu);
+ if (ret) {
+ smpboot_destroy_threads(plug_thread);
+ goto out;
+ }
+ smpboot_unpark_thread(plug_thread, cpu);
+ }
+ list_add(&plug_thread->list, &hotplug_threads);
+out:
+ mutex_unlock(&smpboot_threads_lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
+
+/**
+ * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
+ * @plug_thread: Hotplug thread descriptor
+ *
+ * Stops all threads on all possible cpus.
+ */
+void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
+{
+ get_online_cpus();
+ mutex_lock(&smpboot_threads_lock);
+ list_del(&plug_thread->list);
+ smpboot_destroy_threads(plug_thread);
+ mutex_unlock(&smpboot_threads_lock);
+ put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
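The ksoftirqd and watchdog conversions below are the two in-tree users of this interface. As a rough template, a hypothetical client supplies a per-cpu task pointer plus the callbacks and registers once at init time; the demo_* names here are invented:

#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/smpboot.h>

static DEFINE_PER_CPU(struct task_struct *, demo_thread);
static DEFINE_PER_CPU(unsigned long, demo_pending);

static int demo_should_run(unsigned int cpu)
{
	return __this_cpu_read(demo_pending) != 0;
}

/* Called with preemption enabled whenever demo_should_run() says so. */
static void demo_fn(unsigned int cpu)
{
	__this_cpu_write(demo_pending, 0);
	/* ... do the actual per-cpu work here ... */
}

static struct smp_hotplug_thread demo_threads = {
	.store			= &demo_thread,
	.thread_should_run	= demo_should_run,
	.thread_fn		= demo_fn,
	.thread_comm		= "demo/%u",
};

static int __init demo_init(void)
{
	return smpboot_register_percpu_thread(&demo_threads);
}
early_initcall(demo_init);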
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
index 80c0acfb8472..9e26228edf66 100644
--- a/kernel/smpboot.h
+++ b/kernel/smpboot.h
@@ -15,4 +15,8 @@ static inline void idle_thread_set_boot_cpu(void) { }
static inline void idle_threads_init(void) { }
#endif
+int smpboot_create_threads(unsigned int cpu);
+void smpboot_park_threads(unsigned int cpu);
+void smpboot_unpark_threads(unsigned int cpu);
+
#endif
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 671f9594e368..82ca065019eb 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -23,6 +23,7 @@
#include <linux/rcupdate.h>
#include <linux/ftrace.h>
#include <linux/smp.h>
+#include <linux/smpboot.h>
#include <linux/tick.h>
#define CREATE_TRACE_POINTS
@@ -733,49 +734,21 @@ void __init softirq_init(void)
open_softirq(HI_SOFTIRQ, tasklet_hi_action);
}
-static int run_ksoftirqd(void * __bind_cpu)
+static int ksoftirqd_should_run(unsigned int cpu)
{
- set_current_state(TASK_INTERRUPTIBLE);
-
- while (!kthread_should_stop()) {
- preempt_disable();
- if (!local_softirq_pending()) {
- schedule_preempt_disabled();
- }
-
- __set_current_state(TASK_RUNNING);
-
- while (local_softirq_pending()) {
- /* Preempt disable stops cpu going offline.
- If already offline, we'll be on wrong CPU:
- don't process */
- if (cpu_is_offline((long)__bind_cpu))
- goto wait_to_die;
- local_irq_disable();
- if (local_softirq_pending())
- __do_softirq();
- local_irq_enable();
- sched_preempt_enable_no_resched();
- cond_resched();
- preempt_disable();
- rcu_note_context_switch((long)__bind_cpu);
- }
- preempt_enable();
- set_current_state(TASK_INTERRUPTIBLE);
- }
- __set_current_state(TASK_RUNNING);
- return 0;
+ return local_softirq_pending();
+}
-wait_to_die:
- preempt_enable();
- /* Wait for kthread_stop */
- set_current_state(TASK_INTERRUPTIBLE);
- while (!kthread_should_stop()) {
- schedule();
- set_current_state(TASK_INTERRUPTIBLE);
+static void run_ksoftirqd(unsigned int cpu)
+{
+ local_irq_disable();
+ if (local_softirq_pending()) {
+ __do_softirq();
+ local_irq_enable();
+ cond_resched();
+ rcu_note_context_switch(cpu);
}
- __set_current_state(TASK_RUNNING);
- return 0;
+ local_irq_enable();
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -841,50 +814,17 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
- int hotcpu = (unsigned long)hcpu;
- struct task_struct *p;
-
switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- p = kthread_create_on_node(run_ksoftirqd,
- hcpu,
- cpu_to_node(hotcpu),
- "ksoftirqd/%d", hotcpu);
- if (IS_ERR(p)) {
- printk("ksoftirqd for %i failed\n", hotcpu);
- return notifier_from_errno(PTR_ERR(p));
- }
- kthread_bind(p, hotcpu);
- per_cpu(ksoftirqd, hotcpu) = p;
- break;
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- wake_up_process(per_cpu(ksoftirqd, hotcpu));
- break;
#ifdef CONFIG_HOTPLUG_CPU
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- if (!per_cpu(ksoftirqd, hotcpu))
- break;
- /* Unbind so it can run. Fall thru. */
- kthread_bind(per_cpu(ksoftirqd, hotcpu),
- cpumask_any(cpu_online_mask));
case CPU_DEAD:
case CPU_DEAD_FROZEN: {
- static const struct sched_param param = {
- .sched_priority = MAX_RT_PRIO-1
- };
-
- p = per_cpu(ksoftirqd, hotcpu);
- per_cpu(ksoftirqd, hotcpu) = NULL;
- sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
- kthread_stop(p);
+ int hotcpu = (unsigned long)hcpu;
+
takeover_tasklets(hotcpu);
break;
}
#endif /* CONFIG_HOTPLUG_CPU */
- }
+ }
return NOTIFY_OK;
}
@@ -892,14 +832,19 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
.notifier_call = cpu_callback
};
+static struct smp_hotplug_thread softirq_threads = {
+ .store = &ksoftirqd,
+ .thread_should_run = ksoftirqd_should_run,
+ .thread_fn = run_ksoftirqd,
+ .thread_comm = "ksoftirqd/%u",
+};
+
static __init int spawn_ksoftirqd(void)
{
- void *cpu = (void *)(long)smp_processor_id();
- int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
-
- BUG_ON(err != NOTIFY_OK);
- cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
register_cpu_notifier(&cpu_nfb);
+
+ BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
+
return 0;
}
early_initcall(spawn_ksoftirqd);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 4b1dfba70f7c..9d4c8d5a1f53 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -22,6 +22,7 @@
#include <linux/notifier.h>
#include <linux/module.h>
#include <linux/sysctl.h>
+#include <linux/smpboot.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
@@ -29,16 +30,18 @@
int watchdog_enabled = 1;
int __read_mostly watchdog_thresh = 10;
+static int __read_mostly watchdog_disabled;
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
+static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
+static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
#ifdef CONFIG_HARDLOCKUP_DETECTOR
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
-static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
#endif
@@ -248,13 +251,15 @@ static void watchdog_overflow_callback(struct perf_event *event,
__this_cpu_write(hard_watchdog_warn, false);
return;
}
+#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+
static void watchdog_interrupt_count(void)
{
__this_cpu_inc(hrtimer_interrupts);
}
-#else
-static inline void watchdog_interrupt_count(void) { return; }
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+
+static int watchdog_nmi_enable(unsigned int cpu);
+static void watchdog_nmi_disable(unsigned int cpu);
/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
@@ -327,49 +332,68 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
return HRTIMER_RESTART;
}
+static void watchdog_set_prio(unsigned int policy, unsigned int prio)
+{
+ struct sched_param param = { .sched_priority = prio };
-/*
- * The watchdog thread - touches the timestamp.
- */
-static int watchdog(void *unused)
+ sched_setscheduler(current, policy, &param);
+}
+
+static void watchdog_enable(unsigned int cpu)
{
- struct sched_param param = { .sched_priority = 0 };
struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
- /* initialize timestamp */
- __touch_watchdog();
+ if (!watchdog_enabled) {
+ kthread_park(current);
+ return;
+ }
+
+ /* Enable the perf event */
+ watchdog_nmi_enable(cpu);
/* kick off the timer for the hardlockup detector */
+ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer->function = watchdog_timer_fn;
+
/* done here because hrtimer_start can only pin to smp_processor_id() */
hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),
HRTIMER_MODE_REL_PINNED);
- set_current_state(TASK_INTERRUPTIBLE);
- /*
- * Run briefly (kicked by the hrtimer callback function) once every
- * get_sample_period() seconds (4 seconds by default) to reset the
- * softlockup timestamp. If this gets delayed for more than
- * 2*watchdog_thresh seconds then the debug-printout triggers in
- * watchdog_timer_fn().
- */
- while (!kthread_should_stop()) {
- __touch_watchdog();
- schedule();
+ /* initialize timestamp */
+ watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
+ __touch_watchdog();
+}
- if (kthread_should_stop())
- break;
+static void watchdog_disable(unsigned int cpu)
+{
+ struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
- set_current_state(TASK_INTERRUPTIBLE);
- }
- /*
- * Drop the policy/priority elevation during thread exit to avoid a
- * scheduling latency spike.
- */
- __set_current_state(TASK_RUNNING);
- sched_setscheduler(current, SCHED_NORMAL, &param);
- return 0;
+ watchdog_set_prio(SCHED_NORMAL, 0);
+ hrtimer_cancel(hrtimer);
+ /* disable the perf event */
+ watchdog_nmi_disable(cpu);
}
+static int watchdog_should_run(unsigned int cpu)
+{
+ return __this_cpu_read(hrtimer_interrupts) !=
+ __this_cpu_read(soft_lockup_hrtimer_cnt);
+}
+
+/*
+ * The watchdog thread function - touches the timestamp.
+ *
+ * It only runs once every get_sample_period() seconds (4 seconds by
+ * default) to reset the softlockup timestamp. If this gets delayed
+ * for more than 2*watchdog_thresh seconds then the debug-printout
+ * triggers in watchdog_timer_fn().
+ */
+static void watchdog(unsigned int cpu)
+{
+ __this_cpu_write(soft_lockup_hrtimer_cnt,
+ __this_cpu_read(hrtimer_interrupts));
+ __touch_watchdog();
+}
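The should_run/thread_fn pair above is an edge-triggered handshake: the hrtimer callback bumps hrtimer_interrupts, the smpboot thread runs only while that count differs from soft_lockup_hrtimer_cnt, and each run consumes the difference before touching the timestamp. A compact standalone model of the same handshake; the kernel uses plain per-cpu counters, the atomics below exist only to make the sketch self-contained:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static _Atomic unsigned long hrtimer_ticks;	/* bumped by the "timer" */
static unsigned long handled_ticks;		/* private to the "thread" */

/* The thread only has work if the timer fired since the last run. */
static bool model_should_run(void)
{
	return atomic_load(&hrtimer_ticks) != handled_ticks;
}

static void model_run_once(void)
{
	handled_ticks = atomic_load(&hrtimer_ticks);
	/* ... touch the softlockup timestamp here ... */
}

int main(void)
{
	atomic_fetch_add(&hrtimer_ticks, 1);
	printf("work pending: %d\n", model_should_run());	/* 1 */
	model_run_once();
	printf("work pending: %d\n", model_should_run());	/* 0 */
	return 0;
}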
#ifdef CONFIG_HARDLOCKUP_DETECTOR
/*
@@ -379,7 +403,7 @@ static int watchdog(void *unused)
*/
static unsigned long cpu0_err;
-static int watchdog_nmi_enable(int cpu)
+static int watchdog_nmi_enable(unsigned int cpu)
{
struct perf_event_attr *wd_attr;
struct perf_event *event = per_cpu(watchdog_ev, cpu);
@@ -433,7 +457,7 @@ out:
return 0;
}
-static void watchdog_nmi_disable(int cpu)
+static void watchdog_nmi_disable(unsigned int cpu)
{
struct perf_event *event = per_cpu(watchdog_ev, cpu);
@@ -447,107 +471,35 @@ static void watchdog_nmi_disable(int cpu)
return;
}
#else
-static int watchdog_nmi_enable(int cpu) { return 0; }
-static void watchdog_nmi_disable(int cpu) { return; }
+static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
+static void watchdog_nmi_disable(unsigned int cpu) { return; }
#endif /* CONFIG_HARDLOCKUP_DETECTOR */
/* prepare/enable/disable routines */
-static void watchdog_prepare_cpu(int cpu)
-{
- struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
-
- WARN_ON(per_cpu(softlockup_watchdog, cpu));
- hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- hrtimer->function = watchdog_timer_fn;
-}
-
-static int watchdog_enable(int cpu)
-{
- struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
- int err = 0;
-
- /* enable the perf event */
- err = watchdog_nmi_enable(cpu);
-
- /* Regardless of err above, fall through and start softlockup */
-
- /* create the watchdog thread */
- if (!p) {
- struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
- p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
- if (IS_ERR(p)) {
- pr_err("softlockup watchdog for %i failed\n", cpu);
- if (!err) {
- /* if hardlockup hasn't already set this */
- err = PTR_ERR(p);
- /* and disable the perf event */
- watchdog_nmi_disable(cpu);
- }
- goto out;
- }
- sched_setscheduler(p, SCHED_FIFO, &param);
- kthread_bind(p, cpu);
- per_cpu(watchdog_touch_ts, cpu) = 0;
- per_cpu(softlockup_watchdog, cpu) = p;
- wake_up_process(p);
- }
-
-out:
- return err;
-}
-
-static void watchdog_disable(int cpu)
-{
- struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
- struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
-
- /*
- * cancel the timer first to stop incrementing the stats
- * and waking up the kthread
- */
- hrtimer_cancel(hrtimer);
-
- /* disable the perf event */
- watchdog_nmi_disable(cpu);
-
- /* stop the watchdog thread */
- if (p) {
- per_cpu(softlockup_watchdog, cpu) = NULL;
- kthread_stop(p);
- }
-}
-
/* sysctl functions */
#ifdef CONFIG_SYSCTL
static void watchdog_enable_all_cpus(void)
{
- int cpu;
-
- watchdog_enabled = 0;
-
- for_each_online_cpu(cpu)
- if (!watchdog_enable(cpu))
- /* if any cpu succeeds, watchdog is considered
- enabled for the system */
- watchdog_enabled = 1;
-
- if (!watchdog_enabled)
- pr_err("failed to be enabled on some cpus\n");
+ unsigned int cpu;
+ if (watchdog_disabled) {
+ watchdog_disabled = 0;
+ for_each_online_cpu(cpu)
+ kthread_unpark(per_cpu(softlockup_watchdog, cpu));
+ }
}
static void watchdog_disable_all_cpus(void)
{
- int cpu;
-
- for_each_online_cpu(cpu)
- watchdog_disable(cpu);
+ unsigned int cpu;
- /* if all watchdogs are disabled, then they are disabled for the system */
- watchdog_enabled = 0;
+ if (!watchdog_disabled) {
+ watchdog_disabled = 1;
+ for_each_online_cpu(cpu)
+ kthread_park(per_cpu(softlockup_watchdog, cpu));
+ }
}
-
/*
* proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
*/
@@ -557,73 +509,36 @@ int proc_dowatchdog(struct ctl_table *table, int write,
{
int ret;
+ if (watchdog_disabled < 0)
+ return -ENODEV;
+
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
- goto out;
+ return ret;
if (watchdog_enabled && watchdog_thresh)
watchdog_enable_all_cpus();
else
watchdog_disable_all_cpus();
-out:
return ret;
}
#endif /* CONFIG_SYSCTL */
-
-/*
- * Create/destroy watchdog threads as CPUs come and go:
- */
-static int __cpuinit
-cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
- int hotcpu = (unsigned long)hcpu;
-
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- watchdog_prepare_cpu(hotcpu);
- break;
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- if (watchdog_enabled)
- watchdog_enable(hotcpu);
- break;
-#ifdef CONFIG_HOTPLUG_CPU
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- watchdog_disable(hotcpu);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- watchdog_disable(hotcpu);
- break;
-#endif /* CONFIG_HOTPLUG_CPU */
- }
-
- /*
- * hardlockup and softlockup are not important enough
- * to block cpu bring up. Just always succeed and
- * rely on printk output to flag problems.
- */
- return NOTIFY_OK;
-}
-
-static struct notifier_block __cpuinitdata cpu_nfb = {
- .notifier_call = cpu_callback
+static struct smp_hotplug_thread watchdog_threads = {
+ .store = &softlockup_watchdog,
+ .thread_should_run = watchdog_should_run,
+ .thread_fn = watchdog,
+ .thread_comm = "watchdog/%u",
+ .setup = watchdog_enable,
+ .park = watchdog_disable,
+ .unpark = watchdog_enable,
};
void __init lockup_detector_init(void)
{
- void *cpu = (void *)(long)smp_processor_id();
- int err;
-
- err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
- WARN_ON(notifier_to_errno(err));
-
- cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
- register_cpu_notifier(&cpu_nfb);
-
- return;
+ if (smpboot_register_percpu_thread(&watchdog_threads)) {
+ pr_err("Failed to create watchdog threads, disabled\n");
+ watchdog_disabled = -ENODEV;
+ }
}
diff --git a/linaro/configs/big-LITTLE-MP.conf b/linaro/configs/big-LITTLE-MP.conf
new file mode 100644
index 000000000000..df35474eff10
--- /dev/null
+++ b/linaro/configs/big-LITTLE-MP.conf
@@ -0,0 +1,9 @@
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_FAIR_GROUP_SCHED=y
+CONFIG_NO_HZ=y
+CONFIG_SCHED_MC=y
+CONFIG_DISABLE_CPU_SCHED_DOMAIN_BALANCE=y
+CONFIG_SCHED_HMP=y
+CONFIG_HMP_FAST_CPU_MASK="0-1"
+CONFIG_HMP_SLOW_CPU_MASK="2-3"