aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/kernel-parameters.txt9
-rw-r--r--arch/arm/Kconfig85
-rw-r--r--arch/arm/include/asm/topology.h31
-rw-r--r--arch/arm/kernel/hw_breakpoint.c57
-rw-r--r--arch/arm/kernel/topology.c98
-rw-r--r--arch/ia64/include/asm/topology.h1
-rw-r--r--arch/tile/include/asm/topology.h1
-rw-r--r--include/linux/sched.h29
-rw-r--r--include/linux/topology.h3
-rw-r--r--include/trace/events/sched.h153
-rw-r--r--kernel/irq/irqdesc.c21
-rw-r--r--kernel/sched/core.c16
-rw-r--r--kernel/sched/debug.c39
-rw-r--r--kernel/sched/fair.c1951
-rw-r--r--kernel/sched/sched.h65
-rw-r--r--linaro/configs/big-LITTLE-MP.conf13
16 files changed, 2369 insertions, 203 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 9776f068306..7a0e553c82b 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1165,6 +1165,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
See comment before ip2_setup() in
drivers/char/ip2/ip2base.c.
+ irqaffinity= [SMP] Set the default irq affinity mask
+ Format:
+ <cpu number>,...,<cpu number>
+ or
+ <cpu number>-<cpu number>
+ (must be a positive range in ascending order)
+ or a mixture
+ <cpu number>,...,<cpu number>-<cpu number>
+
irqfixup [HW]
When an interrupt is not handled search all handlers
for it. Intended to get systems with badly broken
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index ade7e924bef..f8b7b7f31da 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1556,6 +1556,91 @@ config SCHED_SMT
MultiThreading at a cost of slightly increased overhead in some
places. If unsure say N here.
+config DISABLE_CPU_SCHED_DOMAIN_BALANCE
+ bool "(EXPERIMENTAL) Disable CPU level scheduler load-balancing"
+ help
+ Disables scheduler load-balancing at CPU sched domain level.
+
+config SCHED_HMP
+ bool "(EXPERIMENTAL) Heterogenous multiprocessor scheduling"
+ depends on DISABLE_CPU_SCHED_DOMAIN_BALANCE && SCHED_MC && FAIR_GROUP_SCHED && !SCHED_AUTOGROUP
+ help
+ Experimental scheduler optimizations for heterogeneous platforms.
+ Attempts to introspectively select task affinity to optimize power
+ and performance. Basic support for multiple (>2) cpu types is in place,
+ but it has only been tested with two types of cpus.
+ There is currently no support for migration of task groups, hence
+ !SCHED_AUTOGROUP. Furthermore, normal load-balancing must be disabled
+ between cpus of different type (DISABLE_CPU_SCHED_DOMAIN_BALANCE).
+
+config SCHED_HMP_PRIO_FILTER
+ bool "(EXPERIMENTAL) Filter HMP migrations by task priority"
+ depends on SCHED_HMP
+ default y
+ help
+ Enables task priority based HMP migration filter. Any task with
+ a NICE value above the threshold will always be on low-power cpus
+ with less compute capacity.
+
+config SCHED_HMP_PRIO_FILTER_VAL
+ int "NICE priority threshold"
+ default 5
+ depends on SCHED_HMP_PRIO_FILTER
+
+config HMP_FAST_CPU_MASK
+ string "HMP scheduler fast CPU mask"
+ depends on SCHED_HMP
+ help
+ Leave empty to use device tree information.
+ Specify the cpuids of the fast CPUs in the system as a list string,
+ e.g. cpuid 0+1 should be specified as 0-1.
+
+config HMP_SLOW_CPU_MASK
+ string "HMP scheduler slow CPU mask"
+ depends on SCHED_HMP
+ help
+ Leave empty to use device tree information.
+ Specify the cpuids of the slow CPUs in the system as a list string,
+ e.g. cpuid 0+1 should be specified as 0-1.
+
+config HMP_VARIABLE_SCALE
+ bool "Allows changing the load tracking scale through sysfs"
+ depends on SCHED_HMP
+ help
+ When turned on, this option exports the thresholds and load average
+ period value for the load tracking patches through sysfs.
+ The values can be modified to change the rate of load accumulation
+ and the thresholds used for HMP migration.
+ The load_avg_period_ms is the time in ms to reach a load average of
+ 0.5 for an idle task of 0 load average ratio that start a busy loop.
+ The up_threshold and down_threshold is the value to go to a faster
+ CPU or to go back to a slower cpu.
+ The {up,down}_threshold are devided by 1024 before being compared
+ to the load average.
+ For examples, with load_avg_period_ms = 128 and up_threshold = 512,
+ a running task with a load of 0 will be migrated to a bigger CPU after
+ 128ms, because after 128ms its load_avg_ratio is 0.5 and the real
+ up_threshold is 0.5.
+ This patch has the same behavior as changing the Y of the load
+ average computation to
+ (1002/1024)^(LOAD_AVG_PERIOD/load_avg_period_ms)
+ but it remove intermadiate overflows in computation.
+
+config HMP_FREQUENCY_INVARIANT_SCALE
+ bool "(EXPERIMENTAL) Frequency-Invariant Tracked Load for HMP"
+ depends on HMP_VARIABLE_SCALE && CPU_FREQ
+ help
+ Scales the current load contribution in line with the frequency
+ of the CPU that the task was executed on.
+ In this version, we use a simple linear scale derived from the
+ maximum frequency reported by CPUFreq.
+ Restricting tracked load to be scaled by the CPU's frequency
+ represents the consumption of possible compute capacity
+ (rather than consumption of actual instantaneous capacity as
+ normal) and allows the HMP migration's simple threshold
+ migration strategy to interact more predictably with CPUFreq's
+ asynchronous compute capacity changes.
+
config HAVE_ARM_SCU
bool
help
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 611edefaeaf..983fa7c153a 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -28,6 +28,37 @@ void store_cpu_topology(unsigned int cpuid);
const struct cpumask *cpu_coregroup_mask(int cpu);
int cluster_to_logical_mask(unsigned int socket_id, cpumask_t *cluster_mask);
+#ifdef CONFIG_DISABLE_CPU_SCHED_DOMAIN_BALANCE
+/* Common values for CPUs */
+#ifndef SD_CPU_INIT
+#define SD_CPU_INIT (struct sched_domain) { \
+ .min_interval = 1, \
+ .max_interval = 4, \
+ .busy_factor = 64, \
+ .imbalance_pct = 125, \
+ .cache_nice_tries = 1, \
+ .busy_idx = 2, \
+ .idle_idx = 1, \
+ .newidle_idx = 0, \
+ .wake_idx = 0, \
+ .forkexec_idx = 0, \
+ \
+ .flags = 0*SD_LOAD_BALANCE \
+ | 1*SD_BALANCE_NEWIDLE \
+ | 1*SD_BALANCE_EXEC \
+ | 1*SD_BALANCE_FORK \
+ | 0*SD_BALANCE_WAKE \
+ | 1*SD_WAKE_AFFINE \
+ | 0*SD_SHARE_CPUPOWER \
+ | 0*SD_SHARE_PKG_RESOURCES \
+ | 0*SD_SERIALIZE \
+ , \
+ .last_balance = jiffies, \
+ .balance_interval = 1, \
+}
+#endif
+#endif /* CONFIG_DISABLE_CPU_SCHED_DOMAIN_BALANCE */
+
#else
static inline void init_cpu_topology(void) { }
diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c
index 281bf330124..eed4d0cdd74 100644
--- a/arch/arm/kernel/hw_breakpoint.c
+++ b/arch/arm/kernel/hw_breakpoint.c
@@ -28,6 +28,7 @@
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
#include <linux/smp.h>
+#include <linux/cpu_pm.h>
#include <asm/cacheflush.h>
#include <asm/cputype.h>
@@ -42,6 +43,11 @@ static DEFINE_PER_CPU(struct perf_event *, bp_on_reg[ARM_MAX_BRP]);
/* Watchpoint currently in use for each WRP. */
static DEFINE_PER_CPU(struct perf_event *, wp_on_reg[ARM_MAX_WRP]);
+#ifdef CONFIG_CPU_PM
+/* Storage for OS Save and Restore. */
+static DEFINE_PER_CPU(u32, cpu_dscr);
+#endif
+
/* Number of BRP/WRP registers on this CPU. */
static int core_num_brps;
static int core_num_wrps;
@@ -990,6 +996,55 @@ static struct notifier_block __cpuinitdata dbg_reset_nb = {
.notifier_call = dbg_reset_notify,
};
+#ifdef CONFIG_CPU_PM
+static void os_save(int cpu)
+{
+ /* Set OS Lock. */
+ asm volatile("mcr p14, 0, %0, c1, c0, 4" : : "r" (0xC5ACCE55));
+ isb();
+
+ /* Save DSCRext. */
+ ARM_DBG_READ(c2, 2, per_cpu(cpu_dscr, cpu));
+}
+
+static void os_restore(int cpu)
+{
+ /* Restore DSCRext. */
+ ARM_DBG_WRITE(c2, 2, per_cpu(cpu_dscr, cpu));
+
+ /* Clear OS Lock. */
+ asm volatile("mcr p14, 0, %0, c1, c0, 4" : : "r" (0));
+ isb();
+}
+
+static int dbg_cpu_pm_notify(struct notifier_block *self, unsigned long action,
+ void *v)
+{
+ int cpu = smp_processor_id();
+
+ if (action == CPU_PM_ENTER)
+ os_save(cpu);
+ else if (action == CPU_PM_EXIT)
+ os_restore(cpu);
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata dbg_cpu_pm_nb = {
+ .notifier_call = dbg_cpu_pm_notify,
+};
+
+static void __init pm_init(void)
+{
+ if (get_debug_arch() == ARM_DEBUG_ARCH_V7_1)
+ cpu_pm_register_notifier(&dbg_cpu_pm_nb);
+}
+#else
+static inline void pm_init(void)
+{
+}
+#endif
+
static int __init arch_hw_breakpoint_init(void)
{
u32 dscr;
@@ -1048,6 +1103,8 @@ static int __init arch_hw_breakpoint_init(void)
/* Register hotplug notifier. */
register_cpu_notifier(&dbg_reset_nb);
+
+ pm_init();
return 0;
}
arch_initcall(arch_hw_breakpoint_init);
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index d25a59657d5..4d34e0e7e94 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -226,6 +226,11 @@ static inline void update_cpu_power(unsigned int cpuid, unsigned int mpidr) {}
*/
struct cputopo_arm cpu_topology[NR_CPUS];
+int arch_sd_share_power_line(void)
+{
+ return 1*SD_SHARE_POWERLINE;
+}
+
const struct cpumask *cpu_coregroup_mask(int cpu)
{
return &cpu_topology[cpu].core_sibling;
@@ -317,6 +322,99 @@ void store_cpu_topology(unsigned int cpuid)
cpu_topology[cpuid].socket_id, mpidr);
}
+
+#ifdef CONFIG_SCHED_HMP
+
+static const char * const little_cores[] = {
+ "arm,cortex-a7",
+ NULL,
+};
+
+static bool is_little_cpu(struct device_node *cn)
+{
+ const char * const *lc;
+ for (lc = little_cores; *lc; lc++)
+ if (of_device_is_compatible(cn, *lc))
+ return true;
+ return false;
+}
+
+void __init arch_get_fast_and_slow_cpus(struct cpumask *fast,
+ struct cpumask *slow)
+{
+ struct device_node *cn = NULL;
+ int cpu = 0;
+
+ cpumask_clear(fast);
+ cpumask_clear(slow);
+
+ /*
+ * Use the config options if they are given. This helps testing
+ * HMP scheduling on systems without a big.LITTLE architecture.
+ */
+ if (strlen(CONFIG_HMP_FAST_CPU_MASK) && strlen(CONFIG_HMP_SLOW_CPU_MASK)) {
+ if (cpulist_parse(CONFIG_HMP_FAST_CPU_MASK, fast))
+ WARN(1, "Failed to parse HMP fast cpu mask!\n");
+ if (cpulist_parse(CONFIG_HMP_SLOW_CPU_MASK, slow))
+ WARN(1, "Failed to parse HMP slow cpu mask!\n");
+ return;
+ }
+
+ /*
+ * Else, parse device tree for little cores.
+ */
+ while ((cn = of_find_node_by_type(cn, "cpu"))) {
+
+ if (cpu >= num_possible_cpus())
+ break;
+
+ if (is_little_cpu(cn))
+ cpumask_set_cpu(cpu, slow);
+ else
+ cpumask_set_cpu(cpu, fast);
+
+ cpu++;
+ }
+
+ if (!cpumask_empty(fast) && !cpumask_empty(slow))
+ return;
+
+ /*
+ * We didn't find both big and little cores so let's call all cores
+ * fast as this will keep the system running, with all cores being
+ * treated equal.
+ */
+ cpumask_setall(fast);
+ cpumask_clear(slow);
+}
+
+void __init arch_get_hmp_domains(struct list_head *hmp_domains_list)
+{
+ struct cpumask hmp_fast_cpu_mask;
+ struct cpumask hmp_slow_cpu_mask;
+ struct hmp_domain *domain;
+
+ arch_get_fast_and_slow_cpus(&hmp_fast_cpu_mask, &hmp_slow_cpu_mask);
+
+ /*
+ * Initialize hmp_domains
+ * Must be ordered with respect to compute capacity.
+ * Fastest domain at head of list.
+ */
+ if(!cpumask_empty(&hmp_slow_cpu_mask)) {
+ domain = (struct hmp_domain *)
+ kmalloc(sizeof(struct hmp_domain), GFP_KERNEL);
+ cpumask_copy(&domain->cpus, &hmp_slow_cpu_mask);
+ list_add(&domain->hmp_domains, hmp_domains_list);
+ }
+ domain = (struct hmp_domain *)
+ kmalloc(sizeof(struct hmp_domain), GFP_KERNEL);
+ cpumask_copy(&domain->cpus, &hmp_fast_cpu_mask);
+ list_add(&domain->hmp_domains, hmp_domains_list);
+}
+#endif /* CONFIG_SCHED_HMP */
+
+
/*
* cluster_to_logical_mask - return cpu logical mask of CPUs in a cluster
* @socket_id: cluster HW identifier
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index a2496e449b7..065c7209854 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -65,6 +65,7 @@ void build_cpu_to_node_map(void);
| SD_BALANCE_EXEC \
| SD_BALANCE_FORK \
| SD_WAKE_AFFINE, \
+ | arch_sd_share_power_line() \
.last_balance = jiffies, \
.balance_interval = 1, \
.nr_balance_failed = 0, \
diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h
index d5e86c9f74f..7e9bdfac6f6 100644
--- a/arch/tile/include/asm/topology.h
+++ b/arch/tile/include/asm/topology.h
@@ -71,6 +71,7 @@ static inline const struct cpumask *cpumask_of_node(int node)
| 0*SD_WAKE_AFFINE \
| 0*SD_SHARE_CPUPOWER \
| 0*SD_SHARE_PKG_RESOURCES \
+ | arch_sd_share_power_line() \
| 0*SD_SERIALIZE \
, \
.last_balance = jiffies, \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0dd42a02df2..01eea702e35 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -818,6 +818,7 @@ enum cpu_idle_type {
#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */
#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */
+#define SD_SHARE_POWERLINE 0x0100 /* Domain members share power domain */
#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */
@@ -994,6 +995,12 @@ unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu);
bool cpus_share_cache(int this_cpu, int that_cpu);
+#ifdef CONFIG_SCHED_HMP
+struct hmp_domain {
+ struct cpumask cpus;
+ struct list_head hmp_domains;
+};
+#endif /* CONFIG_SCHED_HMP */
#else /* CONFIG_SMP */
struct sched_domain_attr;
@@ -1061,6 +1068,7 @@ struct sched_class {
#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
+ void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
void (*post_schedule) (struct rq *this_rq);
@@ -1095,6 +1103,24 @@ struct load_weight {
unsigned long weight, inv_weight;
};
+struct sched_avg {
+ /*
+ * These sums represent an infinite geometric series and so are bound
+ * above by 1024/(1-y). Thus we only need a u32 to store them for for all
+ * choices of y < 1-2^(-32)*1024.
+ */
+ u32 runnable_avg_sum, runnable_avg_period;
+ u64 last_runnable_update;
+ s64 decay_count;
+ unsigned long load_avg_contrib;
+ unsigned long load_avg_ratio;
+#ifdef CONFIG_SCHED_HMP
+ u64 hmp_last_up_migration;
+ u64 hmp_last_down_migration;
+#endif
+ u32 usage_avg_sum;
+};
+
#ifdef CONFIG_SCHEDSTATS
struct sched_statistics {
u64 wait_start;
@@ -1155,6 +1181,9 @@ struct sched_entity {
/* rq "owned" by this entity/group: */
struct cfs_rq *my_q;
#endif
+#ifdef CONFIG_SMP
+ struct sched_avg avg;
+#endif
};
struct sched_rt_entity {
diff --git a/include/linux/topology.h b/include/linux/topology.h
index d3cf0d6e771..8e958b2d938 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -99,6 +99,7 @@ int arch_update_cpu_topology(void);
| 1*SD_WAKE_AFFINE \
| 1*SD_SHARE_CPUPOWER \
| 1*SD_SHARE_PKG_RESOURCES \
+ | arch_sd_share_power_line() \
| 0*SD_SERIALIZE \
| 0*SD_PREFER_SIBLING \
| arch_sd_sibling_asym_packing() \
@@ -131,6 +132,7 @@ int arch_update_cpu_topology(void);
| 1*SD_WAKE_AFFINE \
| 0*SD_SHARE_CPUPOWER \
| 1*SD_SHARE_PKG_RESOURCES \
+ | arch_sd_share_power_line() \
| 0*SD_SERIALIZE \
, \
.last_balance = jiffies, \
@@ -161,6 +163,7 @@ int arch_update_cpu_topology(void);
| 1*SD_WAKE_AFFINE \
| 0*SD_SHARE_CPUPOWER \
| 0*SD_SHARE_PKG_RESOURCES \
+ | arch_sd_share_power_line() \
| 0*SD_SERIALIZE \
| 1*SD_PREFER_SIBLING \
, \
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 5a8671e8a67..501aa32eb2f 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -430,6 +430,159 @@ TRACE_EVENT(sched_pi_setprio,
__entry->oldprio, __entry->newprio)
);
+/*
+ * Tracepoint for showing tracked load contribution.
+ */
+TRACE_EVENT(sched_task_load_contrib,
+
+ TP_PROTO(struct task_struct *tsk, unsigned long load_contrib),
+
+ TP_ARGS(tsk, load_contrib),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(unsigned long, load_contrib)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->load_contrib = load_contrib;
+ ),
+
+ TP_printk("comm=%s pid=%d load_contrib=%lu",
+ __entry->comm, __entry->pid,
+ __entry->load_contrib)
+);
+
+/*
+ * Tracepoint for showing tracked task runnable ratio [0..1023].
+ */
+TRACE_EVENT(sched_task_runnable_ratio,
+
+ TP_PROTO(struct task_struct *tsk, unsigned long ratio),
+
+ TP_ARGS(tsk, ratio),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(unsigned long, ratio)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->ratio = ratio;
+ ),
+
+ TP_printk("comm=%s pid=%d ratio=%lu",
+ __entry->comm, __entry->pid,
+ __entry->ratio)
+);
+
+/*
+ * Tracepoint for showing tracked rq runnable ratio [0..1023].
+ */
+TRACE_EVENT(sched_rq_runnable_ratio,
+
+ TP_PROTO(int cpu, unsigned long ratio),
+
+ TP_ARGS(cpu, ratio),
+
+ TP_STRUCT__entry(
+ __field(int, cpu)
+ __field(unsigned long, ratio)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->ratio = ratio;
+ ),
+
+ TP_printk("cpu=%d ratio=%lu",
+ __entry->cpu,
+ __entry->ratio)
+);
+
+/*
+ * Tracepoint for showing tracked rq runnable load.
+ */
+TRACE_EVENT(sched_rq_runnable_load,
+
+ TP_PROTO(int cpu, u64 load),
+
+ TP_ARGS(cpu, load),
+
+ TP_STRUCT__entry(
+ __field(int, cpu)
+ __field(u64, load)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->load = load;
+ ),
+
+ TP_printk("cpu=%d load=%llu",
+ __entry->cpu,
+ __entry->load)
+);
+
+/*
+ * Tracepoint for showing tracked task cpu usage ratio [0..1023].
+ */
+TRACE_EVENT(sched_task_usage_ratio,
+
+ TP_PROTO(struct task_struct *tsk, unsigned long ratio),
+
+ TP_ARGS(tsk, ratio),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(unsigned long, ratio)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->ratio = ratio;
+ ),
+
+ TP_printk("comm=%s pid=%d ratio=%lu",
+ __entry->comm, __entry->pid,
+ __entry->ratio)
+);
+
+/*
+ * Tracepoint for HMP (CONFIG_SCHED_HMP) task migrations.
+ */
+TRACE_EVENT(sched_hmp_migrate,
+
+ TP_PROTO(struct task_struct *tsk, int dest, int force),
+
+ TP_ARGS(tsk, dest, force),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(int, dest)
+ __field(int, force)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->dest = dest;
+ __entry->force = force;
+ ),
+
+ TP_printk("comm=%s pid=%d dest=%d force=%d",
+ __entry->comm, __entry->pid,
+ __entry->dest, __entry->force)
+);
#endif /* _TRACE_SCHED_H */
/* This part must be outside protection */
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 192a302d6cf..473b2b6eccb 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -23,10 +23,27 @@
static struct lock_class_key irq_desc_lock_class;
#if defined(CONFIG_SMP)
+static int __init irq_affinity_setup(char *str)
+{
+ zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
+ cpulist_parse(str, irq_default_affinity);
+ /*
+ * Set at least the boot cpu. We don't want to end up with
+ * bugreports caused by random comandline masks
+ */
+ cpumask_set_cpu(smp_processor_id(), irq_default_affinity);
+ return 1;
+}
+__setup("irqaffinity=", irq_affinity_setup);
+
static void __init init_irq_default_affinity(void)
{
- alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
- cpumask_setall(irq_default_affinity);
+#ifdef CONFIG_CPUMASK_OFFSTACK
+ if (!irq_default_affinity)
+ zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
+#endif
+ if (cpumask_empty(irq_default_affinity))
+ cpumask_setall(irq_default_affinity);
}
#else
static void __init init_irq_default_affinity(void)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927fda71..e34e55868f3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -952,6 +952,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
trace_sched_migrate_task(p, new_cpu);
if (task_cpu(p) != new_cpu) {
+ if (p->sched_class->migrate_task_rq)
+ p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
}
@@ -1524,6 +1526,14 @@ static void __sched_fork(struct task_struct *p)
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);
+#ifdef CONFIG_SMP
+ p->se.avg.runnable_avg_period = 0;
+ p->se.avg.runnable_avg_sum = 0;
+#ifdef CONFIG_SCHED_HMP
+ p->se.avg.hmp_last_up_migration = 0;
+ p->se.avg.hmp_last_down_migration = 0;
+#endif
+#endif
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
@@ -5537,6 +5547,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
rcu_assign_pointer(rq->sd, sd);
destroy_sched_domains(tmp, cpu);
+ update_packing_domain(cpu);
update_top_cache_domain(cpu);
}
@@ -5813,6 +5824,11 @@ int __weak arch_sd_sibling_asym_packing(void)
return 0*SD_ASYM_PACKING;
}
+int __weak arch_sd_share_power_line(void)
+{
+ return 1*SD_SHARE_POWERLINE;
+}
+
/*
* Initializers for schedule domains
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 6f79596e0ea..b9d54d0d7bb 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -61,14 +61,20 @@ static unsigned long nsec_low(unsigned long long nsec)
static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
{
struct sched_entity *se = tg->se[cpu];
- if (!se)
- return;
#define P(F) \
SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
#define PN(F) \
SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
+ if (!se) {
+ struct sched_avg *avg = &cpu_rq(cpu)->avg;
+ P(avg->runnable_avg_sum);
+ P(avg->runnable_avg_period);
+ return;
+ }
+
+
PN(se->exec_start);
PN(se->vruntime);
PN(se->sum_exec_runtime);
@@ -85,6 +91,13 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
P(se->statistics.wait_count);
#endif
P(se->load.weight);
+#ifdef CONFIG_SMP
+ P(se->avg.runnable_avg_sum);
+ P(se->avg.runnable_avg_period);
+ P(se->avg.usage_avg_sum);
+ P(se->avg.load_avg_contrib);
+ P(se->avg.decay_count);
+#endif
#undef PN
#undef P
}
@@ -206,14 +219,20 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
- SPLIT_NS(cfs_rq->load_avg));
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
- SPLIT_NS(cfs_rq->load_period));
- SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
- cfs_rq->load_contribution);
- SEQ_printf(m, " .%-30s: %d\n", "load_tg",
- atomic_read(&cfs_rq->tg->load_weight));
+ SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg",
+ cfs_rq->runnable_load_avg);
+ SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg",
+ cfs_rq->blocked_load_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
+ atomic64_read(&cfs_rq->tg->load_avg));
+ SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib",
+ cfs_rq->tg_load_contrib);
+ SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
+ cfs_rq->tg_runnable_contrib);
+ SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
+ atomic_read(&cfs_rq->tg->runnable_avg));
+ SEQ_printf(m, " .%-30s: %d\n", "tg->usage_avg",
+ atomic_read(&cfs_rq->tg->usage_avg));
#endif
print_cfs_group_stats(m, cpu, cfs_rq->tg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b800a14b99..8046acce74d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -28,9 +28,20 @@
#include <linux/interrupt.h>
#include <trace/events/sched.h>
+#ifdef CONFIG_HMP_VARIABLE_SCALE
+#include <linux/sysfs.h>
+#include <linux/vmalloc.h>
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/* Include cpufreq header to add a notifier so that cpu frequency
+ * scaling can track the current CPU frequency
+ */
+#include <linux/cpufreq.h>
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+#endif /* CONFIG_HMP_VARIABLE_SCALE */
#include "sched.h"
+
/*
* Targeted preemption latency for CPU-bound tasks:
* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
@@ -157,6 +168,76 @@ void sched_init_granularity(void)
update_sysctl();
}
+
+/*
+ * Save the id of the optimal CPU that should be used to pack small tasks
+ * The value -1 is used when no buddy has been found
+ */
+DEFINE_PER_CPU(int, sd_pack_buddy);
+
+/* Look for the best buddy CPU that can be used to pack small tasks
+ * We make the assumption that it doesn't wort to pack on CPU that share the
+ * same powerline. We looks for the 1st sched_domain without the
+ * SD_SHARE_POWERLINE flag. Then We look for the sched_group witht the lowest
+ * power per core based on the assumption that their power efficiency is
+ * better */
+void update_packing_domain(int cpu)
+{
+ struct sched_domain *sd;
+ int id = -1;
+
+ sd = highest_flag_domain(cpu, SD_SHARE_POWERLINE);
+ if (!sd)
+ sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
+ else
+ if (cpumask_first(sched_domain_span(sd)) == cpu || !sd->parent)
+ sd = sd->parent;
+
+ while (sd) {
+ struct sched_group *sg = sd->groups;
+ struct sched_group *pack = sg;
+ struct sched_group *tmp = sg->next;
+
+ /* 1st CPU of the sched domain is a good candidate */
+ if (id == -1)
+ id = cpumask_first(sched_domain_span(sd));
+
+ /* Find sched group of candidate */
+ tmp = sd->groups;
+ do {
+ if (cpumask_test_cpu(id, sched_group_cpus(tmp))) {
+ sg = tmp;
+ break;
+ }
+ } while (tmp = tmp->next, tmp != sd->groups);
+
+ pack = sg;
+ tmp = sg->next;
+
+ /* loop the sched groups to find the best one */
+ while (tmp != sg) {
+ if (tmp->sgp->power * sg->group_weight <
+ sg->sgp->power * tmp->group_weight)
+ pack = tmp;
+ tmp = tmp->next;
+ }
+
+ /* we have found a better group */
+ if (pack != sg)
+ id = cpumask_first(sched_group_cpus(pack));
+
+ /* Look for another CPU than itself */
+ if ((id != cpu)
+ || ((sd->parent) && !(sd->parent->flags && SD_LOAD_BALANCE)))
+ break;
+
+ sd = sd->parent;
+ }
+
+ pr_info("CPU%d packing on CPU%d\n", cpu, id);
+ per_cpu(sd_pack_buddy, cpu) = id;
+}
+
#if BITS_PER_LONG == 32
# define WMULT_CONST (~0UL)
#else
@@ -259,6 +340,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return grp->my_q;
}
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
+ int force_update);
+
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (!cfs_rq->on_list) {
@@ -278,6 +362,8 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
}
cfs_rq->on_list = 1;
+ /* We should have no load, but we need to update last_decay. */
+ update_cfs_rq_blocked_load(cfs_rq, 0);
}
}
@@ -653,9 +739,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
-static void update_cfs_shares(struct cfs_rq *cfs_rq);
-
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
@@ -675,10 +758,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
curr->vruntime += delta_exec_weighted;
update_min_vruntime(cfs_rq);
-
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
- cfs_rq->load_unacc_exec_time += delta_exec;
-#endif
}
static void update_curr(struct cfs_rq *cfs_rq)
@@ -801,72 +880,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-/* we need this in update_cfs_load and load-balance functions below */
-static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
# ifdef CONFIG_SMP
-static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
- int global_update)
-{
- struct task_group *tg = cfs_rq->tg;
- long load_avg;
-
- load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
- load_avg -= cfs_rq->load_contribution;
-
- if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
- atomic_add(load_avg, &tg->load_weight);
- cfs_rq->load_contribution += load_avg;
- }
-}
-
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
- u64 period = sysctl_sched_shares_window;
- u64 now, delta;
- unsigned long load = cfs_rq->load.weight;
-
- if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
- return;
-
- now = rq_of(cfs_rq)->clock_task;
- delta = now - cfs_rq->load_stamp;
-
- /* truncate load history at 4 idle periods */
- if (cfs_rq->load_stamp > cfs_rq->load_last &&
- now - cfs_rq->load_last > 4 * period) {
- cfs_rq->load_period = 0;
- cfs_rq->load_avg = 0;
- delta = period - 1;
- }
-
- cfs_rq->load_stamp = now;
- cfs_rq->load_unacc_exec_time = 0;
- cfs_rq->load_period += delta;
- if (load) {
- cfs_rq->load_last = now;
- cfs_rq->load_avg += delta * load;
- }
-
- /* consider updating load contribution on each fold or truncate */
- if (global_update || cfs_rq->load_period > period
- || !cfs_rq->load_period)
- update_cfs_rq_load_contribution(cfs_rq, global_update);
-
- while (cfs_rq->load_period > period) {
- /*
- * Inline assembly required to prevent the compiler
- * optimising this loop into a divmod call.
- * See __iter_div_u64_rem() for another example of this.
- */
- asm("" : "+rm" (cfs_rq->load_period));
- cfs_rq->load_period /= 2;
- cfs_rq->load_avg /= 2;
- }
-
- if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
- list_del_leaf_cfs_rq(cfs_rq);
-}
-
static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
{
long tg_weight;
@@ -876,8 +890,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
* to gain a more accurate current total weight. See
* update_cfs_rq_load_contribution().
*/
- tg_weight = atomic_read(&tg->load_weight);
- tg_weight -= cfs_rq->load_contribution;
+ tg_weight = atomic64_read(&tg->load_avg);
+ tg_weight -= cfs_rq->tg_load_contrib;
tg_weight += cfs_rq->load.weight;
return tg_weight;
@@ -901,27 +915,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
return shares;
}
-
-static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
- if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
- update_cfs_load(cfs_rq, 0);
- update_cfs_shares(cfs_rq);
- }
-}
# else /* CONFIG_SMP */
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
-}
-
static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
return tg->shares;
}
-
-static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
-}
# endif /* CONFIG_SMP */
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
@@ -939,6 +937,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
account_entity_enqueue(cfs_rq, se);
}
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
+
static void update_cfs_shares(struct cfs_rq *cfs_rq)
{
struct task_group *tg;
@@ -958,18 +958,653 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
reweight_entity(cfs_rq_of(se), se, shares);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
{
}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+#ifdef CONFIG_SMP
+/*
+ * We choose a half-life close to 1 scheduling period.
+ * Note: The tables below are dependent on this value.
+ */
+#define LOAD_AVG_PERIOD 32
+#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
+#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
+
+/* Precomputed fixed inverse multiplies for multiplication by y^n */
+static const u32 runnable_avg_yN_inv[] = {
+ 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
+ 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
+ 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
+ 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
+ 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
+ 0x85aac367, 0x82cd8698,
+};
+
+/*
+ * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
+ * over-estimates when re-combining.
+ */
+static const u32 runnable_avg_yN_sum[] = {
+ 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
+ 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
+ 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
+};
+
+/*
+ * Approximate:
+ * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
+ */
+static __always_inline u64 decay_load(u64 val, u64 n)
{
+ unsigned int local_n;
+
+ if (!n)
+ return val;
+ else if (unlikely(n > LOAD_AVG_PERIOD * 63))
+ return 0;
+
+ /* after bounds checking we can collapse to 32-bit */
+ local_n = n;
+
+ /*
+ * As y^PERIOD = 1/2, we can combine
+ * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
+ * With a look-up table which covers k^n (n<PERIOD)
+ *
+ * To achieve constant time decay_load.
+ */
+ if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
+ val >>= local_n / LOAD_AVG_PERIOD;
+ local_n %= LOAD_AVG_PERIOD;
+ }
+
+ val *= runnable_avg_yN_inv[local_n];
+ /* We don't use SRR here since we always want to round down. */
+ return val >> 32;
}
-static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+/*
+ * For updates fully spanning n periods, the contribution to runnable
+ * average will be: \Sum 1024*y^n
+ *
+ * We can compute this reasonably efficiently by combining:
+ * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
+ */
+static u32 __compute_runnable_contrib(u64 n)
{
+ u32 contrib = 0;
+
+ if (likely(n <= LOAD_AVG_PERIOD))
+ return runnable_avg_yN_sum[n];
+ else if (unlikely(n >= LOAD_AVG_MAX_N))
+ return LOAD_AVG_MAX;
+
+ /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
+ do {
+ contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
+ contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
+
+ n -= LOAD_AVG_PERIOD;
+ } while (n > LOAD_AVG_PERIOD);
+
+ contrib = decay_load(contrib, n);
+ return contrib + runnable_avg_yN_sum[n];
}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+#ifdef CONFIG_HMP_VARIABLE_SCALE
+
+#define HMP_VARIABLE_SCALE_SHIFT 16ULL
+struct hmp_global_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct kobject *kobj,
+ struct attribute *attr, char *buf);
+ ssize_t (*store)(struct kobject *a, struct attribute *b,
+ const char *c, size_t count);
+ int *value;
+ int (*to_sysfs)(int);
+ int (*from_sysfs)(int);
+};
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+#define HMP_DATA_SYSFS_MAX 4
+#else
+#define HMP_DATA_SYSFS_MAX 3
+#endif
+
+struct hmp_data_struct {
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ int freqinvar_load_scale_enabled;
+#endif
+ int multiplier; /* used to scale the time delta */
+ struct attribute_group attr_group;
+ struct attribute *attributes[HMP_DATA_SYSFS_MAX + 1];
+ struct hmp_global_attr attr[HMP_DATA_SYSFS_MAX];
+} hmp_data;
+
+static u64 hmp_variable_scale_convert(u64 delta);
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/* Frequency-Invariant Load Modification:
+ * Loads are calculated as in PJT's patch however we also scale the current
+ * contribution in line with the frequency of the CPU that the task was
+ * executed on.
+ * In this version, we use a simple linear scale derived from the maximum
+ * frequency reported by CPUFreq. As an example:
+ *
+ * Consider that we ran a task for 100% of the previous interval.
+ *
+ * Our CPU was under asynchronous frequency control through one of the
+ * CPUFreq governors.
+ *
+ * The CPUFreq governor reports that it is able to scale the CPU between
+ * 500MHz and 1GHz.
+ *
+ * During the period, the CPU was running at 1GHz.
+ *
+ * In this case, our load contribution for that period is calculated as
+ * 1 * (number_of_active_microseconds)
+ *
+ * This results in our task being able to accumulate maximum load as normal.
+ *
+ *
+ * Consider now that our CPU was executing at 500MHz.
+ *
+ * We now scale the load contribution such that it is calculated as
+ * 0.5 * (number_of_active_microseconds)
+ *
+ * Our task can only record 50% maximum load during this period.
+ *
+ * This represents the task consuming 50% of the CPU's *possible* compute
+ * capacity. However the task did consume 100% of the CPU's *available*
+ * compute capacity which is the value seen by the CPUFreq governor and
+ * user-side CPU Utilization tools.
+ *
+ * Restricting tracked load to be scaled by the CPU's frequency accurately
+ * represents the consumption of possible compute capacity and allows the
+ * HMP migration's simple threshold migration strategy to interact more
+ * predictably with CPUFreq's asynchronous compute capacity changes.
+ */
+#define SCHED_FREQSCALE_SHIFT 10
+struct cpufreq_extents {
+ u32 curr_scale;
+ u32 min;
+ u32 max;
+ u32 flags;
+};
+/* Flag set when the governor in use only allows one frequency.
+ * Disables scaling.
+ */
+#define SCHED_LOAD_FREQINVAR_SINGLEFREQ 0x01
+
+static struct cpufreq_extents freq_scale[CONFIG_NR_CPUS];
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+#endif /* CONFIG_HMP_VARIABLE_SCALE */
+
+/* We can represent the historical contribution to runnable average as the
+ * coefficients of a geometric series. To do this we sub-divide our runnable
+ * history into segments of approximately 1ms (1024us); label the segment that
+ * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
+ *
+ * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
+ * p0 p1 p2
+ * (now) (~1ms ago) (~2ms ago)
+ *
+ * Let u_i denote the fraction of p_i that the entity was runnable.
+ *
+ * We then designate the fractions u_i as our co-efficients, yielding the
+ * following representation of historical load:
+ * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
+ *
+ * We choose y based on the with of a reasonably scheduling period, fixing:
+ * y^32 = 0.5
+ *
+ * This means that the contribution to load ~32ms ago (u_32) will be weighted
+ * approximately half as much as the contribution to load within the last ms
+ * (u_0).
+ *
+ * When a period "rolls over" and we have new u_0`, multiplying the previous
+ * sum again by y is sufficient to update:
+ * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
+ * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
+ */
+static __always_inline int __update_entity_runnable_avg(u64 now,
+ struct sched_avg *sa,
+ int runnable,
+ int running,
+ int cpu)
+{
+ u64 delta, periods;
+ u32 runnable_contrib;
+ int delta_w, decayed = 0;
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ u64 scaled_delta;
+ u32 scaled_runnable_contrib;
+ int scaled_delta_w;
+ u32 curr_scale = 1024;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+
+ delta = now - sa->last_runnable_update;
+#ifdef CONFIG_HMP_VARIABLE_SCALE
+ delta = hmp_variable_scale_convert(delta);
+#endif
+ /*
+ * This should only happen when time goes backwards, which it
+ * unfortunately does during sched clock init when we swap over to TSC.
+ */
+ if ((s64)delta < 0) {
+ sa->last_runnable_update = now;
+ return 0;
+ }
+
+ /*
+ * Use 1024ns as the unit of measurement since it's a reasonable
+ * approximation of 1us and fast to compute.
+ */
+ delta >>= 10;
+ if (!delta)
+ return 0;
+ sa->last_runnable_update = now;
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ /* retrieve scale factor for load */
+ if (hmp_data.freqinvar_load_scale_enabled)
+ curr_scale = freq_scale[cpu].curr_scale;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+
+ /* delta_w is the amount already accumulated against our next period */
+ delta_w = sa->runnable_avg_period % 1024;
+ if (delta + delta_w >= 1024) {
+ /* period roll-over */
+ decayed = 1;
+
+ /*
+ * Now that we know we're crossing a period boundary, figure
+ * out how much from delta we need to complete the current
+ * period and accrue it.
+ */
+ delta_w = 1024 - delta_w;
+ /* scale runnable time if necessary */
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ scaled_delta_w = (delta_w * curr_scale)
+ >> SCHED_FREQSCALE_SHIFT;
+ if (runnable)
+ sa->runnable_avg_sum += scaled_delta_w;
+ if (running)
+ sa->usage_avg_sum += scaled_delta_w;
+#else
+ if (runnable)
+ sa->runnable_avg_sum += delta_w;
+ if (running)
+ sa->usage_avg_sum += delta_w;
+#endif /* #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+ sa->runnable_avg_period += delta_w;
+
+ delta -= delta_w;
+
+ /* Figure out how many additional periods this update spans */
+ periods = delta / 1024;
+ delta %= 1024;
+ /* decay the load we have accumulated so far */
+ sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
+ periods + 1);
+ sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
+ periods + 1);
+ sa->usage_avg_sum = decay_load(sa->usage_avg_sum, periods + 1);
+ /* add the contribution from this period */
+ /* Efficiently calculate \sum (1..n_period) 1024*y^i */
+ runnable_contrib = __compute_runnable_contrib(periods);
+ /* Apply load scaling if necessary.
+ * Note that multiplying the whole series is same as
+ * multiplying all terms
+ */
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ scaled_runnable_contrib = (runnable_contrib * curr_scale)
+ >> SCHED_FREQSCALE_SHIFT;
+ if (runnable)
+ sa->runnable_avg_sum += scaled_runnable_contrib;
+ if (running)
+ sa->usage_avg_sum += scaled_runnable_contrib;
+#else
+ if (runnable)
+ sa->runnable_avg_sum += runnable_contrib;
+ if (running)
+ sa->usage_avg_sum += runnable_contrib;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+ sa->runnable_avg_period += runnable_contrib;
+ }
+
+ /* Remainder of delta accrued against u_0` */
+ /* scale if necessary */
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ scaled_delta = ((delta * curr_scale) >> SCHED_FREQSCALE_SHIFT);
+ if (runnable)
+ sa->runnable_avg_sum += scaled_delta;
+ if (running)
+ sa->usage_avg_sum += scaled_delta;
+#else
+ if (runnable)
+ sa->runnable_avg_sum += delta;
+ if (running)
+ sa->usage_avg_sum += delta;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+ sa->runnable_avg_period += delta;
+
+ return decayed;
+}
+
+/* Synchronize an entity's decay with its parenting cfs_rq.*/
+static inline u64 __synchronize_entity_decay(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 decays = atomic64_read(&cfs_rq->decay_counter);
+
+ decays -= se->avg.decay_count;
+ if (!decays)
+ return 0;
+
+ se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
+ se->avg.decay_count = 0;
+
+ return decays;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
+ int force_update)
+{
+ struct task_group *tg = cfs_rq->tg;
+ s64 tg_contrib;
+
+ tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
+ tg_contrib -= cfs_rq->tg_load_contrib;
+
+ if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
+ atomic64_add(tg_contrib, &tg->load_avg);
+ cfs_rq->tg_load_contrib += tg_contrib;
+ }
+}
+
+/*
+ * Aggregate cfs_rq runnable averages into an equivalent task_group
+ * representation for computing load contributions.
+ */
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+ struct cfs_rq *cfs_rq)
+{
+ struct task_group *tg = cfs_rq->tg;
+ long contrib, usage_contrib;
+
+ /* The fraction of a cpu used by this cfs_rq */
+ contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
+ sa->runnable_avg_period + 1);
+ contrib -= cfs_rq->tg_runnable_contrib;
+
+ usage_contrib = div_u64(sa->usage_avg_sum << NICE_0_SHIFT,
+ sa->runnable_avg_period + 1);
+ usage_contrib -= cfs_rq->tg_usage_contrib;
+
+ /*
+ * contrib/usage at this point represent deltas, only update if they
+ * are substantive.
+ */
+ if ((abs(contrib) > cfs_rq->tg_runnable_contrib / 64) ||
+ (abs(usage_contrib) > cfs_rq->tg_usage_contrib / 64)) {
+ atomic_add(contrib, &tg->runnable_avg);
+ cfs_rq->tg_runnable_contrib += contrib;
+
+ atomic_add(usage_contrib, &tg->usage_avg);
+ cfs_rq->tg_usage_contrib += usage_contrib;
+ }
+}
+
+static inline void __update_group_entity_contrib(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
+ struct task_group *tg = cfs_rq->tg;
+ int runnable_avg;
+
+ u64 contrib;
+
+ contrib = cfs_rq->tg_load_contrib * tg->shares;
+ se->avg.load_avg_contrib = div64_u64(contrib,
+ atomic64_read(&tg->load_avg) + 1);
+
+ /*
+ * For group entities we need to compute a correction term in the case
+ * that they are consuming <1 cpu so that we would contribute the same
+ * load as a task of equal weight.
+ *
+ * Explicitly co-ordinating this measurement would be expensive, but
+ * fortunately the sum of each cpus contribution forms a usable
+ * lower-bound on the true value.
+ *
+ * Consider the aggregate of 2 contributions. Either they are disjoint
+ * (and the sum represents true value) or they are disjoint and we are
+ * understating by the aggregate of their overlap.
+ *
+ * Extending this to N cpus, for a given overlap, the maximum amount we
+ * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
+ * cpus that overlap for this interval and w_i is the interval width.
+ *
+ * On a small machine; the first term is well-bounded which bounds the
+ * total error since w_i is a subset of the period. Whereas on a
+ * larger machine, while this first term can be larger, if w_i is the
+ * of consequential size guaranteed to see n_i*w_i quickly converge to
+ * our upper bound of 1-cpu.
+ */
+ runnable_avg = atomic_read(&tg->runnable_avg);
+ if (runnable_avg < NICE_0_LOAD) {
+ se->avg.load_avg_contrib *= runnable_avg;
+ se->avg.load_avg_contrib >>= NICE_0_SHIFT;
+ }
+}
+#else
+static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
+ int force_update) {}
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+ struct cfs_rq *cfs_rq) {}
+static inline void __update_group_entity_contrib(struct sched_entity *se) {}
+#endif
+
+static inline void __update_task_entity_contrib(struct sched_entity *se)
+{
+ u32 contrib;
+
+ /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
+ contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
+ contrib /= (se->avg.runnable_avg_period + 1);
+ se->avg.load_avg_contrib = scale_load(contrib);
+ trace_sched_task_load_contrib(task_of(se), se->avg.load_avg_contrib);
+ contrib = se->avg.runnable_avg_sum * scale_load_down(NICE_0_LOAD);
+ contrib /= (se->avg.runnable_avg_period + 1);
+ se->avg.load_avg_ratio = scale_load(contrib);
+ trace_sched_task_runnable_ratio(task_of(se), se->avg.load_avg_ratio);
+}
+
+/* Compute the current contribution to load_avg by se, return any delta */
+static long __update_entity_load_avg_contrib(struct sched_entity *se)
+{
+ long old_contrib = se->avg.load_avg_contrib;
+
+ if (entity_is_task(se)) {
+ __update_task_entity_contrib(se);
+ } else {
+ __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
+ __update_group_entity_contrib(se);
+ }
+
+ return se->avg.load_avg_contrib - old_contrib;
+}
+
+static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
+ long load_contrib)
+{
+ if (likely(load_contrib < cfs_rq->blocked_load_avg))
+ cfs_rq->blocked_load_avg -= load_contrib;
+ else
+ cfs_rq->blocked_load_avg = 0;
+}
+
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+
+/* Update a sched_entity's runnable average */
+static inline void update_entity_load_avg(struct sched_entity *se,
+ int update_cfs_rq)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ long contrib_delta;
+ u64 now;
+ int cpu = -1; /* not used in normal case */
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ cpu = cfs_rq->rq->cpu;
+#endif
+ /*
+ * For a group entity we need to use their owned cfs_rq_clock_task() in
+ * case they are the parent of a throttled hierarchy.
+ */
+ if (entity_is_task(se))
+ now = cfs_rq_clock_task(cfs_rq);
+ else
+ now = cfs_rq_clock_task(group_cfs_rq(se));
+
+ if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq,
+ cfs_rq->curr == se, cpu))
+ return;
+
+ contrib_delta = __update_entity_load_avg_contrib(se);
+
+ if (!update_cfs_rq)
+ return;
+
+ if (se->on_rq)
+ cfs_rq->runnable_load_avg += contrib_delta;
+ else
+ subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+}
+
+/*
+ * Decay the load contributed by all blocked children and account this so that
+ * their contribution may appropriately discounted when they wake up.
+ */
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
+{
+ u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
+ u64 decays;
+
+ decays = now - cfs_rq->last_decay;
+ if (!decays && !force_update)
+ return;
+
+ if (atomic64_read(&cfs_rq->removed_load)) {
+ u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
+ subtract_blocked_load_contrib(cfs_rq, removed_load);
+ }
+
+ if (decays) {
+ cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
+ decays);
+ atomic64_add(decays, &cfs_rq->decay_counter);
+ cfs_rq->last_decay = now;
+ }
+
+ __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
+ update_cfs_shares(cfs_rq);
+}
+
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
+{
+ u32 contrib;
+ int cpu = -1; /* not used in normal case */
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ cpu = rq->cpu;
+#endif
+ __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable,
+ runnable, cpu);
+ __update_tg_runnable_avg(&rq->avg, &rq->cfs);
+ contrib = rq->avg.runnable_avg_sum * scale_load_down(1024);
+ contrib /= (rq->avg.runnable_avg_period + 1);
+ trace_sched_rq_runnable_ratio(cpu_of(rq), scale_load(contrib));
+ trace_sched_rq_runnable_load(cpu_of(rq), rq->cfs.runnable_load_avg);
+}
+
+/* Add the load generated by se into cfs_rq's child load-average */
+static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int wakeup)
+{
+ /*
+ * We track migrations using entity decay_count <= 0, on a wake-up
+ * migration we use a negative decay count to track the remote decays
+ * accumulated while sleeping.
+ */
+ if (unlikely(se->avg.decay_count <= 0)) {
+ se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
+ if (se->avg.decay_count) {
+ /*
+ * In a wake-up migration we have to approximate the
+ * time sleeping. This is because we can't synchronize
+ * clock_task between the two cpus, and it is not
+ * guaranteed to be read-safe. Instead, we can
+ * approximate this using our carried decays, which are
+ * explicitly atomically readable.
+ */
+ se->avg.last_runnable_update -= (-se->avg.decay_count)
+ << 20;
+ update_entity_load_avg(se, 0);
+ /* Indicate that we're now synchronized and on-rq */
+ se->avg.decay_count = 0;
+ }
+ wakeup = 0;
+ } else {
+ __synchronize_entity_decay(se);
+ }
+
+ /* migrated tasks did not contribute to our blocked load */
+ if (wakeup) {
+ subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
+ update_entity_load_avg(se, 0);
+ }
+
+ cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+ /* we force update consideration on load-balancer moves */
+ update_cfs_rq_blocked_load(cfs_rq, !wakeup);
+}
+
+/*
+ * Remove se's load from this cfs_rq child load-average, if the entity is
+ * transitioning to a blocked state we track its projected decay using
+ * blocked_load_avg.
+ */
+static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int sleep)
+{
+ update_entity_load_avg(se, 1);
+ /* we force update consideration on load-balancer moves */
+ update_cfs_rq_blocked_load(cfs_rq, !sleep);
+
+ cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
+ if (sleep) {
+ cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
+ se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+ } /* migrations, e.g. sleep=0 leave decay_count == 0 */
+}
+#else
+static inline void update_entity_load_avg(struct sched_entity *se,
+ int update_cfs_rq) {}
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
+static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int wakeup) {}
+static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int sleep) {}
+static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
+ int force_update) {}
+#endif
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -1096,9 +1731,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
- update_cfs_load(cfs_rq, 0);
account_entity_enqueue(cfs_rq, se);
- update_cfs_shares(cfs_rq);
+ enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
@@ -1190,9 +1824,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
- se->on_rq = 0;
- update_cfs_load(cfs_rq, 0);
account_entity_dequeue(cfs_rq, se);
+ dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
/*
* Normalize the entity after updating the min_vruntime because the
@@ -1206,7 +1839,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
return_cfs_rq_runtime(cfs_rq);
update_min_vruntime(cfs_rq);
- update_cfs_shares(cfs_rq);
+ se->on_rq = 0;
}
/*
@@ -1261,6 +1894,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
+ update_entity_load_avg(se, 1);
}
update_stats_curr_start(cfs_rq, se);
@@ -1340,6 +1974,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
+ /* in !on_rq case, update occurred at dequeue */
+ update_entity_load_avg(prev, 1);
}
cfs_rq->curr = NULL;
}
@@ -1353,9 +1989,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
update_curr(cfs_rq);
/*
- * Update share accounting for long-running entities.
+ * Ensure that runnable average is periodically updated.
*/
- update_entity_shares_tick(cfs_rq);
+ update_entity_load_avg(curr, 1);
+ update_cfs_rq_blocked_load(cfs_rq, 1);
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -1448,6 +2085,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
return &tg->cfs_bandwidth;
}
+/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
+{
+ if (unlikely(cfs_rq->throttle_count))
+ return cfs_rq->throttled_clock_task;
+
+ return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
+}
+
/* returns 0 on failure to allocate runtime */
static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
@@ -1592,14 +2238,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
cfs_rq->throttle_count--;
#ifdef CONFIG_SMP
if (!cfs_rq->throttle_count) {
- u64 delta = rq->clock_task - cfs_rq->load_stamp;
-
- /* leaving throttled state, advance shares averaging windows */
- cfs_rq->load_stamp += delta;
- cfs_rq->load_last += delta;
-
- /* update entity weight now that we are on_rq again */
- update_cfs_shares(cfs_rq);
+ /* adjust cfs_rq_clock_task() */
+ cfs_rq->throttled_clock_task_time += rq->clock_task -
+ cfs_rq->throttled_clock_task;
}
#endif
@@ -1611,9 +2252,9 @@ static int tg_throttle_down(struct task_group *tg, void *data)
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
- /* group is entering throttled state, record last load */
+ /* group is entering throttled state, stop time */
if (!cfs_rq->throttle_count)
- update_cfs_load(cfs_rq, 0);
+ cfs_rq->throttled_clock_task = rq->clock_task;
cfs_rq->throttle_count++;
return 0;
@@ -1628,7 +2269,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
- /* account load preceding throttle */
+ /* freeze hierarchy runnable averages while throttled */
rcu_read_lock();
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
@@ -1652,7 +2293,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
rq->nr_running -= task_delta;
cfs_rq->throttled = 1;
- cfs_rq->throttled_timestamp = rq->clock;
+ cfs_rq->throttled_clock = rq->clock;
raw_spin_lock(&cfs_b->lock);
list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
raw_spin_unlock(&cfs_b->lock);
@@ -1670,10 +2311,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->throttled = 0;
raw_spin_lock(&cfs_b->lock);
- cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
+ cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
list_del_rcu(&cfs_rq->throttled_list);
raw_spin_unlock(&cfs_b->lock);
- cfs_rq->throttled_timestamp = 0;
update_rq_clock(rq);
/* update hierarchical throttle state */
@@ -2073,8 +2713,13 @@ static void unthrottle_offline_cfs_rqs(struct rq *rq)
}
#else /* CONFIG_CFS_BANDWIDTH */
-static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {}
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
+{
+ return rq_of(cfs_rq)->clock_task;
+}
+
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+ unsigned long delta_exec) {}
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -2207,12 +2852,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_cfs_load(cfs_rq, 0);
- update_cfs_shares(cfs_rq);
+ update_entity_load_avg(se, 1);
+ update_cfs_rq_blocked_load(cfs_rq, 0);
}
- if (!se)
+ if (!se) {
+ update_rq_runnable_avg(rq, rq->nr_running);
inc_nr_running(rq);
+ }
hrtick_update(rq);
}
@@ -2266,12 +2913,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_cfs_load(cfs_rq, 0);
- update_cfs_shares(cfs_rq);
+ update_entity_load_avg(se, 1);
+ update_cfs_rq_blocked_load(cfs_rq, 0);
}
- if (!se)
+ if (!se) {
dec_nr_running(rq);
+ update_rq_runnable_avg(rq, 1);
+ }
hrtick_update(rq);
}
@@ -2681,6 +3330,362 @@ done:
return target;
}
+#ifdef CONFIG_SCHED_HMP
+/*
+ * Heterogenous multiprocessor (HMP) optimizations
+ *
+ * The cpu types are distinguished using a list of hmp_domains
+ * which each represent one cpu type using a cpumask.
+ * The list is assumed ordered by compute capacity with the
+ * fastest domain first.
+ */
+DEFINE_PER_CPU(struct hmp_domain *, hmp_cpu_domain);
+
+extern void __init arch_get_hmp_domains(struct list_head *hmp_domains_list);
+
+/* Setup hmp_domains */
+static int __init hmp_cpu_mask_setup(void)
+{
+ char buf[64];
+ struct hmp_domain *domain;
+ struct list_head *pos;
+ int dc, cpu;
+
+ pr_debug("Initializing HMP scheduler:\n");
+
+ /* Initialize hmp_domains using platform code */
+ arch_get_hmp_domains(&hmp_domains);
+ if (list_empty(&hmp_domains)) {
+ pr_debug("HMP domain list is empty!\n");
+ return 0;
+ }
+
+ /* Print hmp_domains */
+ dc = 0;
+ list_for_each(pos, &hmp_domains) {
+ domain = list_entry(pos, struct hmp_domain, hmp_domains);
+ cpulist_scnprintf(buf, 64, &domain->cpus);
+ pr_debug(" HMP domain %d: %s\n", dc, buf);
+
+ for_each_cpu_mask(cpu, domain->cpus) {
+ per_cpu(hmp_cpu_domain, cpu) = domain;
+ }
+ dc++;
+ }
+
+ return 1;
+}
+
+/*
+ * Migration thresholds should be in the range [0..1023]
+ * hmp_up_threshold: min. load required for migrating tasks to a faster cpu
+ * hmp_down_threshold: max. load allowed for tasks migrating to a slower cpu
+ * The default values (512, 256) offer good responsiveness, but may need
+ * tweaking suit particular needs.
+ *
+ * hmp_up_prio: Only up migrate task with high priority (<hmp_up_prio)
+ * hmp_next_up_threshold: Delay before next up migration (1024 ~= 1 ms)
+ * hmp_next_down_threshold: Delay before next down migration (1024 ~= 1 ms)
+ */
+unsigned int hmp_up_threshold = 512;
+unsigned int hmp_down_threshold = 256;
+#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
+unsigned int hmp_up_prio = NICE_TO_PRIO(CONFIG_SCHED_HMP_PRIO_FILTER_VAL);
+#endif
+unsigned int hmp_next_up_threshold = 4096;
+unsigned int hmp_next_down_threshold = 4096;
+
+static unsigned int hmp_up_migration(int cpu, struct sched_entity *se);
+static unsigned int hmp_down_migration(int cpu, struct sched_entity *se);
+
+/* Check if cpu is in fastest hmp_domain */
+static inline unsigned int hmp_cpu_is_fastest(int cpu)
+{
+ struct list_head *pos;
+
+ pos = &hmp_cpu_domain(cpu)->hmp_domains;
+ return pos == hmp_domains.next;
+}
+
+/* Check if cpu is in slowest hmp_domain */
+static inline unsigned int hmp_cpu_is_slowest(int cpu)
+{
+ struct list_head *pos;
+
+ pos = &hmp_cpu_domain(cpu)->hmp_domains;
+ return list_is_last(pos, &hmp_domains);
+}
+
+/* Next (slower) hmp_domain relative to cpu */
+static inline struct hmp_domain *hmp_slower_domain(int cpu)
+{
+ struct list_head *pos;
+
+ pos = &hmp_cpu_domain(cpu)->hmp_domains;
+ return list_entry(pos->next, struct hmp_domain, hmp_domains);
+}
+
+/* Previous (faster) hmp_domain relative to cpu */
+static inline struct hmp_domain *hmp_faster_domain(int cpu)
+{
+ struct list_head *pos;
+
+ pos = &hmp_cpu_domain(cpu)->hmp_domains;
+ return list_entry(pos->prev, struct hmp_domain, hmp_domains);
+}
+
+/*
+ * Selects a cpu in previous (faster) hmp_domain
+ * Note that cpumask_any_and() returns the first cpu in the cpumask
+ */
+static inline unsigned int hmp_select_faster_cpu(struct task_struct *tsk,
+ int cpu)
+{
+ return cpumask_any_and(&hmp_faster_domain(cpu)->cpus,
+ tsk_cpus_allowed(tsk));
+}
+
+/*
+ * Selects a cpu in next (slower) hmp_domain
+ * Note that cpumask_any_and() returns the first cpu in the cpumask
+ */
+static inline unsigned int hmp_select_slower_cpu(struct task_struct *tsk,
+ int cpu)
+{
+ return cpumask_any_and(&hmp_slower_domain(cpu)->cpus,
+ tsk_cpus_allowed(tsk));
+}
+
+static inline void hmp_next_up_delay(struct sched_entity *se, int cpu)
+{
+ struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
+
+ se->avg.hmp_last_up_migration = cfs_rq_clock_task(cfs_rq);
+ se->avg.hmp_last_down_migration = 0;
+}
+
+static inline void hmp_next_down_delay(struct sched_entity *se, int cpu)
+{
+ struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
+
+ se->avg.hmp_last_down_migration = cfs_rq_clock_task(cfs_rq);
+ se->avg.hmp_last_up_migration = 0;
+}
+
+#ifdef CONFIG_HMP_VARIABLE_SCALE
+/*
+ * Heterogenous multiprocessor (HMP) optimizations
+ *
+ * These functions allow to change the growing speed of the load_avg_ratio
+ * by default it goes from 0 to 0.5 in LOAD_AVG_PERIOD = 32ms
+ * This can now be changed with /sys/kernel/hmp/load_avg_period_ms.
+ *
+ * These functions also allow to change the up and down threshold of HMP
+ * using /sys/kernel/hmp/{up,down}_threshold.
+ * Both must be between 0 and 1023. The threshold that is compared
+ * to the load_avg_ratio is up_threshold/1024 and down_threshold/1024.
+ *
+ * For instance, if load_avg_period = 64 and up_threshold = 512, an idle
+ * task with a load of 0 will reach the threshold after 64ms of busy loop.
+ *
+ * Changing load_avg_periods_ms has the same effect than changing the
+ * default scaling factor Y=1002/1024 in the load_avg_ratio computation to
+ * (1002/1024.0)^(LOAD_AVG_PERIOD/load_avg_period_ms), but the last one
+ * could trigger overflows.
+ * For instance, with Y = 1023/1024 in __update_task_entity_contrib()
+ * "contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);"
+ * could be overflowed for a weight > 2^12 even is the load_avg_contrib
+ * should still be a 32bits result. This would not happen by multiplicating
+ * delta time by 1/22 and setting load_avg_period_ms = 706.
+ */
+
+/*
+ * By scaling the delta time it end-up increasing or decrease the
+ * growing speed of the per entity load_avg_ratio
+ * The scale factor hmp_data.multiplier is a fixed point
+ * number: (32-HMP_VARIABLE_SCALE_SHIFT).HMP_VARIABLE_SCALE_SHIFT
+ */
+static u64 hmp_variable_scale_convert(u64 delta)
+{
+ u64 high = delta >> 32ULL;
+ u64 low = delta & 0xffffffffULL;
+ low *= hmp_data.multiplier;
+ high *= hmp_data.multiplier;
+ return (low >> HMP_VARIABLE_SCALE_SHIFT)
+ + (high << (32ULL - HMP_VARIABLE_SCALE_SHIFT));
+}
+
+static ssize_t hmp_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ ssize_t ret = 0;
+ struct hmp_global_attr *hmp_attr =
+ container_of(attr, struct hmp_global_attr, attr);
+ int temp = *(hmp_attr->value);
+ if (hmp_attr->to_sysfs != NULL)
+ temp = hmp_attr->to_sysfs(temp);
+ ret = sprintf(buf, "%d\n", temp);
+ return ret;
+}
+
+static ssize_t hmp_store(struct kobject *a, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ int temp;
+ ssize_t ret = count;
+ struct hmp_global_attr *hmp_attr =
+ container_of(attr, struct hmp_global_attr, attr);
+ char *str = vmalloc(count + 1);
+ if (str == NULL)
+ return -ENOMEM;
+ memcpy(str, buf, count);
+ str[count] = 0;
+ if (sscanf(str, "%d", &temp) < 1)
+ ret = -EINVAL;
+ else {
+ if (hmp_attr->from_sysfs != NULL)
+ temp = hmp_attr->from_sysfs(temp);
+ if (temp < 0)
+ ret = -EINVAL;
+ else
+ *(hmp_attr->value) = temp;
+ }
+ vfree(str);
+ return ret;
+}
+
+static int hmp_period_tofrom_sysfs(int value)
+{
+ return (LOAD_AVG_PERIOD << HMP_VARIABLE_SCALE_SHIFT) / value;
+}
+
+/* max value for threshold is 1024 */
+static int hmp_theshold_from_sysfs(int value)
+{
+ if (value > 1024)
+ return -1;
+ return value;
+}
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/* freqinvar control is only 0,1 off/on */
+static int hmp_freqinvar_from_sysfs(int value)
+{
+ if (value < 0 || value > 1)
+ return -1;
+ return value;
+}
+#endif
+static void hmp_attr_add(
+ const char *name,
+ int *value,
+ int (*to_sysfs)(int),
+ int (*from_sysfs)(int))
+{
+ int i = 0;
+ while (hmp_data.attributes[i] != NULL) {
+ i++;
+ if (i >= HMP_DATA_SYSFS_MAX)
+ return;
+ }
+ hmp_data.attr[i].attr.mode = 0644;
+ hmp_data.attr[i].show = hmp_show;
+ hmp_data.attr[i].store = hmp_store;
+ hmp_data.attr[i].attr.name = name;
+ hmp_data.attr[i].value = value;
+ hmp_data.attr[i].to_sysfs = to_sysfs;
+ hmp_data.attr[i].from_sysfs = from_sysfs;
+ hmp_data.attributes[i] = &hmp_data.attr[i].attr;
+ hmp_data.attributes[i + 1] = NULL;
+}
+
+static int hmp_attr_init(void)
+{
+ int ret;
+ memset(&hmp_data, sizeof(hmp_data), 0);
+ /* by default load_avg_period_ms == LOAD_AVG_PERIOD
+ * meaning no change
+ */
+ hmp_data.multiplier = hmp_period_tofrom_sysfs(LOAD_AVG_PERIOD);
+
+ hmp_attr_add("load_avg_period_ms",
+ &hmp_data.multiplier,
+ hmp_period_tofrom_sysfs,
+ hmp_period_tofrom_sysfs);
+ hmp_attr_add("up_threshold",
+ &hmp_up_threshold,
+ NULL,
+ hmp_theshold_from_sysfs);
+ hmp_attr_add("down_threshold",
+ &hmp_down_threshold,
+ NULL,
+ hmp_theshold_from_sysfs);
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ /* default frequency-invariant scaling ON */
+ hmp_data.freqinvar_load_scale_enabled = 1;
+ hmp_attr_add("frequency_invariant_load_scale",
+ &hmp_data.freqinvar_load_scale_enabled,
+ NULL,
+ hmp_freqinvar_from_sysfs);
+#endif
+ hmp_data.attr_group.name = "hmp";
+ hmp_data.attr_group.attrs = hmp_data.attributes;
+ ret = sysfs_create_group(kernel_kobj,
+ &hmp_data.attr_group);
+ return 0;
+}
+late_initcall(hmp_attr_init);
+#endif /* CONFIG_HMP_VARIABLE_SCALE */
+#endif /* CONFIG_SCHED_HMP */
+
+static inline bool is_buddy_busy(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ /*
+ * A busy buddy is a CPU with a high load or a small load with a lot of
+ * running tasks.
+ */
+ return ((rq->avg.usage_avg_sum << rq->nr_running) >
+ rq->avg.runnable_avg_period);
+}
+
+static inline bool is_light_task(struct task_struct *p)
+{
+ /* A light task runs less than 25% in average */
+ return ((p->se.avg.usage_avg_sum << 2) < p->se.avg.runnable_avg_period);
+}
+
+static int check_pack_buddy(int cpu, struct task_struct *p)
+{
+ int buddy = per_cpu(sd_pack_buddy, cpu);
+
+ /* No pack buddy for this CPU */
+ if (buddy == -1)
+ return false;
+
+ /*
+ * If a task is waiting for running on the CPU which is its own buddy,
+ * let the default behavior to look for a better CPU if available
+ * The threshold has been set to 37.5%
+ */
+ if ((buddy == cpu)
+ && ((p->se.avg.usage_avg_sum << 3) < (p->se.avg.runnable_avg_sum * 5)))
+ return false;
+
+ /* buddy is not an allowed CPU */
+ if (!cpumask_test_cpu(buddy, tsk_cpus_allowed(p)))
+ return false;
+
+ /*
+ * If the task is a small one and the buddy is not overloaded,
+ * we use buddy cpu
+ */
+ if (!is_light_task(p) || is_buddy_busy(buddy))
+ return false;
+
+ return true;
+}
+
/*
* sched_balance_self: balance the current task (running on cpu) in domains
* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
@@ -2705,6 +3710,9 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
if (p->nr_cpus_allowed == 1)
return prev_cpu;
+ if (check_pack_buddy(cpu, p))
+ return per_cpu(sd_pack_buddy, cpu);
+
if (sd_flag & SD_BALANCE_WAKE) {
if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
want_affine = 1;
@@ -2779,8 +3787,50 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
unlock:
rcu_read_unlock();
+#ifdef CONFIG_SCHED_HMP
+ if (hmp_up_migration(prev_cpu, &p->se)) {
+ new_cpu = hmp_select_faster_cpu(p, prev_cpu);
+ hmp_next_up_delay(&p->se, new_cpu);
+ trace_sched_hmp_migrate(p, new_cpu, 0);
+ return new_cpu;
+ }
+ if (hmp_down_migration(prev_cpu, &p->se)) {
+ new_cpu = hmp_select_slower_cpu(p, prev_cpu);
+ hmp_next_down_delay(&p->se, new_cpu);
+ trace_sched_hmp_migrate(p, new_cpu, 0);
+ return new_cpu;
+ }
+ /* Make sure that the task stays in its previous hmp domain */
+ if (!cpumask_test_cpu(new_cpu, &hmp_cpu_domain(prev_cpu)->cpus))
+ return prev_cpu;
+#endif
+
return new_cpu;
}
+
+/*
+ * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
+ * cfs_rq_of(p) references at time of call are still valid and identify the
+ * previous cpu. However, the caller only guarantees p->pi_lock is held; no
+ * other assumptions, including the state of rq->lock, should be made.
+ */
+static void
+migrate_task_rq_fair(struct task_struct *p, int next_cpu)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ /*
+ * Load tracking: accumulate removed load so that it can be processed
+ * when we next update owning cfs_rq under rq->lock. Tasks contribute
+ * to blocked load iff they have a positive decay-count. It can never
+ * be negative here since on-rq tasks have decay-count == 0.
+ */
+ if (se->avg.decay_count) {
+ se->avg.decay_count = -__synchronize_entity_decay(se);
+ atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
+ }
+}
#endif /* CONFIG_SMP */
static unsigned long
@@ -3033,8 +4083,122 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
#ifdef CONFIG_SMP
/**************************************************
- * Fair scheduling class load-balancing methods:
- */
+ * Fair scheduling class load-balancing methods.
+ *
+ * BASICS
+ *
+ * The purpose of load-balancing is to achieve the same basic fairness the
+ * per-cpu scheduler provides, namely provide a proportional amount of compute
+ * time to each task. This is expressed in the following equation:
+ *
+ * W_i,n/P_i == W_j,n/P_j for all i,j (1)
+ *
+ * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
+ * W_i,0 is defined as:
+ *
+ * W_i,0 = \Sum_j w_i,j (2)
+ *
+ * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
+ * is derived from the nice value as per prio_to_weight[].
+ *
+ * The weight average is an exponential decay average of the instantaneous
+ * weight:
+ *
+ * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
+ *
+ * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
+ * fraction of 'recent' time available for SCHED_OTHER task execution. But it
+ * can also include other factors [XXX].
+ *
+ * To achieve this balance we define a measure of imbalance which follows
+ * directly from (1):
+ *
+ * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
+ *
+ * We them move tasks around to minimize the imbalance. In the continuous
+ * function space it is obvious this converges, in the discrete case we get
+ * a few fun cases generally called infeasible weight scenarios.
+ *
+ * [XXX expand on:
+ * - infeasible weights;
+ * - local vs global optima in the discrete case. ]
+ *
+ *
+ * SCHED DOMAINS
+ *
+ * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
+ * for all i,j solution, we create a tree of cpus that follows the hardware
+ * topology where each level pairs two lower groups (or better). This results
+ * in O(log n) layers. Furthermore we reduce the number of cpus going up the
+ * tree to only the first of the previous level and we decrease the frequency
+ * of load-balance at each level inv. proportional to the number of cpus in
+ * the groups.
+ *
+ * This yields:
+ *
+ * log_2 n 1 n
+ * \Sum { --- * --- * 2^i } = O(n) (5)
+ * i = 0 2^i 2^i
+ * `- size of each group
+ * | | `- number of cpus doing load-balance
+ * | `- freq
+ * `- sum over all levels
+ *
+ * Coupled with a limit on how many tasks we can migrate every balance pass,
+ * this makes (5) the runtime complexity of the balancer.
+ *
+ * An important property here is that each CPU is still (indirectly) connected
+ * to every other cpu in at most O(log n) steps:
+ *
+ * The adjacency matrix of the resulting graph is given by:
+ *
+ * log_2 n
+ * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
+ * k = 0
+ *
+ * And you'll find that:
+ *
+ * A^(log_2 n)_i,j != 0 for all i,j (7)
+ *
+ * Showing there's indeed a path between every cpu in at most O(log n) steps.
+ * The task movement gives a factor of O(m), giving a convergence complexity
+ * of:
+ *
+ * O(nm log n), n := nr_cpus, m := nr_tasks (8)
+ *
+ *
+ * WORK CONSERVING
+ *
+ * In order to avoid CPUs going idle while there's still work to do, new idle
+ * balancing is more aggressive and has the newly idle cpu iterate up the domain
+ * tree itself instead of relying on other CPUs to bring it work.
+ *
+ * This adds some complexity to both (5) and (8) but it reduces the total idle
+ * time.
+ *
+ * [XXX more?]
+ *
+ *
+ * CGROUPS
+ *
+ * Cgroups make a horror show out of (2), instead of a simple sum we get:
+ *
+ * s_k,i
+ * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
+ * S_k
+ *
+ * Where
+ *
+ * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
+ *
+ * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
+ *
+ * The big problem is S_k, its a global sum needed to compute a local (W_i)
+ * property.
+ *
+ * [XXX write more on how we solve this.. _after_ merging pjt's patches that
+ * rewrite all of this once again.]
+ */
static unsigned long __read_mostly max_load_balance_interval = HZ/10;
@@ -3160,7 +4324,6 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* 1) task is cache cold, or
* 2) too many balance attempts have failed.
*/
-
tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
if (!tsk_cache_hot ||
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
@@ -3300,52 +4463,58 @@ next:
/*
* update tg->load_weight by folding this cpu's load_avg
*/
-static int update_shares_cpu(struct task_group *tg, int cpu)
+static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
{
- struct cfs_rq *cfs_rq;
- unsigned long flags;
- struct rq *rq;
-
- if (!tg->se[cpu])
- return 0;
-
- rq = cpu_rq(cpu);
- cfs_rq = tg->cfs_rq[cpu];
-
- raw_spin_lock_irqsave(&rq->lock, flags);
-
- update_rq_clock(rq);
- update_cfs_load(cfs_rq, 1);
+ struct sched_entity *se = tg->se[cpu];
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
- /*
- * We need to update shares after updating tg->load_weight in
- * order to adjust the weight of groups with long running tasks.
- */
- update_cfs_shares(cfs_rq);
+ /* throttled entities do not contribute to load */
+ if (throttled_hierarchy(cfs_rq))
+ return;
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ update_cfs_rq_blocked_load(cfs_rq, 1);
- return 0;
+ if (se) {
+ update_entity_load_avg(se, 1);
+ /*
+ * We pivot on our runnable average having decayed to zero for
+ * list removal. This generally implies that all our children
+ * have also been removed (modulo rounding error or bandwidth
+ * control); however, such cases are rare and we can fix these
+ * at enqueue.
+ *
+ * TODO: fix up out-of-order children on enqueue.
+ */
+ if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
+ list_del_leaf_cfs_rq(cfs_rq);
+ } else {
+ struct rq *rq = rq_of(cfs_rq);
+ update_rq_runnable_avg(rq, rq->nr_running);
+ }
}
-static void update_shares(int cpu)
+static void update_blocked_averages(int cpu)
{
- struct cfs_rq *cfs_rq;
struct rq *rq = cpu_rq(cpu);
+ struct cfs_rq *cfs_rq;
+ unsigned long flags;
- rcu_read_lock();
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ update_rq_clock(rq);
/*
* Iterates the task_group tree in a bottom up fashion, see
* list_add_leaf_cfs_rq() for details.
*/
for_each_leaf_cfs_rq(rq, cfs_rq) {
- /* throttled entities do not contribute to load */
- if (throttled_hierarchy(cfs_rq))
- continue;
-
- update_shares_cpu(cfs_rq->tg, cpu);
+ /*
+ * Note: We may want to consider periodically releasing
+ * rq->lock about these updates so that creating many task
+ * groups does not result in continually extending hold time.
+ */
+ __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
}
- rcu_read_unlock();
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
}
/*
@@ -3397,7 +4566,7 @@ static unsigned long task_h_load(struct task_struct *p)
return load;
}
#else
-static inline void update_shares(int cpu)
+static inline void update_blocked_averages(int cpu)
{
}
@@ -4457,12 +5626,14 @@ void idle_balance(int this_cpu, struct rq *this_rq)
if (this_rq->avg_idle < sysctl_sched_migration_cost)
return;
+ update_rq_runnable_avg(this_rq, 1);
+
/*
* Drop the rq->lock, but keep IRQ/preempt disabled.
*/
raw_spin_unlock(&this_rq->lock);
- update_shares(this_cpu);
+ update_blocked_averages(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
unsigned long interval;
@@ -4581,7 +5752,25 @@ static struct {
static inline int find_new_ilb(int call_cpu)
{
+ struct sched_domain *sd;
int ilb = cpumask_first(nohz.idle_cpus_mask);
+ int buddy = per_cpu(sd_pack_buddy, call_cpu);
+
+ /*
+ * If we have a pack buddy CPU, we try to run load balance on a CPU
+ * that is close to the buddy.
+ */
+ if (buddy != -1)
+ for_each_domain(buddy, sd) {
+ if (sd->flags & SD_SHARE_CPUPOWER)
+ continue;
+
+ ilb = cpumask_first_and(sched_domain_span(sd),
+ nohz.idle_cpus_mask);
+
+ if (ilb < nr_cpu_ids)
+ break;
+ }
if (ilb < nr_cpu_ids && idle_cpu(ilb))
return ilb;
@@ -4717,7 +5906,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
int update_next_balance = 0;
int need_serialize;
- update_shares(cpu);
+ update_blocked_averages(cpu);
rcu_read_lock();
for_each_domain(cpu, sd) {
@@ -4886,6 +6075,267 @@ need_kick:
static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
#endif
+#ifdef CONFIG_SCHED_HMP
+/* Check if task should migrate to a faster cpu */
+static unsigned int hmp_up_migration(int cpu, struct sched_entity *se)
+{
+ struct task_struct *p = task_of(se);
+ struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
+ u64 now;
+
+ if (hmp_cpu_is_fastest(cpu))
+ return 0;
+
+#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
+ /* Filter by task priority */
+ if (p->prio >= hmp_up_prio)
+ return 0;
+#endif
+
+ /* Let the task load settle before doing another up migration */
+ now = cfs_rq_clock_task(cfs_rq);
+ if (((now - se->avg.hmp_last_up_migration) >> 10)
+ < hmp_next_up_threshold)
+ return 0;
+
+ if (cpumask_intersects(&hmp_faster_domain(cpu)->cpus,
+ tsk_cpus_allowed(p))
+ && se->avg.load_avg_ratio > hmp_up_threshold) {
+ return 1;
+ }
+ return 0;
+}
+
+/* Check if task should migrate to a slower cpu */
+static unsigned int hmp_down_migration(int cpu, struct sched_entity *se)
+{
+ struct task_struct *p = task_of(se);
+ struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
+ u64 now;
+
+ if (hmp_cpu_is_slowest(cpu))
+ return 0;
+
+#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
+ /* Filter by task priority */
+ if ((p->prio >= hmp_up_prio) &&
+ cpumask_intersects(&hmp_slower_domain(cpu)->cpus,
+ tsk_cpus_allowed(p))) {
+ return 1;
+ }
+#endif
+
+ /* Let the task load settle before doing another down migration */
+ now = cfs_rq_clock_task(cfs_rq);
+ if (((now - se->avg.hmp_last_down_migration) >> 10)
+ < hmp_next_down_threshold)
+ return 0;
+
+ if (cpumask_intersects(&hmp_slower_domain(cpu)->cpus,
+ tsk_cpus_allowed(p))
+ && se->avg.load_avg_ratio < hmp_down_threshold) {
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * hmp_can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
+ * Ideally this function should be merged with can_migrate_task() to avoid
+ * redundant code.
+ */
+static int hmp_can_migrate_task(struct task_struct *p, struct lb_env *env)
+{
+ int tsk_cache_hot = 0;
+
+ /*
+ * We do not migrate tasks that are:
+ * 1) running (obviously), or
+ * 2) cannot be migrated to this CPU due to cpus_allowed
+ */
+ if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+ schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+ return 0;
+ }
+ env->flags &= ~LBF_ALL_PINNED;
+
+ if (task_running(env->src_rq, p)) {
+ schedstat_inc(p, se.statistics.nr_failed_migrations_running);
+ return 0;
+ }
+
+ /*
+ * Aggressive migration if:
+ * 1) task is cache cold, or
+ * 2) too many balance attempts have failed.
+ */
+
+ tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
+ if (!tsk_cache_hot ||
+ env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
+#ifdef CONFIG_SCHEDSTATS
+ if (tsk_cache_hot) {
+ schedstat_inc(env->sd, lb_hot_gained[env->idle]);
+ schedstat_inc(p, se.statistics.nr_forced_migrations);
+ }
+#endif
+ return 1;
+ }
+
+ return 1;
+}
+
+/*
+ * move_specific_task tries to move a specific task.
+ * Returns 1 if successful and 0 otherwise.
+ * Called with both runqueues locked.
+ */
+static int move_specific_task(struct lb_env *env, struct task_struct *pm)
+{
+ struct task_struct *p, *n;
+
+ list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+ if (throttled_lb_pair(task_group(p), env->src_rq->cpu,
+ env->dst_cpu))
+ continue;
+
+ if (!hmp_can_migrate_task(p, env))
+ continue;
+ /* Check if we found the right task */
+ if (p != pm)
+ continue;
+
+ move_task(p, env);
+ /*
+ * Right now, this is only the third place move_task()
+ * is called, so we can safely collect move_task()
+ * stats here rather than inside move_task().
+ */
+ schedstat_inc(env->sd, lb_gained[env->idle]);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * hmp_active_task_migration_cpu_stop is run by cpu stopper and used to
+ * migrate a specific task from one runqueue to another.
+ * hmp_force_up_migration uses this to push a currently running task
+ * off a runqueue.
+ * Based on active_load_balance_stop_cpu and can potentially be merged.
+ */
+static int hmp_active_task_migration_cpu_stop(void *data)
+{
+ struct rq *busiest_rq = data;
+ struct task_struct *p = busiest_rq->migrate_task;
+ int busiest_cpu = cpu_of(busiest_rq);
+ int target_cpu = busiest_rq->push_cpu;
+ struct rq *target_rq = cpu_rq(target_cpu);
+ struct sched_domain *sd;
+
+ raw_spin_lock_irq(&busiest_rq->lock);
+ /* make sure the requested cpu hasn't gone down in the meantime */
+ if (unlikely(busiest_cpu != smp_processor_id() ||
+ !busiest_rq->active_balance)) {
+ goto out_unlock;
+ }
+ /* Is there any task to move? */
+ if (busiest_rq->nr_running <= 1)
+ goto out_unlock;
+ /* Task has migrated meanwhile, abort forced migration */
+ if (task_rq(p) != busiest_rq)
+ goto out_unlock;
+ /*
+ * This condition is "impossible", if it occurs
+ * we need to fix it. Originally reported by
+ * Bjorn Helgaas on a 128-cpu setup.
+ */
+ BUG_ON(busiest_rq == target_rq);
+
+ /* move a task from busiest_rq to target_rq */
+ double_lock_balance(busiest_rq, target_rq);
+
+ /* Search for an sd spanning us and the target CPU. */
+ rcu_read_lock();
+ for_each_domain(target_cpu, sd) {
+ if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
+ break;
+ }
+
+ if (likely(sd)) {
+ struct lb_env env = {
+ .sd = sd,
+ .dst_cpu = target_cpu,
+ .dst_rq = target_rq,
+ .src_cpu = busiest_rq->cpu,
+ .src_rq = busiest_rq,
+ .idle = CPU_IDLE,
+ };
+
+ schedstat_inc(sd, alb_count);
+
+ if (move_specific_task(&env, p))
+ schedstat_inc(sd, alb_pushed);
+ else
+ schedstat_inc(sd, alb_failed);
+ }
+ rcu_read_unlock();
+ double_unlock_balance(busiest_rq, target_rq);
+out_unlock:
+ busiest_rq->active_balance = 0;
+ raw_spin_unlock_irq(&busiest_rq->lock);
+ return 0;
+}
+
+static DEFINE_SPINLOCK(hmp_force_migration);
+
+/*
+ * hmp_force_up_migration checks runqueues for tasks that need to
+ * be actively migrated to a faster cpu.
+ */
+static void hmp_force_up_migration(int this_cpu)
+{
+ int cpu;
+ struct sched_entity *curr;
+ struct rq *target;
+ unsigned long flags;
+ unsigned int force;
+ struct task_struct *p;
+
+ if (!spin_trylock(&hmp_force_migration))
+ return;
+ for_each_online_cpu(cpu) {
+ force = 0;
+ target = cpu_rq(cpu);
+ raw_spin_lock_irqsave(&target->lock, flags);
+ curr = target->cfs.curr;
+ if (!curr || !entity_is_task(curr)) {
+ raw_spin_unlock_irqrestore(&target->lock, flags);
+ continue;
+ }
+ p = task_of(curr);
+ if (hmp_up_migration(cpu, curr)) {
+ if (!target->active_balance) {
+ target->active_balance = 1;
+ target->push_cpu = hmp_select_faster_cpu(p, cpu);
+ target->migrate_task = p;
+ force = 1;
+ trace_sched_hmp_migrate(p, target->push_cpu, 1);
+ hmp_next_up_delay(&p->se, target->push_cpu);
+ }
+ }
+ raw_spin_unlock_irqrestore(&target->lock, flags);
+ if (force)
+ stop_one_cpu_nowait(cpu_of(target),
+ hmp_active_task_migration_cpu_stop,
+ target, &target->active_balance_work);
+ }
+ spin_unlock(&hmp_force_migration);
+}
+#else
+static void hmp_force_up_migration(int this_cpu) { }
+#endif /* CONFIG_SCHED_HMP */
+
/*
* run_rebalance_domains is triggered when needed from the scheduler tick.
* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
@@ -4897,6 +6347,8 @@ static void run_rebalance_domains(struct softirq_action *h)
enum cpu_idle_type idle = this_rq->idle_balance ?
CPU_IDLE : CPU_NOT_IDLE;
+ hmp_force_up_migration(this_cpu);
+
rebalance_domains(this_cpu, idle);
/*
@@ -4954,6 +6406,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
cfs_rq = cfs_rq_of(se);
entity_tick(cfs_rq, se, queued);
}
+
+ update_rq_runnable_avg(rq, 1);
}
/*
@@ -5046,6 +6500,20 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
place_entity(cfs_rq, se, 0);
se->vruntime -= cfs_rq->min_vruntime;
}
+
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+ /*
+ * Remove our load from contribution when we leave sched_fair
+ * and ensure we don't carry in an old decay_count if we
+ * switch back.
+ */
+ if (p->se.avg.decay_count) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
+ __synchronize_entity_decay(&p->se);
+ subtract_blocked_load_contrib(cfs_rq,
+ p->se.avg.load_avg_contrib);
+ }
+#endif
}
/*
@@ -5092,11 +6560,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
#ifndef CONFIG_64BIT
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+ atomic64_set(&cfs_rq->decay_counter, 1);
+ atomic64_set(&cfs_rq->removed_load, 0);
+#endif
}
#ifdef CONFIG_FAIR_GROUP_SCHED
static void task_move_group_fair(struct task_struct *p, int on_rq)
{
+ struct cfs_rq *cfs_rq;
/*
* If the task was not on the rq at the time of this cgroup movement
* it must have been asleep, sleeping tasks keep their ->vruntime
@@ -5128,8 +6601,19 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
if (!on_rq)
p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
set_task_rq(p, task_cpu(p));
- if (!on_rq)
- p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
+ if (!on_rq) {
+ cfs_rq = cfs_rq_of(&p->se);
+ p->se.vruntime += cfs_rq->min_vruntime;
+#ifdef CONFIG_SMP
+ /*
+ * migrate_task_rq_fair() will have removed our previous
+ * contribution, but we must synchronize for ongoing future
+ * decay.
+ */
+ p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+ cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
+#endif
+ }
}
void free_fair_sched_group(struct task_group *tg)
@@ -5214,10 +6698,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
cfs_rq->tg = tg;
cfs_rq->rq = rq;
-#ifdef CONFIG_SMP
- /* allow initial update_cfs_load() to truncate */
- cfs_rq->load_stamp = 1;
-#endif
init_cfs_rq_runtime(cfs_rq);
tg->cfs_rq[cpu] = cfs_rq;
@@ -5264,8 +6744,11 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
se = tg->se[i];
/* Propagate contribution to hierarchy */
raw_spin_lock_irqsave(&rq->lock, flags);
- for_each_sched_entity(se)
+ for_each_sched_entity(se) {
update_cfs_shares(group_cfs_rq(se));
+ /* update contribution to parent */
+ update_entity_load_avg(se, 1);
+ }
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -5319,6 +6802,7 @@ const struct sched_class fair_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_fair,
+ .migrate_task_rq = migrate_task_rq_fair,
.rq_online = rq_online_fair,
.rq_offline = rq_offline_fair,
@@ -5363,6 +6847,139 @@ __init void init_sched_fair_class(void)
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
cpu_notifier(sched_ilb_notifier, 0);
#endif
+
+#ifdef CONFIG_SCHED_HMP
+ hmp_cpu_mask_setup();
+#endif
#endif /* SMP */
}
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+static u32 cpufreq_calc_scale(u32 min, u32 max, u32 curr)
+{
+ u32 result = curr / max;
+ return result;
+}
+
+/* Called when the CPU Frequency is changed.
+ * Once for each CPU.
+ */
+static int cpufreq_callback(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ int cpu = freq->cpu;
+ struct cpufreq_extents *extents;
+
+ if (freq->flags & CPUFREQ_CONST_LOOPS)
+ return NOTIFY_OK;
+
+ if (val != CPUFREQ_POSTCHANGE)
+ return NOTIFY_OK;
+
+ /* if dynamic load scale is disabled, set the load scale to 1.0 */
+ if (!hmp_data.freqinvar_load_scale_enabled) {
+ freq_scale[cpu].curr_scale = 1024;
+ return NOTIFY_OK;
+ }
+
+ extents = &freq_scale[cpu];
+ if (extents->flags & SCHED_LOAD_FREQINVAR_SINGLEFREQ) {
+ /* If our governor was recognised as a single-freq governor,
+ * use 1.0
+ */
+ extents->curr_scale = 1024;
+ } else {
+ extents->curr_scale = cpufreq_calc_scale(extents->min,
+ extents->max, freq->new);
+ }
+
+ return NOTIFY_OK;
+}
+
+/* Called when the CPUFreq governor is changed.
+ * Only called for the CPUs which are actually changed by the
+ * userspace.
+ */
+static int cpufreq_policy_callback(struct notifier_block *nb,
+ unsigned long event, void *data)
+{
+ struct cpufreq_policy *policy = data;
+ struct cpufreq_extents *extents;
+ int cpu, singleFreq = 0;
+ static const char performance_governor[] = "performance";
+ static const char powersave_governor[] = "powersave";
+
+ if (event == CPUFREQ_START)
+ return 0;
+
+ if (event != CPUFREQ_INCOMPATIBLE)
+ return 0;
+
+ /* CPUFreq governors do not accurately report the range of
+ * CPU Frequencies they will choose from.
+ * We recognise performance and powersave governors as
+ * single-frequency only.
+ */
+ if (!strncmp(policy->governor->name, performance_governor,
+ strlen(performance_governor)) ||
+ !strncmp(policy->governor->name, powersave_governor,
+ strlen(powersave_governor)))
+ singleFreq = 1;
+
+ /* Make sure that all CPUs impacted by this policy are
+ * updated since we will only get a notification when the
+ * user explicitly changes the policy on a CPU.
+ */
+ for_each_cpu(cpu, policy->cpus) {
+ extents = &freq_scale[cpu];
+ extents->max = policy->max >> SCHED_FREQSCALE_SHIFT;
+ extents->min = policy->min >> SCHED_FREQSCALE_SHIFT;
+ if (!hmp_data.freqinvar_load_scale_enabled) {
+ extents->curr_scale = 1024;
+ } else if (singleFreq) {
+ extents->flags |= SCHED_LOAD_FREQINVAR_SINGLEFREQ;
+ extents->curr_scale = 1024;
+ } else {
+ extents->flags &= ~SCHED_LOAD_FREQINVAR_SINGLEFREQ;
+ extents->curr_scale = cpufreq_calc_scale(extents->min,
+ extents->max, policy->cur);
+ }
+ }
+
+ return 0;
+}
+
+static struct notifier_block cpufreq_notifier = {
+ .notifier_call = cpufreq_callback,
+};
+static struct notifier_block cpufreq_policy_notifier = {
+ .notifier_call = cpufreq_policy_callback,
+};
+
+static int __init register_sched_cpufreq_notifier(void)
+{
+ int ret = 0;
+
+ /* init safe defaults since there are no policies at registration */
+ for (ret = 0; ret < CONFIG_NR_CPUS; ret++) {
+ /* safe defaults */
+ freq_scale[ret].max = 1024;
+ freq_scale[ret].min = 1024;
+ freq_scale[ret].curr_scale = 1024;
+ }
+
+ pr_info("sched: registering cpufreq notifiers for scale-invariant loads\n");
+ ret = cpufreq_register_notifier(&cpufreq_policy_notifier,
+ CPUFREQ_POLICY_NOTIFIER);
+
+ if (ret != -EINVAL)
+ ret = cpufreq_register_notifier(&cpufreq_notifier,
+ CPUFREQ_TRANSITION_NOTIFIER);
+
+ return ret;
+}
+
+core_initcall(register_sched_cpufreq_notifier);
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7a7db09cfab..b898762f5d6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -112,6 +112,8 @@ struct task_group {
unsigned long shares;
atomic_t load_weight;
+ atomic64_t load_avg;
+ atomic_t runnable_avg, usage_avg;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
@@ -222,22 +224,22 @@ struct cfs_rq {
unsigned int nr_spread_over;
#endif
-#ifdef CONFIG_FAIR_GROUP_SCHED
- struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
-
+#ifdef CONFIG_SMP
/*
- * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
- * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
- * (like users, containers etc.)
- *
- * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
- * list is used during load balance.
+ * CFS Load tracking
+ * Under CFS, load is tracked on a per-entity basis and aggregated up.
+ * This allows for the description of both thread and group usage (in
+ * the FAIR_GROUP_SCHED case).
*/
- int on_list;
- struct list_head leaf_cfs_rq_list;
- struct task_group *tg; /* group that "owns" this runqueue */
+ u64 runnable_load_avg, blocked_load_avg;
+ atomic64_t decay_counter, removed_load;
+ u64 last_decay;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ u32 tg_runnable_contrib, tg_usage_contrib;
+ u64 tg_load_contrib;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
-#ifdef CONFIG_SMP
/*
* h_load = weight * f(tg)
*
@@ -245,26 +247,30 @@ struct cfs_rq {
* this group.
*/
unsigned long h_load;
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
/*
- * Maintaining per-cpu shares distribution for group scheduling
+ * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
+ * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
+ * (like users, containers etc.)
*
- * load_stamp is the last time we updated the load average
- * load_last is the last time we updated the load average and saw load
- * load_unacc_exec_time is currently unaccounted execution time
+ * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
+ * list is used during load balance.
*/
- u64 load_avg;
- u64 load_period;
- u64 load_stamp, load_last, load_unacc_exec_time;
+ int on_list;
+ struct list_head leaf_cfs_rq_list;
+ struct task_group *tg; /* group that "owns" this runqueue */
- unsigned long load_contribution;
-#endif /* CONFIG_SMP */
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
u64 runtime_expires;
s64 runtime_remaining;
- u64 throttled_timestamp;
+ u64 throttled_clock, throttled_clock_task;
+ u64 throttled_clock_task_time;
int throttled, throttle_count;
struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
@@ -412,6 +418,9 @@ struct rq {
int active_balance;
int push_cpu;
struct cpu_stop_work active_balance_work;
+#ifdef CONFIG_SCHED_HMP
+ struct task_struct *migrate_task;
+#endif
/* cpu of this runqueue: */
int cpu;
int online;
@@ -467,6 +476,8 @@ struct rq {
#ifdef CONFIG_SMP
struct llist_head wake_list;
#endif
+
+ struct sched_avg avg;
};
static inline int cpu_of(struct rq *rq)
@@ -532,6 +543,12 @@ DECLARE_PER_CPU(int, sd_llc_id);
extern int group_balance_cpu(struct sched_group *sg);
+#ifdef CONFIG_SCHED_HMP
+static LIST_HEAD(hmp_domains);
+DECLARE_PER_CPU(struct hmp_domain *, hmp_cpu_domain);
+#define hmp_cpu_domain(cpu) (per_cpu(hmp_cpu_domain, (cpu)))
+#endif /* CONFIG_SCHED_HMP */
+
#endif /* CONFIG_SMP */
#include "stats.h"
@@ -861,6 +878,7 @@ static inline void idle_balance(int cpu, struct rq *rq)
extern void sysrq_sched_debug_show(void);
extern void sched_init_granularity(void);
+extern void update_packing_domain(int cpu);
extern void update_max_interval(void);
extern void update_group_power(struct sched_domain *sd, int cpu);
extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
@@ -1212,4 +1230,3 @@ static inline u64 irq_time_read(int cpu)
}
#endif /* CONFIG_64BIT */
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
diff --git a/linaro/configs/big-LITTLE-MP.conf b/linaro/configs/big-LITTLE-MP.conf
new file mode 100644
index 00000000000..8cc2be049a4
--- /dev/null
+++ b/linaro/configs/big-LITTLE-MP.conf
@@ -0,0 +1,13 @@
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_FAIR_GROUP_SCHED=y
+CONFIG_NO_HZ=y
+CONFIG_SCHED_MC=y
+CONFIG_DISABLE_CPU_SCHED_DOMAIN_BALANCE=y
+CONFIG_SCHED_HMP=y
+CONFIG_HMP_FAST_CPU_MASK=""
+CONFIG_HMP_SLOW_CPU_MASK=""
+CONFIG_HMP_VARIABLE_SCALE=y
+CONFIG_HMP_FREQUENCY_INVARIANT_SCALE=y
+CONFIG_SCHED_HMP_PRIO_FILTER=y
+CONFIG_SCHED_HMP_PRIO_FILTER_VAL=5