Diffstat (limited to 'kernel/sched/tune.c')
-rw-r--r-- kernel/sched/tune.c | 681
1 file changed, 681 insertions(+), 0 deletions(-)
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
new file mode 100644
index 000000000000..2ec7e63524a5
--- /dev/null
+++ b/kernel/sched/tune.c
@@ -0,0 +1,681 @@
+#include <linux/cgroup.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/printk.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
+
+#include <trace/events/sched.h>
+
+#include "sched.h"
+#include "tune.h"
+
+unsigned int sysctl_sched_cfs_boost __read_mostly;
+
+/* Performance Boost region (B) threshold params */
+static int perf_boost_idx;
+
+/* Performance Constraint region (C) threshold params */
+static int perf_constrain_idx;
+
+/*
+ * Performance-Energy (P-E) space threshold constants
+ */
+struct threshold_params {
+ int nrg_gain;
+ int cap_gain;
+};
+
+/*
+ * System-specific P-E space threshold constants
+ */
+static struct threshold_params
+threshold_gains[] = {
+ { 0, 4 }, /* >= 0% */
+ { 1, 4 }, /* >= 10% */
+ { 2, 4 }, /* >= 20% */
+ { 3, 4 }, /* >= 30% */
+ { 4, 4 }, /* >= 40% */
+ { 4, 3 }, /* >= 50% */
+ { 4, 2 }, /* >= 60% */
+ { 4, 1 }, /* >= 70% */
+ { 4, 0 }, /* >= 80% */
+ { 4, 0 } /* >= 90% */
+};
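+
+/*
+ * Each entry defines the minimum capacity/energy "exchange rate" accepted
+ * within that 10% boost band: e.g. with a 20% boost (gains {2, 4}) a
+ * boost-region candidate is accepted only if
+ *   cap_delta / nrg_delta > cap_gain / nrg_gain = 4 / 2
+ * i.e. it must return more than 2 units of capacity per unit of energy
+ * spent (see __schedtune_accept_deltas() below).
+ */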
+
+/*
+ * System energy normalization constants
+ */
+struct target_nrg {
+ unsigned long min_power;
+ unsigned long max_power;
+ unsigned long nrg_shift;
+ unsigned long nrg_mult;
+};
+
+/*
+ * Target-specific system energy normalization constants
+ * NOTE: These values are specific to ARM TC2 and are derived from the
+ * energy model data defined in: arch/arm/kernel/topology.c
+ */
+static struct target_nrg
+schedtune_target_nrg = {
+ /*
+ * TC2 Min CPUs power:
+ * all CPUs idle, all clusters in deep idle:
+ * 0 * 3 + 0 * 2 + 10 + 25
+ */
+ .min_power = 35,
+
+ /*
+ * TC2 Max CPUs power:
+ * all CPUs fully utilized while running at max OPP:
+ * 1024 * 3 + 6997 * 2 + 4905 + 15200
+ */
+ .max_power = 37171,
+
+ /*
+ * Fast integer division by constant:
+ * Constant : Max - Min (C) = 37171 - 35 = 37136
+ * Precision : 0.1% (P) = 0.1
+ * Reference : C * 100 / P (R) = 3713600
+ *
+ * Thus:
+ * Shift bits : ceil(log(R,2)) (S) = 26
+ * Mult const : round(2^S/C) (M) = 1807
+ *
+ * This allows computing the normalized energy:
+ * system_energy / C
+ * as:
+ * (system_energy * M) >> S
+ */
+ .nrg_shift = 26, /* S */
+ .nrg_mult = 1807, /* M */
+};
+
+/*
+ * System energy normalization
+ * Returns the normalized value, in the range
+ * [-SCHED_LOAD_SCALE..SCHED_LOAD_SCALE], corresponding to the specified
+ * energy variation.
+ */
+int
+schedtune_normalize_energy(int energy_diff)
+{
+ long long normalized_nrg = energy_diff;
+ int max_delta;
+
+ /* Check for boundaries */
+ max_delta = schedtune_target_nrg.max_power;
+ max_delta -= schedtune_target_nrg.min_power;
+ WARN_ON(abs(energy_diff) >= max_delta);
+
+ /* Scale by energy magnitude */
+ normalized_nrg <<= SCHED_LOAD_SHIFT;
+
+ /* Normalize on max energy for target platform */
+ normalized_nrg *= schedtune_target_nrg.nrg_mult;
+ normalized_nrg >>= schedtune_target_nrg.nrg_shift;
+
+ return normalized_nrg;
+}
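+
+/*
+ * Worked example (assuming SCHED_LOAD_SHIFT = 10): an energy_diff of 18568,
+ * half of the 37136 max delta, normalizes to:
+ *   (18568 << 10) * 1807 >> 26 = 511
+ * i.e. roughly half of SCHED_LOAD_SCALE (1024), as expected.
+ */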
+
+static int
+__schedtune_accept_deltas(int nrg_delta, int cap_delta,
+			  int perf_boost_idx, int perf_constrain_idx)
+{
+ int energy_payoff;
+
+ /* Performance Boost (B) region */
+ if (nrg_delta > 0 && cap_delta > 0) {
+ /*
+ * energy_payoff criteria:
+ * cap_delta / nrg_delta > cap_gain / nrg_gain
+ * which is:
+ * nrg_delta * cap_gain < cap_delta * nrg_gain
+ */
+ energy_payoff = cap_delta * threshold_gains[perf_boost_idx].nrg_gain;
+ energy_payoff -= nrg_delta * threshold_gains[perf_boost_idx].cap_gain;
+
+ trace_sched_tune_filter(
+ threshold_gains[perf_boost_idx].nrg_gain,
+ threshold_gains[perf_boost_idx].cap_gain,
+ energy_payoff, 8);
+
+ return energy_payoff;
+ }
+
+ /* Performance Constraint (C) region */
+ if (nrg_delta < 0 && cap_delta < 0) {
+ /*
+ * energy_payoff criteria:
+ * cap_delta / nrg_delta < cap_gain / nrg_gain
+ * which is:
+ * nrg_delta * cap_gain > cap_delta * nrg_gain
+ */
+ energy_payoff = nrg_delta * threshold_gains[perf_constrain_idx].cap_gain;
+ energy_payoff -= cap_delta * threshold_gains[perf_constrain_idx].nrg_gain;
+
+ trace_sched_tune_filter(
+ threshold_gains[perf_constrain_idx].nrg_gain,
+ threshold_gains[perf_constrain_idx].cap_gain,
+ energy_payoff, 6);
+
+ return energy_payoff;
+ }
+
+ /* Default: reject schedule candidate */
+ return -INT_MAX;
+}
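+
+/*
+ * Worked example: with perf_boost_idx = 2 (gains {2, 4}), a candidate with
+ * nrg_delta = 10 and cap_delta = 30 yields:
+ *   energy_payoff = 30 * 2 - 10 * 4 = 20 > 0
+ * and is accepted, consistently with the criteria above, since
+ * cap_delta / nrg_delta = 3 > cap_gain / nrg_gain = 4 / 2.
+ */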
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+
+/*
+ * EAS scheduler tunables for task groups.
+ */
+
+/* SchedTune tunables for a group of tasks */
+struct schedtune {
+ /* SchedTune CGroup subsystem */
+ struct cgroup_subsys_state css;
+
+ /* Boost group allocated ID */
+ int idx;
+
+ /* Boost value for tasks on that SchedTune CGroup */
+ int boost;
+
+ /* Performance Boost (B) region threshold params */
+ int perf_boost_idx;
+
+ /* Performance Constraint (C) region threshold params */
+ int perf_constrain_idx;
+};
+
+static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct schedtune, css) : NULL;
+}
+
+static inline struct schedtune *task_schedtune(struct task_struct *tsk)
+{
+ return css_st(task_css(tsk, schedtune_cgrp_id));
+}
+
+static inline struct schedtune *parent_st(struct schedtune *st)
+{
+ return css_st(st->css.parent);
+}
+
+/*
+ * SchedTune root control group
+ * The root control group is used to define a system-wide boost value,
+ * which is applied to all tasks in the system.
+ * Task-specific boosting can be specified by creating and configuring a
+ * child control group under the root one.
+ * By default, system-wide boosting is disabled, i.e. no boost is applied
+ * to tasks which are not in a child control group.
+ */
+static struct schedtune
+root_schedtune = {
+ .boost = 0,
+ .perf_boost_idx = 0,
+ .perf_constrain_idx = 0,
+};
+
+int
+schedtune_accept_deltas(int nrg_delta, int cap_delta, struct task_struct *task)
+{
+ struct schedtune *ct;
+ int perf_boost_idx;
+ int perf_constrain_idx;
+
+ /* Optimal (O) region */
+ if (nrg_delta < 0 && cap_delta > 0) {
+ trace_sched_tune_filter(0, 0, 1, 0);
+ return INT_MAX;
+ }
+
+ /* Suboptimal (S) region */
+ if (nrg_delta > 0 && cap_delta < 0) {
+ trace_sched_tune_filter(0, 0, -1, 5);
+ return -INT_MAX;
+ }
+
+ /* Get task specific perf Boost/Constraints indexes */
+ rcu_read_lock();
+ ct = task_schedtune(task);
+ perf_boost_idx = ct->perf_boost_idx;
+ perf_constrain_idx = ct->perf_constrain_idx;
+ rcu_read_unlock();
+
+ return __schedtune_accept_deltas(nrg_delta, cap_delta,
+ perf_boost_idx, perf_constrain_idx);
+}
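+
+/*
+ * Summary of the P-E space regions evaluated above:
+ *   Optimal (O)    : nrg_delta < 0, cap_delta > 0 -> always accepted
+ *   Suboptimal (S) : nrg_delta > 0, cap_delta < 0 -> always rejected
+ *   Boost (B)      : nrg_delta > 0, cap_delta > 0 -> filtered by gains
+ *   Constraint (C) : nrg_delta < 0, cap_delta < 0 -> filtered by gains
+ */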
+
+/*
+ * Maximum number of boost groups to support
+ * When per-task boosting is used we still allow only a limited number of
+ * boost groups, for two main reasons:
+ * 1. on a real system we usually have only a few classes of workloads which
+ *    make sense to boost with different values (e.g. background vs foreground
+ *    tasks, interactive vs low-priority tasks)
+ * 2. a limited number allows for a simpler and more memory/time efficient
+ * implementation especially for the computation of the per-CPU boost
+ * value
+ */
+#define BOOSTGROUPS_COUNT 16
+
+/* Array of configured boostgroups */
+static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = {
+ &root_schedtune,
+ NULL,
+};
+
+/*
+ * SchedTune boost groups
+ * Each CPU in the system could be affected by multiple boost groups, for
+ * example when a CPU has two RUNNABLE tasks belonging to two different boost
+ * groups, and thus likely with different boost values.
+ * This data structure keeps track of all the boost groups which could impact
+ * each CPU.
+ * Since on each system we expect only a limited number of boost
+ * groups, here we use a simple array to keep track of the metrics required to
+ * compute the maximum per-CPU boosting value.
+ */
+struct boost_groups {
+ /* Maximum boost value for all RUNNABLE tasks on a CPU */
+ unsigned boost_max;
+ struct {
+ /* The boost for tasks on that boost group */
+ unsigned boost;
+ /* Count of RUNNABLE tasks on that boost group */
+ unsigned tasks;
+ } group[BOOSTGROUPS_COUNT];
+};
+
+/* Boost groups affecting each CPU in the system */
+DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
+
+static void
+schedtune_cpu_update(int cpu)
+{
+ struct boost_groups *bg;
+ unsigned boost_max;
+ int idx;
+
+ bg = &per_cpu(cpu_boost_groups, cpu);
+
+ /* The root boost group is always active */
+ boost_max = bg->group[0].boost;
+ for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
+ /*
+ * A boost group affects a CPU only if it has
+ * RUNNABLE tasks on that CPU
+ */
+ if (bg->group[idx].tasks == 0)
+ continue;
+ boost_max = max(boost_max, bg->group[idx].boost);
+ }
+
+ bg->boost_max = boost_max;
+}
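+
+/*
+ * For example: if on a CPU the root group has boost 10 while boost group 2
+ * has boost 50 and one RUNNABLE task, boost_max for that CPU is 50; once
+ * that task is dequeued, boost_max falls back to 10.
+ */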
+
+static int
+schedtune_boostgroup_update(int idx, int boost)
+{
+ struct boost_groups *bg;
+ int cur_boost_max;
+ int old_boost;
+ int cpu;
+
+ /* Update per CPU boost groups */
+ for_each_possible_cpu(cpu) {
+ bg = &per_cpu(cpu_boost_groups, cpu);
+
+ /*
+ * Keep track of current boost values to compute the per CPU
+ * maximum only when it has been affected by the new value of
+ * the updated boost group
+ */
+ cur_boost_max = bg->boost_max;
+ old_boost = bg->group[idx].boost;
+
+ /* Update the boost value of this boost group */
+ bg->group[idx].boost = boost;
+
+ /* Check if this update increases the current max */
+ if (boost > cur_boost_max && bg->group[idx].tasks) {
+ bg->boost_max = boost;
+ trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max);
+ continue;
+ }
+
+ /* Check if this update has decreased current max */
+ if (cur_boost_max == old_boost && old_boost > boost) {
+ schedtune_cpu_update(cpu);
+ trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max);
+ continue;
+ }
+
+ trace_sched_tune_boostgroup_update(cpu, 0, bg->boost_max);
+ }
+
+ return 0;
+}
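+
+/*
+ * For example: raising a group's boost from 10 to 60 on a CPU whose current
+ * max is 50 updates boost_max directly when the group has RUNNABLE tasks,
+ * while lowering the boost of the group currently defining the max forces
+ * a full rescan via schedtune_cpu_update().
+ */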
+
+static inline void
+schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
+{
+ struct boost_groups *bg;
+ int tasks;
+
+ bg = &per_cpu(cpu_boost_groups, cpu);
+
+ /* Update the boosted-tasks count, preventing it from going negative */
+ if (task_count < 0 && bg->group[idx].tasks <= -task_count)
+ bg->group[idx].tasks = 0;
+ else
+ bg->group[idx].tasks += task_count;
+
+ /* Boost group activation or deactivation on that RQ */
+ tasks = bg->group[idx].tasks;
+ if (tasks == 1 || tasks == 0)
+ schedtune_cpu_update(cpu);
+
+ trace_sched_tune_tasks_update(p, cpu, tasks, idx,
+ bg->group[idx].boost, bg->boost_max);
+}
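+
+/*
+ * On the enqueue/dequeue path the per-CPU boost_max can change only when
+ * a boost group becomes active (0 -> 1 tasks) or inactive (1 -> 0 tasks),
+ * which is why the update above is triggered only when the resulting
+ * count is 0 or 1.
+ */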
+
+/*
+ * NOTE: This function must be called while holding the lock on the CPU RQ
+ */
+void schedtune_enqueue_task(struct task_struct *p, int cpu)
+{
+ struct schedtune *st;
+ int idx;
+
+ /*
+ * When a task is marked PF_EXITING by do_exit() it's going to be
+ * dequeued and enqueued multiple times in the exit path.
+ * Thus we avoid any further update, since we do not want to change
+ * CPU boosting while the task is exiting.
+ */
+ if (p->flags & PF_EXITING)
+ return;
+
+ /* Get task boost group */
+ rcu_read_lock();
+ st = task_schedtune(p);
+ idx = st->idx;
+ rcu_read_unlock();
+
+ schedtune_tasks_update(p, cpu, idx, 1);
+}
+
+/*
+ * NOTE: This function must be called while holding the lock on the CPU RQ
+ */
+void schedtune_dequeue_task(struct task_struct *p, int cpu)
+{
+ struct schedtune *st;
+ int idx;
+
+ /*
+ * When a task is marked PF_EXITING by do_exit() it's going to be
+ * dequeued and enqueued multiple times in the exit path.
+ * Thus we avoid any further update, since we do not want to change
+ * CPU boosting while the task is exiting.
+ * The last dequeue will be done by cgroup exit() callback.
+ */
+ if (p->flags & PF_EXITING)
+ return;
+
+ /* Get task boost group */
+ rcu_read_lock();
+ st = task_schedtune(p);
+ idx = st->idx;
+ rcu_read_unlock();
+
+ schedtune_tasks_update(p, cpu, idx, -1);
+}
+
+int schedtune_taskgroup_boost(struct task_struct *p)
+{
+ struct schedtune *ct;
+ int task_boost;
+
+ rcu_read_lock();
+ ct = task_schedtune(p);
+ task_boost = ct->boost;
+ rcu_read_unlock();
+
+ return task_boost;
+}
+
+int schedtune_cpu_boost(int cpu)
+{
+ struct boost_groups *bg;
+
+ bg = &per_cpu(cpu_boost_groups, cpu);
+ return bg->boost_max;
+}
+
+static u64
+boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ struct schedtune *st = css_st(css);
+
+ return st->boost;
+}
+
+static int
+boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 boost)
+{
+ struct schedtune *st = css_st(css);
+ int err = 0;
+
+ /* boost is a u64, so only the upper bound needs to be checked */
+ if (boost > 100) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ st->boost = boost;
+ if (css == &root_schedtune.css)
+ sysctl_sched_cfs_boost = boost;
+
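+ /*
+ * Clamp the corner values: a boost of 100 (or 0) would otherwise
+ * map to index 10 in the 10-entry threshold_gains[] table below
+ * (valid indexes: 0..9).
+ */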
+ if (boost == 100)
+ boost = 99;
+ if (boost == 0)
+ boost = 1;
+
+ /* Performance Boost (B) region threshold params */
+ st->perf_boost_idx = boost / 10;
+
+ /* Performance Constraint (C) region threshold params */
+ st->perf_constrain_idx = (100 - boost) / 10;
+
+ /* Update CPU boost */
+ schedtune_boostgroup_update(st->idx, st->boost);
+
+ trace_sched_tune_config(st->boost,
+ threshold_gains[st->perf_boost_idx].nrg_gain,
+ threshold_gains[st->perf_boost_idx].cap_gain,
+ threshold_gains[st->perf_constrain_idx].nrg_gain,
+ threshold_gains[st->perf_constrain_idx].cap_gain);
+
+out:
+ return err;
+}
+
+static struct cftype files[] = {
+ {
+ .name = "boost",
+ .read_u64 = boost_read,
+ .write_u64 = boost_write,
+ },
+ { } /* terminate */
+};
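+
+/*
+ * Illustrative usage from userspace (mount point and group names are
+ * arbitrary, assuming the legacy cgroup hierarchy):
+ *   mount -t cgroup -o schedtune stune /sys/fs/cgroup/stune
+ *   mkdir /sys/fs/cgroup/stune/foreground
+ *   echo 10 > /sys/fs/cgroup/stune/foreground/schedtune.boost
+ *   echo $PID > /sys/fs/cgroup/stune/foreground/tasks
+ */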
+
+static int
+schedtune_boostgroup_init(struct schedtune *st)
+{
+ struct boost_groups *bg;
+ int cpu;
+
+ /* Keep track of allocated boost group */
+ allocated_group[st->idx] = st;
+
+ /* Initialize the per CPU boost groups */
+ for_each_possible_cpu(cpu) {
+ bg = &per_cpu(cpu_boost_groups, cpu);
+ bg->group[st->idx].boost = 0;
+ bg->group[st->idx].tasks = 0;
+ }
+
+ return 0;
+}
+
+static int
+schedtune_init(void)
+{
+ struct boost_groups *bg;
+ int cpu;
+
+ /* Initialize the per CPU boost groups */
+ for_each_possible_cpu(cpu) {
+ bg = &per_cpu(cpu_boost_groups, cpu);
+ memset(bg, 0, sizeof(struct boost_groups));
+ }
+
+ pr_info(" schedtune configured to support %d boost groups\n",
+ BOOSTGROUPS_COUNT);
+ return 0;
+}
+
+static struct cgroup_subsys_state *
+schedtune_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct schedtune *st;
+ int idx;
+
+ if (!parent_css) {
+ schedtune_init();
+ return &root_schedtune.css;
+ }
+
+ /* Allow only single-level hierarchies */
+ if (parent_css != &root_schedtune.css) {
+ pr_err("Nested SchedTune boosting groups not allowed\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* Allow only a limited number of boosting groups */
+ for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx)
+ if (allocated_group[idx] == NULL)
+ break;
+ if (idx == BOOSTGROUPS_COUNT) {
+ pr_err("Trying to create more than %d SchedTune boosting groups\n",
+ BOOSTGROUPS_COUNT);
+ return ERR_PTR(-ENOSPC);
+ }
+
+ st = kzalloc(sizeof(*st), GFP_KERNEL);
+ if (!st)
+ goto out;
+
+ /* Initialize the per-CPU boost group support */
+ st->idx = idx;
+ if (schedtune_boostgroup_init(st))
+ goto release;
+
+ return &st->css;
+
+release:
+ kfree(st);
+out:
+ return ERR_PTR(-ENOMEM);
+}
+
+static void
+schedtune_boostgroup_release(struct schedtune *st)
+{
+ /* Reset this group boost */
+ schedtune_boostgroup_update(st->idx, 0);
+
+ /* Release this boost group's slot in allocated_group[] */
+ allocated_group[st->idx] = NULL;
+}
+
+static void
+schedtune_css_free(struct cgroup_subsys_state *css)
+{
+ struct schedtune *st = css_st(css);
+
+ schedtune_boostgroup_release(st);
+ kfree(st);
+}
+
+static void
+schedtune_exit(struct cgroup_subsys_state *css,
+ struct cgroup_subsys_state *old_css,
+ struct task_struct *tsk)
+{
+ struct schedtune *old_st = css_st(old_css);
+ int cpu = task_cpu(tsk);
+
+ schedtune_tasks_update(tsk, cpu, old_st->idx, -1);
+}
+
+struct cgroup_subsys schedtune_cgrp_subsys = {
+ .css_alloc = schedtune_css_alloc,
+ .css_free = schedtune_css_free,
+ .exit = schedtune_exit,
+ .legacy_cftypes = files,
+ .early_init = 1,
+};
+
+#else /* CONFIG_CGROUP_SCHEDTUNE */
+
+int
+schedtune_accept_deltas(int nrg_delta, int cap_delta)
+{
+ /* Optimal (O) region */
+ if (nrg_delta < 0 && cap_delta > 0) {
+ trace_printk("schedtune_filter: region=O ngain=0 pgain=0 nrg_payoff=-1");
+ return INT_MAX;
+ }
+
+ /* Suboptimal (S) region */
+ if (nrg_delta > 0 && cap_delta < 0) {
+ trace_printk("schedtune_filter: region=S ngain=0 pgain=0 nrg_payoff=1");
+ return -INT_MAX;
+ }
+
+ return __schedtune_accept_deltas(nrg_delta, cap_delta,
+ perf_boost_idx, perf_constrain_idx);
+}
+
+#endif /* CONFIG_CGROUP_SCHEDTUNE */
+
+int
+sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ unsigned int boost;
+
+ if (ret || !write)
+ return ret;
+
+ /* Clamp so that the /10 mappings below stay within threshold_gains[] */
+ boost = clamp(sysctl_sched_cfs_boost, 1U, 99U);
+
+ /* Performance Boost (B) region threshold params */
+ perf_boost_idx = boost / 10;
+
+ /* Performance Constraint (C) region threshold params */
+ perf_constrain_idx = (100 - boost) / 10;
+
+ return 0;
+}
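+
+/*
+ * For example, setting sysctl_sched_cfs_boost to 30 selects
+ * perf_boost_idx = 30 / 10 = 3 (gains {3, 4}) and
+ * perf_constrain_idx = (100 - 30) / 10 = 7 (gains {4, 1}).
+ */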
+