Diffstat (limited to 'kernel/sched/tune.c')
-rw-r--r-- | kernel/sched/tune.c | 681 |
1 file changed, 681 insertions, 0 deletions
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
new file mode 100644
index 000000000000..2ec7e63524a5
--- /dev/null
+++ b/kernel/sched/tune.c
@@ -0,0 +1,681 @@
+#include <linux/cgroup.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/printk.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
+
+#include <trace/events/sched.h>
+
+#include "sched.h"
+#include "tune.h"
+
+unsigned int sysctl_sched_cfs_boost __read_mostly = 0;
+
+/* Performance Boost region (B) threshold params */
+static int perf_boost_idx;
+
+/* Performance Constraint region (C) threshold params */
+static int perf_constrain_idx;
+
+/**
+ * Performance-Energy (P-E) Space threshold constants
+ */
+struct threshold_params {
+	int nrg_gain;
+	int cap_gain;
+};
+
+/*
+ * System specific P-E space threshold constants
+ */
+static struct threshold_params
+threshold_gains[] = {
+	{ 0, 4 }, /* >=  0% */
+	{ 1, 4 }, /* >= 10% */
+	{ 2, 4 }, /* >= 20% */
+	{ 3, 4 }, /* >= 30% */
+	{ 4, 4 }, /* >= 40% */
+	{ 4, 3 }, /* >= 50% */
+	{ 4, 2 }, /* >= 60% */
+	{ 4, 1 }, /* >= 70% */
+	{ 4, 0 }, /* >= 80% */
+	{ 4, 0 }  /* >= 90% */
+};
+
+/*
+ * System energy normalization constants
+ */
+struct target_nrg {
+	unsigned long min_power;
+	unsigned long max_power;
+	unsigned long nrg_shift;
+	unsigned long nrg_mult;
+};
+
+/*
+ * Target specific system energy normalization constants
+ * NOTE: These values are specific to ARM TC2 and they are derived from the
+ * energy model data defined in: arch/arm/kernel/topology.c
+ */
+static struct target_nrg
+schedtune_target_nrg = {
+
+	/*
+	 * TC2 Min CPUs power:
+	 * all CPUs idle, all clusters in deep idle:
+	 *   0 * 3 + 0 * 2 + 10 + 25
+	 */
+	.min_power = 35,
+
+	/*
+	 * TC2 Max CPUs power:
+	 * all CPUs fully utilized while running at max OPP:
+	 *   1024 * 3 + 6997 * 2 + 4905 + 15200
+	 */
+	.max_power = 37171,
+
+	/*
+	 * Fast integer division by constant:
+	 *  Constant   : Max - Min       (C) = 37171 - 35 = 37136
+	 *  Precision  : 0.1%            (P) = 0.1
+	 *  Reference  : C * 100 / P     (R) = 37136000
+	 *
+	 * Thus:
+	 *  Shift bits : ceil(log(R,2))  (S) = 26
+	 *  Mult const : round(2^S/C)    (M) = 1807
+	 *
+	 * This allows computing the normalized energy:
+	 *   system_energy / C
+	 * as:
+	 *   (system_energy * M) >> S
+	 */
+	.nrg_shift = 26,	/* S */
+	.nrg_mult = 1807,	/* M */
+};
+
+/*
+ * System energy normalization
+ * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE],
+ * corresponding to the specified energy variation.
+ */
+int
+schedtune_normalize_energy(int energy_diff)
+{
+	long long normalized_nrg = energy_diff;
+	int max_delta;
+
+	/* Check for boundaries */
+	max_delta  = schedtune_target_nrg.max_power;
+	max_delta -= schedtune_target_nrg.min_power;
+	WARN_ON(abs(energy_diff) >= max_delta);
+
+	/* Scale by energy magnitude */
+	normalized_nrg <<= SCHED_LOAD_SHIFT;
+
+	/* Normalize on max energy for target platform */
+	normalized_nrg *= schedtune_target_nrg.nrg_mult;
+	normalized_nrg >>= schedtune_target_nrg.nrg_shift;
+
+	return normalized_nrg;
+}
+
+static int
+__schedtune_accept_deltas(int nrg_delta, int cap_delta,
+			  int perf_boost_idx, int perf_constrain_idx)
+{
+	int energy_payoff;
+
+	/* Performance Boost (B) region */
+	if (nrg_delta > 0 && cap_delta > 0) {
+		/*
+		 * energy_payoff criteria:
+		 *    cap_delta / nrg_delta > cap_gain / nrg_gain
+		 * which is:
+		 *    nrg_delta * cap_gain < cap_delta * nrg_gain
+		 */
+		energy_payoff  = cap_delta * threshold_gains[perf_boost_idx].nrg_gain;
+		energy_payoff -= nrg_delta * threshold_gains[perf_boost_idx].cap_gain;
+
+		trace_sched_tune_filter(
+				threshold_gains[perf_boost_idx].nrg_gain,
+				threshold_gains[perf_boost_idx].cap_gain,
+				energy_payoff, 8);
+
+		return energy_payoff;
+	}
+
+	/* Performance Constraint (C) region */
+	if (nrg_delta < 0 && cap_delta < 0) {
+		/*
+		 * energy_payoff criteria:
+		 *    cap_delta / nrg_delta < cap_gain / nrg_gain
+		 * which is:
+		 *    nrg_delta * cap_gain > cap_delta * nrg_gain
+		 */
+		energy_payoff  = nrg_delta * threshold_gains[perf_constrain_idx].cap_gain;
+		energy_payoff -= cap_delta * threshold_gains[perf_constrain_idx].nrg_gain;
+
+		trace_sched_tune_filter(
+				threshold_gains[perf_constrain_idx].nrg_gain,
+				threshold_gains[perf_constrain_idx].cap_gain,
+				energy_payoff, 6);
+
+		return energy_payoff;
+	}
+
+	/* Default: reject schedule candidate */
+	return -INT_MAX;
+}
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+
+/*
+ * EAS scheduler tunables for task groups.
+ */
+
+/* SchedTune tunables for a group of tasks */
+struct schedtune {
+	/* SchedTune CGroup subsystem */
+	struct cgroup_subsys_state css;
+
+	/* Boost group allocated ID */
+	int idx;
+
+	/* Boost value for tasks on that SchedTune CGroup */
+	int boost;
+
+	/* Performance Boost (B) region threshold params */
+	int perf_boost_idx;
+
+	/* Performance Constraint (C) region threshold params */
+	int perf_constrain_idx;
+};
+
+static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
+{
+	return css ? container_of(css, struct schedtune, css) : NULL;
+}
+
+static inline struct schedtune *task_schedtune(struct task_struct *tsk)
+{
+	return css_st(task_css(tsk, schedtune_cgrp_id));
+}
+
+static inline struct schedtune *parent_st(struct schedtune *st)
+{
+	return css_st(st->css.parent);
+}
+
+/*
+ * SchedTune root control group
+ * The root control group is used to define a system-wide boosting tuning,
+ * which is applied to all tasks in the system.
+ * Task specific boost tuning could be specified by creating and
+ * configuring a child control group under the root one.
+ * By default, system-wide boosting is disabled, i.e. no boosting is applied
+ * to tasks which are not in a child control group.
+ */
+static struct schedtune
+root_schedtune = {
+	.boost	= 0,
+	.perf_boost_idx = 0,
+	.perf_constrain_idx = 0,
+};
+
+int
+schedtune_accept_deltas(int nrg_delta, int cap_delta,
+			struct task_struct *task)
+{
+	struct schedtune *ct;
+	int perf_boost_idx;
+	int perf_constrain_idx;
+
+	/* Optimal (O) region */
+	if (nrg_delta < 0 && cap_delta > 0) {
+		trace_sched_tune_filter(0, 0, 1, 0);
+		return INT_MAX;
+	}
+
+	/* Suboptimal (S) region */
+	if (nrg_delta > 0 && cap_delta < 0) {
+		trace_sched_tune_filter(0, 0, -1, 5);
+		return -INT_MAX;
+	}
+
+	/* Get task specific perf Boost/Constraints indexes */
+	rcu_read_lock();
+	ct = task_schedtune(task);
+	perf_boost_idx = ct->perf_boost_idx;
+	perf_constrain_idx = ct->perf_constrain_idx;
+	rcu_read_unlock();
+
+	return __schedtune_accept_deltas(nrg_delta, cap_delta,
+			perf_boost_idx, perf_constrain_idx);
+}
+
+/*
+ * Maximum number of boost groups to support
+ * When per-task boosting is used we still allow only a limited number of
+ * boost groups for two main reasons:
+ * 1. on a real system we usually have only few classes of workloads which
+ *    make sense to boost with different values (e.g. background vs foreground
+ *    tasks, interactive vs low-priority tasks)
+ * 2. a limited number allows for a simpler and more memory/time efficient
+ *    implementation especially for the computation of the per-CPU boost
+ *    value
+ */
+#define BOOSTGROUPS_COUNT 16
+
+/* Array of configured boostgroups */
+static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = {
+	&root_schedtune,
+	NULL,
+};
+
+/* SchedTune boost groups
+ * Each CPU in the system could be affected by multiple boost groups, for
+ * example when a CPU has two RUNNABLE tasks belonging to two different boost
+ * groups and thus likely with different boost values.
+ * This data structure keeps track of all the boost groups which could impact
+ * on a CPU.
+ * Since on each system we expect only a limited number of boost
+ * groups, here we use a simple array to keep track of the metrics required to
+ * compute the maximum per-CPU boosting value.
+ */
+struct boost_groups {
+	/* Maximum boost value for all RUNNABLE tasks on a CPU */
+	unsigned boost_max;
+	struct {
+		/* The boost for tasks on that boost group */
+		unsigned boost;
+		/* Count of RUNNABLE tasks on that boost group */
+		unsigned tasks;
+	} group[BOOSTGROUPS_COUNT];
+};
+
+/* Boost groups affecting each CPU in the system */
+DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
+
+static void
+schedtune_cpu_update(int cpu)
+{
+	struct boost_groups *bg;
+	unsigned boost_max;
+	int idx;
+
+	bg = &per_cpu(cpu_boost_groups, cpu);
+
+	/* The root boost group is always active */
+	boost_max = bg->group[0].boost;
+	for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
+		/*
+		 * A boost group affects a CPU only if it has
+		 * RUNNABLE tasks on that CPU
+		 */
+		if (bg->group[idx].tasks == 0)
+			continue;
+		boost_max = max(boost_max, bg->group[idx].boost);
+	}
+
+	bg->boost_max = boost_max;
+}
+
+static int
+schedtune_boostgroup_update(int idx, int boost)
+{
+	struct boost_groups *bg;
+	int cur_boost_max;
+	int old_boost;
+	int cpu;
+
+	/* Update per CPU boost groups */
+	for_each_possible_cpu(cpu) {
+		bg = &per_cpu(cpu_boost_groups, cpu);
+
+		/*
+		 * Keep track of current boost values to compute the per CPU
+		 * maximum only when it has been affected by the new value of
+		 * the updated boost group
+		 */
+		cur_boost_max = bg->boost_max;
+		old_boost = bg->group[idx].boost;
+
+		/* Update the boost value of this boost group */
+		bg->group[idx].boost = boost;
+
+		/* Check if this update increases the current max */
+		if (boost > cur_boost_max && bg->group[idx].tasks) {
+			bg->boost_max = boost;
+			trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max);
+			continue;
+		}
+
+		/* Check if this update has decreased the current max */
+		if (cur_boost_max == old_boost && old_boost > boost) {
+			schedtune_cpu_update(cpu);
+			trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max);
+			continue;
+		}
+
+		trace_sched_tune_boostgroup_update(cpu, 0, bg->boost_max);
+	}
+
+	return 0;
+}
+
+static inline void
+schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
+{
+	struct boost_groups *bg;
+	int tasks;
+
+	bg = &per_cpu(cpu_boost_groups, cpu);
+
+	/* Update boosted tasks count while preventing it from going negative */
+	if (task_count < 0 && bg->group[idx].tasks <= -task_count)
+		bg->group[idx].tasks = 0;
+	else
+		bg->group[idx].tasks += task_count;
+
+	/* Boost group activation or deactivation on that RQ */
+	tasks = bg->group[idx].tasks;
+	if (tasks == 1 || tasks == 0)
+		schedtune_cpu_update(cpu);
+
+	trace_sched_tune_tasks_update(p, cpu, tasks, idx,
+			bg->group[idx].boost, bg->boost_max);
+}
+
+/*
+ * NOTE: This function must be called while holding the lock on the CPU RQ
+ */
+void schedtune_enqueue_task(struct task_struct *p, int cpu)
+{
+	struct schedtune *st;
+	int idx;
+
+	/*
+	 * When a task is marked PF_EXITING by do_exit() it's going to be
+	 * dequeued and enqueued multiple times in the exit path.
+	 * Thus we avoid any further update, since we do not want to change
+	 * CPU boosting while the task is exiting.
+	 */
+	if (p->flags & PF_EXITING)
+		return;
+
+	/* Get task boost group */
+	rcu_read_lock();
+	st = task_schedtune(p);
+	idx = st->idx;
+	rcu_read_unlock();
+
+	schedtune_tasks_update(p, cpu, idx, 1);
+}
+
+/*
+ * NOTE: This function must be called while holding the lock on the CPU RQ
+ */
+void schedtune_dequeue_task(struct task_struct *p, int cpu)
+{
+	struct schedtune *st;
+	int idx;
+
+	/*
+	 * When a task is marked PF_EXITING by do_exit() it's going to be
+	 * dequeued and enqueued multiple times in the exit path.
+	 * Thus we avoid any further update, since we do not want to change
+	 * CPU boosting while the task is exiting.
+	 * The last dequeue will be done by the cgroup exit() callback.
+	 */
+	if (p->flags & PF_EXITING)
+		return;
+
+	/* Get task boost group */
+	rcu_read_lock();
+	st = task_schedtune(p);
+	idx = st->idx;
+	rcu_read_unlock();
+
+	schedtune_tasks_update(p, cpu, idx, -1);
+}
+
+int schedtune_taskgroup_boost(struct task_struct *p)
+{
+	struct schedtune *ct;
+	int task_boost;
+
+	rcu_read_lock();
+	ct = task_schedtune(p);
+	task_boost = ct->boost;
+	rcu_read_unlock();
+
+	return task_boost;
+}
+
+int schedtune_cpu_boost(int cpu)
+{
+	struct boost_groups *bg;
+
+	bg = &per_cpu(cpu_boost_groups, cpu);
+	return bg->boost_max;
+}
+
+static u64
+boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+	struct schedtune *st = css_st(css);
+
+	return st->boost;
+}
+
+static int
+boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
+	    u64 boost)
+{
+	struct schedtune *st = css_st(css);
+	int err = 0;
+
+	if (boost < 0 || boost > 100) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	st->boost = boost;
+	if (css == &root_schedtune.css)
+		sysctl_sched_cfs_boost = boost;
+
+	if (boost == 100)
+		boost = 99;
+
+	/* Performance Boost (B) region threshold params */
+	st->perf_boost_idx  = boost;
+	st->perf_boost_idx /= 10;
+
+	/* Performance Constraint (C) region threshold params */
+	st->perf_constrain_idx  = 100 - boost;
+	st->perf_constrain_idx /= 10;
+
+	/* Update CPU boost */
+	schedtune_boostgroup_update(st->idx, st->boost);
+
+	trace_sched_tune_config(st->boost,
+			threshold_gains[st->perf_boost_idx].nrg_gain,
+			threshold_gains[st->perf_boost_idx].cap_gain,
+			threshold_gains[st->perf_constrain_idx].nrg_gain,
+			threshold_gains[st->perf_constrain_idx].cap_gain);
+
+out:
+	return err;
+}
+
+static struct cftype files[] = {
+	{
+		.name = "boost",
+		.read_u64 = boost_read,
+		.write_u64 = boost_write,
+	},
+	{ }	/* terminate */
+};
+
+static int
+schedtune_boostgroup_init(struct schedtune *st)
+{
+	struct boost_groups *bg;
+	int cpu;
+
+	/* Keep track of the allocated boost group */
+	allocated_group[st->idx] = st;
+
+	/* Initialize the per CPU boost groups */
+	for_each_possible_cpu(cpu) {
+		bg = &per_cpu(cpu_boost_groups, cpu);
+		bg->group[st->idx].boost = 0;
+		bg->group[st->idx].tasks = 0;
+	}
+
+	return 0;
+}
+
+static int
+schedtune_init(void)
+{
+	struct boost_groups *bg;
+	int cpu;
+
+	/* Initialize the per CPU boost groups */
+	for_each_possible_cpu(cpu) {
+		bg = &per_cpu(cpu_boost_groups, cpu);
+		memset(bg, 0, sizeof(struct boost_groups));
+	}
+
+	pr_info("schedtune: configured to support %d boost groups\n",
+		BOOSTGROUPS_COUNT);
+	return 0;
+}
+
+static struct cgroup_subsys_state *
+schedtune_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+	struct schedtune *st;
+	int idx;
+
+	if (!parent_css) {
+		schedtune_init();
+		return &root_schedtune.css;
+	}
+
+	/* Allow only single level hierarchies */
+	if (parent_css != &root_schedtune.css) {
+		pr_err("Nested SchedTune boosting groups not allowed\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* Allow only a limited number of boosting groups */
+	for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx)
+		if (allocated_group[idx] == NULL)
+			break;
+	if (idx == BOOSTGROUPS_COUNT) {
+		pr_err("Trying to create more than %d SchedTune boosting groups\n",
+		       BOOSTGROUPS_COUNT);
+		return ERR_PTR(-ENOSPC);
+	}
+
+	st = kzalloc(sizeof(*st), GFP_KERNEL);
+	if (!st)
+		goto out;
+
+	/* Initialize per CPUs boost group support */
+	st->idx = idx;
+	if (schedtune_boostgroup_init(st))
+		goto release;
+
+	return &st->css;
+
+release:
+	kfree(st);
+out:
+	return ERR_PTR(-ENOMEM);
+}
+
+static void
+schedtune_boostgroup_release(struct schedtune *st)
+{
+	/* Reset this group's boost */
+	schedtune_boostgroup_update(st->idx, 0);
+
+	/* Release the allocated boost group slot */
+	allocated_group[st->idx] = NULL;
+}
+
+static void
+schedtune_css_free(struct cgroup_subsys_state *css)
+{
+	struct schedtune *st = css_st(css);
+
+	schedtune_boostgroup_release(st);
+	kfree(st);
+}
+
+static void
+schedtune_exit(struct cgroup_subsys_state *css,
+	       struct cgroup_subsys_state *old_css,
+	       struct task_struct *tsk)
+{
+	struct schedtune *old_st = css_st(old_css);
+	int cpu = task_cpu(tsk);
+
+	schedtune_tasks_update(tsk, cpu, old_st->idx, -1);
+}
+
+struct cgroup_subsys schedtune_cgrp_subsys = {
+	.css_alloc	= schedtune_css_alloc,
+	.css_free	= schedtune_css_free,
+	.exit		= schedtune_exit,
+	.legacy_cftypes	= files,
+	.early_init	= 1,
+};
+
+#else /* CONFIG_CGROUP_SCHEDTUNE */
+
+int
+schedtune_accept_deltas(int nrg_delta, int cap_delta)
+{
+	/* Optimal (O) region */
+	if (nrg_delta < 0 && cap_delta > 0) {
+		trace_printk("schedtune_filter: region=O ngain=0 pgain=0 nrg_payoff=1");
+		return INT_MAX;
+	}
+
+	/* Suboptimal (S) region */
+	if (nrg_delta > 0 && cap_delta < 0) {
+		trace_printk("schedtune_filter: region=S ngain=0 pgain=0 nrg_payoff=-1");
+		return -INT_MAX;
+	}
+
+	return __schedtune_accept_deltas(nrg_delta, cap_delta,
+			perf_boost_idx, perf_constrain_idx);
+}
+
+#endif /* CONFIG_CGROUP_SCHEDTUNE */
+
+int
+sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
+			       void __user *buffer, size_t *lenp,
+			       loff_t *ppos)
+{
+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+	if (ret || !write)
+		return ret;
+
+	/* Performance Boost (B) region threshold params */
+	perf_boost_idx  = sysctl_sched_cfs_boost;
+	perf_boost_idx /= 10;
+
+	/* Performance Constraint (C) region threshold params */
+	perf_constrain_idx  = 100 - sysctl_sched_cfs_boost;
+	perf_constrain_idx /= 10;
+
+	return 0;
+}
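
The shift-and-multiply constants documented in schedtune_target_nrg are easy to sanity-check outside the kernel. The following standalone sketch is not part of the patch: it assumes SCHED_LOAD_SHIFT = 10 (so energies normalize into [0..1024]), reuses the TC2 values from the comments above, and compares the fast path of schedtune_normalize_energy() against exact division:

#include <stdio.h>
#include <stdlib.h>

/* Constants taken from the schedtune_target_nrg comments above;
 * LOAD_SHIFT = 10 is an assumed value for SCHED_LOAD_SHIFT.
 */
#define MIN_POWER	35
#define MAX_POWER	37171
#define NRG_SHIFT	26	/* S = ceil(log2(C * 100 / P)) */
#define NRG_MULT	1807	/* M = round(2^S / C), C = Max - Min */
#define LOAD_SHIFT	10	/* assumed SCHED_LOAD_SHIFT */

/* Mirrors schedtune_normalize_energy(): ((e << 10) * M) >> S ~ (e << 10) / C */
static long long normalize(int energy_diff)
{
	long long nrg = energy_diff;

	nrg <<= LOAD_SHIFT;
	nrg *= NRG_MULT;
	nrg >>= NRG_SHIFT;
	return nrg;
}

int main(void)
{
	const long long c = MAX_POWER - MIN_POWER;	/* C = 37136 */
	int e;

	for (e = -(int)c + 1; e < (int)c; e++) {
		long long fast = normalize(e);
		long long exact = ((long long)e << LOAD_SHIFT) / c;

		/* shift-mult vs. exact division: off by at most one step */
		if (llabs(fast - exact) > 1)
			printf("e=%d fast=%lld exact=%lld\n", e, fast, exact);
	}
	return 0;
}

With C = 37136, M = round(2^26 / C) = 1807 approximates the reciprocal to within roughly 0.006%, comfortably inside the 0.1% precision budget the comment states; the one-unit tolerance also absorbs the difference between the arithmetic right shift (which rounds negative values toward minus infinity) and C division (which truncates toward zero).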