aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- kernel/sched/tune.c | 321
-rw-r--r-- kernel/sched/tune.h | 7
2 files changed, 328 insertions, 0 deletions
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index 3253a8732ba5..f4fbbcd28373 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -1,7 +1,9 @@
#include <linux/cgroup.h>
#include <linux/err.h>
+#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/printk.h>
+#include <linux/reciprocal_div.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
@@ -9,6 +11,84 @@
unsigned int sysctl_sched_cfs_boost __read_mostly;
+/*
+ * Energy normalization constants for the target system: the accumulated
+ * minimum and maximum power figures, plus the precomputed reciprocal used
+ * to divide by their difference.
+ */
+struct target_nrg {
+	unsigned long min_power;
+	unsigned long max_power;
+	struct reciprocal_value rdiv;
+};
+
+static struct target_nrg schedtune_target_nrg;
+
+/* Index into threshold_gains[] used for the Performance Boost (B) region */
+static int perf_boost_idx;
+
+/* Index into threshold_gains[] used for the Performance Constraint (C) region */
+static int perf_constrain_idx;
+
+/*
+ * Performance-Energy (P-E) space threshold: the pair of weights applied to
+ * the energy and capacity deltas when computing a candidate's payoff.
+ */
+struct threshold_params {
+ int nrg_gain; /* weight multiplied with the capacity delta */
+ int cap_gain; /* weight multiplied with the energy delta */
+};
+
+/*
+ * System specific P-E space threshold table.
+ * One row per 10% step; the row comments give the lower bound of each step.
+ */
+static struct threshold_params threshold_gains[] = {
+	[0] = { .nrg_gain = 0, .cap_gain = 4 },	/* >= 0% */
+	[1] = { .nrg_gain = 0, .cap_gain = 4 },	/* >= 10% */
+	[2] = { .nrg_gain = 1, .cap_gain = 4 },	/* >= 20% */
+	[3] = { .nrg_gain = 2, .cap_gain = 4 },	/* >= 30% */
+	[4] = { .nrg_gain = 3, .cap_gain = 4 },	/* >= 40% */
+	[5] = { .nrg_gain = 4, .cap_gain = 3 },	/* >= 50% */
+	[6] = { .nrg_gain = 4, .cap_gain = 2 },	/* >= 60% */
+	[7] = { .nrg_gain = 4, .cap_gain = 1 },	/* >= 70% */
+	[8] = { .nrg_gain = 4, .cap_gain = 0 },	/* >= 80% */
+	[9] = { .nrg_gain = 4, .cap_gain = 0 },	/* >= 90% */
+};
+
+/*
+ * Compute the payoff of a schedule candidate lying in the Performance
+ * Boost (B) or Performance Constraint (C) region of the P-E space.
+ *
+ * @nrg_delta: energy variation of the candidate
+ * @cap_delta: capacity variation of the candidate
+ * @perf_boost_idx: threshold_gains[] row to use for the B region
+ * @perf_constrain_idx: threshold_gains[] row to use for the C region
+ *
+ * Returns the weighted payoff for candidates in the B or C region and
+ * -INT_MAX when the deltas fall in neither of them.
+ */
+static int
+__schedtune_accept_deltas(int nrg_delta, int cap_delta,
+		int perf_boost_idx, int perf_constrain_idx)
+{
+	const struct threshold_params *gains;
+	int last_idx = ARRAY_SIZE(threshold_gains) - 1;
+
+	/*
+	 * The sysctl handler derives these indexes as boost / 10 and
+	 * (100 - boost) / 10: a boost of 100% or 0% respectively gives 10,
+	 * which is one past the last table row. Keep both within bounds.
+	 */
+	perf_boost_idx = clamp(perf_boost_idx, 0, last_idx);
+	perf_constrain_idx = clamp(perf_constrain_idx, 0, last_idx);
+
+	/* Performance Boost (B) region */
+	if (nrg_delta > 0 && cap_delta > 0) {
+		gains = &threshold_gains[perf_boost_idx];
+		/*
+		 * Evaluate "Performance Boost" vs "Energy Increase"
+		 * payoff criteria:
+		 *   cap_delta / nrg_delta < cap_gain / nrg_gain
+		 * which is:
+		 *   nrg_delta * cap_gain > cap_delta * nrg_gain
+		 */
+		return nrg_delta * gains->cap_gain -
+		       cap_delta * gains->nrg_gain;
+	}
+
+	/* Performance Constraint (C) region */
+	if (nrg_delta < 0 && cap_delta < 0) {
+		gains = &threshold_gains[perf_constrain_idx];
+		/*
+		 * Evaluate "Performance Constraint" vs "Energy Reduction"
+		 * payoff criteria:
+		 *   cap_delta / nrg_delta > cap_gain / nrg_gain
+		 * which is:
+		 *   cap_delta * nrg_gain > nrg_delta * cap_gain
+		 */
+		return cap_delta * gains->nrg_gain -
+		       nrg_delta * gains->cap_gain;
+	}
+
+	/* Default: reject schedule candidate */
+	return -INT_MAX;
+}
+
#ifdef CONFIG_CGROUP_SCHEDTUNE
/*
@@ -26,6 +106,11 @@ struct schedtune {
/* Boost value for tasks on that SchedTune CGroup */
int boost;
+ /* Performance Boost (B) region threshold params */
+ int perf_boost_idx;
+
+ /* Performance Constraint (C) region threshold params */
+ int perf_constrain_idx;
};
static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
@@ -55,8 +140,37 @@ static inline struct schedtune *parent_st(struct schedtune *st)
static struct schedtune
root_schedtune = {
 .boost = 0,
+ .perf_boost_idx = 0, /* first threshold_gains[] row for the B region */
+ .perf_constrain_idx = 0, /* first threshold_gains[] row for the C region */
};
+/*
+ * Evaluate a schedule candidate for @task from its energy and capacity
+ * variations, using the threshold indexes of the task's SchedTune group.
+ *
+ * Returns INT_MAX for the Optimal region, -INT_MAX for the Suboptimal
+ * region and the __schedtune_accept_deltas() payoff in all other cases.
+ */
+int
+schedtune_accept_deltas(int nrg_delta, int cap_delta,
+		struct task_struct *task)
+{
+	struct schedtune *st;
+	int boost_idx, constrain_idx;
+
+	if (nrg_delta < 0) {
+		/* Optimal (O) region: less energy, more capacity */
+		if (cap_delta > 0)
+			return INT_MAX;
+	} else if (nrg_delta > 0) {
+		/* Suboptimal (S) region: more energy, less capacity */
+		if (cap_delta < 0)
+			return -INT_MAX;
+	}
+
+	/* Snapshot the task specific perf Boost/Constraints indexes */
+	rcu_read_lock();
+	st = task_schedtune(task);
+	boost_idx = st->perf_boost_idx;
+	constrain_idx = st->perf_constrain_idx;
+	rcu_read_unlock();
+
+	return __schedtune_accept_deltas(nrg_delta, cap_delta,
+			boost_idx, constrain_idx);
+}
+
+
/*
* Maximum number of boost groups to support
* When per-task boosting is used we still allow only limited number of
@@ -396,6 +510,24 @@ struct cgroup_subsys schedtune_cgrp_subsys = {
.early_init = 1,
};
+#else /* CONFIG_CGROUP_SCHEDTUNE */
+
+/*
+ * Evaluate a schedule candidate from its energy and capacity variations
+ * using the system-wide threshold indexes; @task is not consulted here.
+ *
+ * Returns INT_MAX for the Optimal region, -INT_MAX for the Suboptimal
+ * region and the __schedtune_accept_deltas() payoff in all other cases.
+ */
+int
+schedtune_accept_deltas(int nrg_delta, int cap_delta,
+		struct task_struct *task)
+{
+	if (nrg_delta < 0) {
+		/* Optimal (O) region: less energy, more capacity */
+		if (cap_delta > 0)
+			return INT_MAX;
+	} else if (nrg_delta > 0) {
+		/* Suboptimal (S) region: more energy, less capacity */
+		if (cap_delta < 0)
+			return -INT_MAX;
+	}
+
+	return __schedtune_accept_deltas(nrg_delta, cap_delta,
+			perf_boost_idx, perf_constrain_idx);
+}
+
#endif /* CONFIG_CGROUP_SCHEDTUNE */
int
@@ -408,6 +540,195 @@ sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
if (ret || !write)
return ret;
+ /* Performance Boost (B) region threshold params */
+ perf_boost_idx = sysctl_sched_cfs_boost;
+ perf_boost_idx /= 10;
+
+ /* Performance Constraint (C) region threshold params */
+ perf_constrain_idx = 100 - sysctl_sched_cfs_boost;
+ perf_constrain_idx /= 10;
+
+ return 0;
+}
+
+/*
+ * System energy normalization
+ *
+ * The magnitude of @energy_diff is shifted left by SCHED_LOAD_SHIFT bits and
+ * divided through the precomputed reciprocal in schedtune_target_nrg.rdiv.
+ *
+ * Returns the normalized value, with a magnitude expected in the range
+ * [0..SCHED_LOAD_SCALE] and carrying the sign of the specified energy
+ * variation.
+ */
+int
+schedtune_normalize_energy(int energy_diff)
+{
+	u32 normalized_nrg;
+#ifdef CONFIG_SCHED_DEBUG
+	/* Only used by the boundary check, hence declared under the ifdef */
+	int max_delta;
+
+	/* Check for boundaries */
+	max_delta = schedtune_target_nrg.max_power;
+	max_delta -= schedtune_target_nrg.min_power;
+	WARN_ON(abs(energy_diff) >= max_delta);
+#endif
+
+	/* Do scaling using positive numbers to increase the range */
+	normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
+
+	/* Scale by energy magnitude */
+	normalized_nrg <<= SCHED_LOAD_SHIFT;
+
+	/* Normalize on max energy for target platform */
+	normalized_nrg = reciprocal_divide(
+			normalized_nrg, schedtune_target_nrg.rdiv);
+
+	/* Restore the sign dropped before scaling */
+	return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
+}
+
+
+#ifdef CONFIG_SCHED_DEBUG
+/*
+ * Log the normalized value of @delta_pwr and of its first five successive
+ * halvings, as a sanity check of the normalization constants.
+ */
+static void
+schedtune_test_nrg(unsigned long delta_pwr)
+{
+	int shift;
+
+	/*
+	 * Check normalization constants using some constant system
+	 * energy values
+	 */
+	pr_info("schedtune: verify normalization constants...\n");
+	for (shift = 0; shift < 6; shift++) {
+		unsigned long pwr = delta_pwr >> shift;
+		unsigned long norm;
+
+		/* Normalize on max energy for target platform */
+		norm = reciprocal_divide(pwr << SCHED_LOAD_SHIFT,
+					 schedtune_target_nrg.rdiv);
+
+		pr_info("schedtune: max_pwr/2^%d: %4lu => norm_pwr: %5lu\n",
+			shift, pwr, norm);
+	}
+}
+#else
+#define schedtune_test_nrg(delta_pwr)
+#endif
+
+/*
+ * Compute the min/max power consumption of a cluster and all its CPUs
+ *
+ * @sd: not used by this function
+ * @sg: sched group describing the cluster
+ * @ste: accumulator updated with the cluster's and its CPUs' power figures
+ *
+ * For the cluster and for each of its CPUs, the "min" figure is the power
+ * of the last idle state and the "max" figure is the power of the last
+ * capacity state found in the group's energy data.
+ *
+ * NOTE(review): the only visible caller, schedtune_init_late(), invokes
+ * this with rcu_read_lock() held.
+ */
+static void
+schedtune_add_cluster_nrg(
+		struct sched_domain *sd,
+		struct sched_group *sg,
+		struct target_nrg *ste)
+{
+	struct sched_domain *sd2;
+	struct sched_group *sg2;
+
+	struct cpumask *cluster_cpus;
+	char str[32];
+
+	unsigned long min_pwr;
+	unsigned long max_pwr;
+	int cpu;
+
+	/* Get Cluster energy using EM data for the first CPU */
+	cluster_cpus = sched_group_cpus(sg);
+	snprintf(str, sizeof(str), "CLUSTER[%*pbl]",
+		 cpumask_pr_args(cluster_cpus));
+
+	min_pwr = sg->sge->idle_states[sg->sge->nr_idle_states - 1].power;
+	max_pwr = sg->sge->cap_states[sg->sge->nr_cap_states - 1].power;
+	pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
+		str, min_pwr, max_pwr);
+
+	/*
+	 * Keep track of this cluster's energy in the computation of the
+	 * overall system energy
+	 */
+	ste->min_power += min_pwr;
+	ste->max_power += max_pwr;
+
+	/* Get CPU energy using EM data for each CPU in the group */
+	for_each_cpu(cpu, cluster_cpus) {
+		/* Get a SD view for the specific CPU */
+		for_each_domain(cpu, sd2) {
+			/* Get the CPU group */
+			sg2 = sd2->groups;
+			min_pwr = sg2->sge->idle_states[sg2->sge->nr_idle_states - 1].power;
+			max_pwr = sg2->sge->cap_states[sg2->sge->nr_cap_states - 1].power;
+
+			ste->min_power += min_pwr;
+			ste->max_power += max_pwr;
+
+			snprintf(str, sizeof(str), "CPU[%d]", cpu);
+			pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
+				str, min_pwr, max_pwr);
+
+			/*
+			 * Assume we have EM data only at the CPU and
+			 * the upper CLUSTER level. A CPU level domain
+			 * without a parent violates that assumption too:
+			 * check for it explicitly rather than dereferencing
+			 * a NULL parent.
+			 */
+			BUG_ON(!sd2->parent ||
+			       !cpumask_equal(
+					sched_group_cpus(sg),
+					sched_group_cpus(sd2->parent->groups)
+					));
+
+			/* Only the lowest (CPU level) domain is accounted */
+			break;
+		}
+	}
+}
+
+
+/*
+ * Initialize the constants required to compute normalized energy.
+ * The values of these constants depend on the EM data for the specific
+ * target system and topology.
+ * Thus, this function is expected to be called by the code
+ * that binds the EM to the topology information; it is registered here
+ * as a late initcall.
+ *
+ * Returns 0 on success, or -EINVAL when no energy model data is available
+ * or when the collected power range is empty.
+ */
+static int
+schedtune_init_late(void)
+{
+	struct target_nrg *ste = &schedtune_target_nrg;
+	unsigned long delta_pwr = 0;
+	struct sched_domain *sd;
+	struct sched_group *sg;
+
+	pr_info("schedtune: init normalization constants...\n");
+	ste->max_power = 0;
+	ste->min_power = 0;
+
+	rcu_read_lock();
+
+	/*
+	 * When EAS is in use, we always have a pointer to the highest SD
+	 * which provides EM data.
+	 */
+	sd = rcu_dereference(per_cpu(sd_ea, cpumask_first(cpu_online_mask)));
+	if (!sd) {
+		pr_info("schedtune: no energy model data\n");
+		goto nodata;
+	}
+
+	/* Walk the circular list of groups, i.e. one iteration per cluster */
+	sg = sd->groups;
+	do {
+		schedtune_add_cluster_nrg(sd, sg, ste);
+	} while (sg = sg->next, sg != sd->groups);
+
+	rcu_read_unlock();
+
+	pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
+		"SYSTEM", ste->min_power, ste->max_power);
+
+	/*
+	 * reciprocal_value() divides by its argument, and the subtraction
+	 * below is unsigned: reject an empty or inverted power range instead
+	 * of dividing by zero or using a wrapped-around delta.
+	 */
+	if (ste->max_power <= ste->min_power) {
+		pr_err("schedtune: invalid energy model power range\n");
+		return -EINVAL;
+	}
+
+	/* Compute normalization constants */
+	delta_pwr = ste->max_power - ste->min_power;
+	ste->rdiv = reciprocal_value(delta_pwr);
+	pr_info("schedtune: using normalization constants mul: %u sh1: %u sh2: %u\n",
+		ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2);
+
+	schedtune_test_nrg(delta_pwr);
return 0;
+
+nodata:
+	rcu_read_unlock();
+	return -EINVAL;
}
+late_initcall(schedtune_init_late);
diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h
index d756ce7b06e0..f7273a5d994a 100644
--- a/kernel/sched/tune.h
+++ b/kernel/sched/tune.h
@@ -16,9 +16,16 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu);
#endif /* CONFIG_CGROUP_SCHEDTUNE */
+/* Scale an energy variation using the SchedTune normalization constants */
+int schedtune_normalize_energy(int energy);
+/* Payoff of a schedule candidate given its energy and capacity variations */
+int schedtune_accept_deltas(int nrg_delta, int cap_delta,
+ struct task_struct *task);
+
#else /* CONFIG_SCHED_TUNE */
#define schedtune_enqueue_task(task, cpu) do { } while (0)
#define schedtune_dequeue_task(task, cpu) do { } while (0)
+/*
+ * SchedTune disabled: the energy value is passed through unchanged and the
+ * payoff of a candidate is its energy delta. Expansions are parenthesized
+ * so that expression arguments keep their grouping at the call site.
+ */
+#define schedtune_normalize_energy(energy) (energy)
+#define schedtune_accept_deltas(nrg_delta, cap_delta, task) (nrg_delta)
+
#endif /* CONFIG_SCHED_TUNE */