authorVincent Guittot <vincent.guittot@linaro.org>2022-11-08 16:21:20 +0100
committerVincent Guittot <vincent.guittot@linaro.org>2022-11-09 12:13:35 +0100
commitf864e961417814724e7f273055d58e8a855a0521 (patch)
treea4e3ebf3256e240cb2ea52b621f43ce48ac2d369
parent2f7003400273ea7ac4741bb228b84073e7b1b829 (diff)
sched/fair: Use augmented rbtree instead of 2 rbtrees (sched/latency-nice)
Use an augmented rbtree to store a wakeup_vruntime for latency sensitive tasks, in addition to the normal vruntime, instead of using a dedicated rbtree.

A wakeup_vruntime field is added to sched_entity; it is set at enqueue to take the latency offset into account when needed. A min_vruntime field is also added to sched_entity so that the augmented callback can keep track of the minimum wakeup_vruntime of the rb subtree rooted at each node.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
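For readers unfamiliar with augmented rbtrees, here is a minimal user-space sketch of the idea, not the kernel code: the toy_node struct and the toy_update_min()/toy_pick_min() helpers are made-up illustrative names. It shows the invariant the augmented callback maintains (min_vruntime of a node is the smallest wakeup_vruntime in its subtree) and the top-down search that __pick_first_latency() performs over it.

/*
 * Illustrative sketch only: a plain binary-tree node stands in for
 * rb_node/sched_entity. The kernel patch keeps the same invariant via
 * RB_DECLARE_CALLBACKS(): min_vruntime of a node is the smallest
 * wakeup_vruntime found in the subtree rooted at that node.
 */
#include <stdio.h>

struct toy_node {
	unsigned long long wakeup_vruntime;
	unsigned long long min_vruntime;	/* augmented field */
	struct toy_node *left, *right;
};

/* Recompute the augmented field from the node and its children. */
static void toy_update_min(struct toy_node *n)
{
	unsigned long long min = n->wakeup_vruntime;

	if (n->left && n->left->min_vruntime < min)
		min = n->left->min_vruntime;
	if (n->right && n->right->min_vruntime < min)
		min = n->right->min_vruntime;
	n->min_vruntime = min;
}

/*
 * Walk down toward the smallest wakeup_vruntime: the child whose
 * min_vruntime equals the current subtree minimum contains the wanted
 * node, otherwise the node itself is the answer.
 */
static struct toy_node *toy_pick_min(struct toy_node *n)
{
	while (n) {
		if (n->left && n->left->min_vruntime == n->min_vruntime) {
			n = n->left;
			continue;
		}
		if (n->wakeup_vruntime == n->min_vruntime)
			return n;
		n = n->right;
	}
	return NULL;
}

int main(void)
{
	struct toy_node a = { .wakeup_vruntime = 30 };
	struct toy_node b = { .wakeup_vruntime = 10 };
	struct toy_node root = { .wakeup_vruntime = 20, .left = &a, .right = &b };

	toy_update_min(&a);
	toy_update_min(&b);
	toy_update_min(&root);	/* bottom-up, as the insertion path would */

	/* Finds b, the node with the earliest wakeup_vruntime (10). */
	printf("min wakeup_vruntime = %llu\n", toy_pick_min(&root)->wakeup_vruntime);
	return 0;
}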
-rw-r--r--	include/linux/sched.h	  3
-rw-r--r--	kernel/sched/core.c	  1
-rw-r--r--	kernel/sched/fair.c	168
-rw-r--r--	kernel/sched/sched.h	  1
4 files changed, 118 insertions, 55 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 45b8c36e64cc..a7e32c33c88c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -547,7 +547,6 @@ struct sched_entity {
/* For load-balancing: */
struct load_weight load;
struct rb_node run_node;
- struct rb_node latency_node;
struct list_head group_node;
unsigned int on_rq;
@@ -555,6 +554,8 @@ struct sched_entity {
u64 sum_exec_runtime;
u64 vruntime;
u64 prev_sum_exec_runtime;
+ u64 wakeup_vruntime;
+ u64 min_vruntime; /* rb augmented field */
u64 nr_migrations;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c6c67677d71f..3f42b1f61a7e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4342,7 +4342,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.nr_migrations = 0;
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);
- RB_CLEAR_NODE(&p->se.latency_node);
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c2c75d531612..0bd8c7b94149 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -50,6 +50,7 @@
#include <asm/switch_to.h>
#include <linux/sched/cond_resched.h>
+#include <linux/rbtree_augmented.h>
#include "sched.h"
#include "stats.h"
@@ -616,22 +617,74 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
max_vruntime(cfs_rq->min_vruntime, vruntime));
}
-static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
+static inline bool wakeup_preempt_vruntime(struct sched_entity *se, bool exit)
{
- return entity_before(__node_2_se(a), __node_2_se(b));
+ struct sched_entity *child;
+ u64 min = se->wakeup_vruntime;
+ if (se->run_node.rb_left) {
+ child = rb_entry(se->run_node.rb_left, struct sched_entity, run_node);
+ if (child->min_vruntime < min)
+ min = child->min_vruntime;
+ }
+ if (se->run_node.rb_right) {
+ child = rb_entry(se->run_node.rb_right, struct sched_entity, run_node);
+ if (child->min_vruntime < min)
+ min = child->min_vruntime;
+ }
+ if (exit && se->min_vruntime == min)
+ return true;
+ se->min_vruntime = min;
+ return false;
}
+RB_DECLARE_CALLBACKS(static, wakeup_preempt_augmented,
+ struct sched_entity, run_node, min_vruntime, wakeup_preempt_vruntime)
+
/*
* Enqueue an entity into the rb-tree:
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
+ struct rb_root_cached *tree = &cfs_rq->tasks_timeline;
+ struct rb_node **link = &tree->rb_root.rb_node;
+ struct rb_node *node = &se->run_node;
+ struct rb_node *rb_parent = NULL;
+ struct sched_entity *parent;
+ bool leftmost = true;
+
+ /*
+ * Because of the augmented callback, we can't directly use
+ * rb_add_cached().
+ */
+ while (*link) {
+ rb_parent = *link;
+ parent = rb_entry(rb_parent, struct sched_entity, run_node);
+
+ /*
+ * Update the parent's augmented field to keep track of the minimum
+ * wakeup vruntime of the subtree.
+ */
+ if (se->wakeup_vruntime < parent->min_vruntime)
+ parent->min_vruntime = se->wakeup_vruntime;
+
+ if (entity_before(se, parent)) {
+ link = &rb_parent->rb_left;
+ } else {
+ link = &rb_parent->rb_right;
+ leftmost = false;
+ }
+ }
+
+ se->min_vruntime = se->wakeup_vruntime;
+ rb_link_node(node, rb_parent, link);
+ rb_insert_augmented_cached(node, tree, leftmost,
+ &wakeup_preempt_augmented);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
+ rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
+ &wakeup_preempt_augmented);
}
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
@@ -666,44 +719,23 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
}
#endif
-/**************************************************************
- * Scheduling class tree data structure manipulation methods:
- * for latency
- */
-
-static inline bool latency_before(struct sched_entity *a,
- struct sched_entity *b)
-{
- return (s64)(a->vruntime + a->latency_offset - b->vruntime - b->latency_offset) < 0;
-}
-
-#define __latency_node_2_se(node) \
- rb_entry((node), struct sched_entity, latency_node)
-
-static inline bool __latency_less(struct rb_node *a, const struct rb_node *b)
-{
- return latency_before(__latency_node_2_se(a), __latency_node_2_se(b));
-}
-
/*
- * Enqueue an entity into the latency rb-tree:
+ * Compute the wakeup vruntime that can be used to preempt current.
*/
-static void __enqueue_latency(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+static void __enqueue_latency(struct sched_entity *se, int flags)
{
+ se->wakeup_vruntime = se->vruntime;
- /* Only latency sensitive entity can be added to the list */
+ /* Only a latency sensitive entity gets a special preemption vruntime */
if (se->latency_offset >= 0)
return;
- if (!RB_EMPTY_NODE(&se->latency_node))
- return;
-
/*
* An execution time less than sysctl_sched_min_granularity means that
* the entity has been preempted by a higher sched class or an entity
* with higher latency constraint.
- * Put it back in the list so it gets a chance to run 1st during the
- * next slice.
+ * Keep the preemption property for such a task so it gets a chance to
+ * run first during the next slice.
*/
if (!(flags & ENQUEUE_WAKEUP)) {
u64 delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
@@ -712,25 +744,61 @@ static void __enqueue_latency(struct cfs_rq *cfs_rq, struct sched_entity *se, in
return;
}
- rb_add_cached(&se->latency_node, &cfs_rq->latency_timeline, __latency_less);
+ /* A waking task gets its wakeup vruntime shifted by the latency offset */
+ se->wakeup_vruntime += se->latency_offset;
}
-static void __dequeue_latency(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- if (!RB_EMPTY_NODE(&se->latency_node)) {
- rb_erase_cached(&se->latency_node, &cfs_rq->latency_timeline);
- RB_CLEAR_NODE(&se->latency_node);
- }
-}
+/*
+ * Search for an entity with a wakeup vruntime earlier than the given vruntime.
+ */
-static struct sched_entity *__pick_first_latency(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_first_latency(struct cfs_rq *cfs_rq, u64 vruntime)
{
- struct rb_node *left = rb_first_cached(&cfs_rq->latency_timeline);
+ struct rb_root_cached *tree = &cfs_rq->tasks_timeline;
+ struct rb_node *node = tree->rb_root.rb_node;
+ struct sched_entity *se;
- if (!left)
+ if (!node)
return NULL;
- return __latency_node_2_se(left);
+ se = rb_entry(node, struct sched_entity, run_node);
+
+ /* There is no pending latency sensitive task */
+ if (se->min_vruntime >= vruntime)
+ return NULL;
+
+ /* There is at least one wakeup vruntime before the target vruntime */
+ while (true) {
+ if (se->run_node.rb_left) {
+ struct sched_entity *left = rb_entry(se->run_node.rb_left,
+ struct sched_entity, run_node);
+
+ if (left->min_vruntime <= se->min_vruntime) {
+ /* left has a lower wakeup_vruntime */
+ se = left;
+ continue;
+ }
+ }
+
+ /* This node has the lowest wakeup_vruntime */
+ if (se->wakeup_vruntime <= se->min_vruntime)
+ return se;
+
+ if (se->run_node.rb_right) {
+ struct sched_entity *right = rb_entry(se->run_node.rb_right,
+ struct sched_entity, run_node);
+
+ if (right->min_vruntime <= se->min_vruntime) {
+ /* right has a lower wakeup_vruntime */
+ se = right;
+ continue;
+ }
+ }
+
+ /* Should never happen */
+ SCHED_WARN_ON(true);
+ return NULL;
+ }
}
#ifdef CONFIG_SCHED_DEBUG
@@ -4509,8 +4577,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_stats_enqueue_fair(cfs_rq, se, flags);
check_spread(cfs_rq, se);
if (!curr) {
+ __enqueue_latency(se, flags);
__enqueue_entity(cfs_rq, se);
- __enqueue_latency(cfs_rq, se, flags);
}
se->on_rq = 1;
@@ -4597,10 +4665,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
clear_buddies(cfs_rq, se);
- if (se != cfs_rq->curr) {
+ if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
- __dequeue_latency(cfs_rq, se);
- }
+
se->on_rq = 0;
account_entity_dequeue(cfs_rq, se);
@@ -4689,7 +4756,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
update_stats_wait_end_fair(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
- __dequeue_latency(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
}
@@ -4771,7 +4837,7 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
}
/* Check for latency sensitive entity waiting for running */
- latency = __pick_first_latency(cfs_rq);
+ latency = __pick_first_latency(cfs_rq, left->wakeup_vruntime);
if (latency && (latency != se) &&
wakeup_preempt_entity(latency, se) < 1)
se = latency;
@@ -4798,8 +4864,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
if (prev->on_rq) {
update_stats_wait_start_fair(cfs_rq, prev);
/* Put 'current' back into the tree. */
+ __enqueue_latency(prev, 0);
__enqueue_entity(cfs_rq, prev);
- __enqueue_latency(cfs_rq, prev, 0);
/* in !on_rq case, update occurred at dequeue */
update_load_avg(cfs_rq, prev, 0);
}
@@ -11763,7 +11829,6 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
- cfs_rq->latency_timeline = RB_ROOT_CACHED;
u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20)));
#ifdef CONFIG_SMP
raw_spin_lock_init(&cfs_rq->removed.lock);
@@ -12077,7 +12142,6 @@ int sched_group_set_latency(struct task_group *tg, s64 latency)
rq_lock_irqsave(rq, &rf);
- __dequeue_latency(se->cfs_rq, se);
WRITE_ONCE(se->latency_offset, latency);
rq_unlock_irqrestore(rq, &rf);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 91ec36c1158b..95d4be4f3af6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -599,7 +599,6 @@ struct cfs_rq {
#endif
struct rb_root_cached tasks_timeline;
- struct rb_root_cached latency_timeline;
/*
* 'curr' points to currently running entity on this cfs_rq.