authorVincent Guittot <vincent.guittot@linaro.org>2022-11-08 16:21:20 +0100
committerVincent Guittot <vincent.guittot@linaro.org>2022-11-09 12:13:35 +0100
commitf864e961417814724e7f273055d58e8a855a0521 (patch)
treea4e3ebf3256e240cb2ea52b621f43ce48ac2d369
parent2f7003400273ea7ac4741bb228b84073e7b1b829 (diff)
sched/fair: Use augmented rbtree instead of 2 rbtrees (sched/latency-nice)
Use an augmented rbtree to store a wakeup_vruntime for latency sensitive tasks, in addition to the normal vruntime, instead of using a dedicated rbtree.

A wakeup_vruntime field is added to sched_entity; it is set at enqueue to take the latency offset into account when needed. A min_vruntime field is also added to sched_entity so that the augmented callback can keep track of the minimum wakeup_vruntime of the rb subtree rooted at each node.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
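For readers unfamiliar with augmented rbtrees, here is a minimal user-space sketch of the idea, not the kernel code: the toy_node struct and the toy_update_min()/toy_pick_min() helpers are made-up illustrative names. It shows the invariant the augmented callback maintains (min_vruntime of a node is the smallest wakeup_vruntime in its subtree) and the top-down search that __pick_first_latency() performs over it.

/*
 * Illustrative sketch only: a plain binary-tree node stands in for
 * rb_node/sched_entity. The kernel patch keeps the same invariant via
 * RB_DECLARE_CALLBACKS(): min_vruntime of a node is the smallest
 * wakeup_vruntime found in the subtree rooted at that node.
 */
#include <stdio.h>

struct toy_node {
	unsigned long long wakeup_vruntime;
	unsigned long long min_vruntime;	/* augmented field */
	struct toy_node *left, *right;
};

/* Recompute the augmented field from the node and its children. */
static void toy_update_min(struct toy_node *n)
{
	unsigned long long min = n->wakeup_vruntime;

	if (n->left && n->left->min_vruntime < min)
		min = n->left->min_vruntime;
	if (n->right && n->right->min_vruntime < min)
		min = n->right->min_vruntime;
	n->min_vruntime = min;
}

/*
 * Walk down toward the smallest wakeup_vruntime: the child whose
 * min_vruntime equals the current subtree minimum contains the wanted
 * node, otherwise the node itself is the answer.
 */
static struct toy_node *toy_pick_min(struct toy_node *n)
{
	while (n) {
		if (n->left && n->left->min_vruntime == n->min_vruntime) {
			n = n->left;
			continue;
		}
		if (n->wakeup_vruntime == n->min_vruntime)
			return n;
		n = n->right;
	}
	return NULL;
}

int main(void)
{
	struct toy_node a = { .wakeup_vruntime = 30 };
	struct toy_node b = { .wakeup_vruntime = 10 };
	struct toy_node root = { .wakeup_vruntime = 20, .left = &a, .right = &b };

	toy_update_min(&a);
	toy_update_min(&b);
	toy_update_min(&root);	/* bottom-up, as the insertion path would */

	/* Finds b, the node with the earliest wakeup_vruntime (10). */
	printf("min wakeup_vruntime = %llu\n", toy_pick_min(&root)->wakeup_vruntime);
	return 0;
}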
-rw-r--r--	include/linux/sched.h	  3
-rw-r--r--	kernel/sched/core.c	  1
-rw-r--r--	kernel/sched/fair.c	168
-rw-r--r--	kernel/sched/sched.h	  1
4 files changed, 118 insertions, 55 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 45b8c36e64cc..a7e32c33c88c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -547,7 +547,6 @@ struct sched_entity {
/* For load-balancing: */
struct load_weight load;
struct rb_node run_node;
- struct rb_node latency_node;
struct list_head group_node;
unsigned int on_rq;
@@ -555,6 +554,8 @@ struct sched_entity {
u64 sum_exec_runtime;
u64 vruntime;
u64 prev_sum_exec_runtime;
+ u64 wakeup_vruntime;
+ u64 min_vruntime; /* rb augmented field */
u64 nr_migrations;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c6c67677d71f..3f42b1f61a7e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4342,7 +4342,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.nr_migrations = 0;
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);
- RB_CLEAR_NODE(&p->se.latency_node);
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c2c75d531612..0bd8c7b94149 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -50,6 +50,7 @@
#include <asm/switch_to.h>
#include <linux/sched/cond_resched.h>
+#include <linux/rbtree_augmented.h>
#include "sched.h"
#include "stats.h"
@@ -616,22 +617,74 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
max_vruntime(cfs_rq->min_vruntime, vruntime));
}
-static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
+static inline bool wakeup_preempt_vruntime(struct sched_entity *se, bool exit)
{
- return entity_before(__node_2_se(a), __node_2_se(b));
+ struct sched_entity *child;
+ u64 min = se->wakeup_vruntime;
+ if (se->run_node.rb_left) {
+ child = rb_entry(se->run_node.rb_left, struct sched_entity, run_node);
+ if (child->min_vruntime < min)
+ min = child->min_vruntime;
+ }
+ if (se->run_node.rb_right) {
+ child = rb_entry(se->run_node.rb_right, struct sched_entity, run_node);
+ if (child->min_vruntime < min)
+ min = child->min_vruntime;
+ }
+ if (exit && se->min_vruntime == min)
+ return true;
+ se->min_vruntime = min;
+ return false;
}
+RB_DECLARE_CALLBACKS(static, wakeup_preempt_augmented,
+ struct sched_entity, run_node, min_vruntime, wakeup_preempt_vruntime)
+
/*
* Enqueue an entity into the rb-tree:
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
+ struct rb_root_cached *tree = &cfs_rq->tasks_timeline;
+ struct rb_node **link = &tree->rb_root.rb_node;
+ struct rb_node *node = &se->run_node;
+ struct rb_node *rb_parent = NULL;
+ struct sched_entity *parent;
+ bool leftmost = true;
+
+ /*
+ * Because of the augmented callback, we can't directly use
+ * rb_add_cached().
+ */
+ while (*link) {
+ rb_parent = *link;
+ parent = rb_entry(rb_parent, struct sched_entity, run_node);
+
+ /*
+ * Update the parent's augmented field to keep track of the minimum
+ * wakeup vruntime of the subtree.
+ */
+ if (se->wakeup_vruntime < parent->min_vruntime)
+ parent->min_vruntime = se->wakeup_vruntime;
+
+ if (entity_before(se, parent)) {
+ link = &rb_parent->rb_left;
+ } else {
+ link = &rb_parent->rb_right;
+ leftmost = false;
+ }
+ }
+
+ se->min_vruntime = se->wakeup_vruntime;
+ rb_link_node(node, rb_parent, link);
+ rb_insert_augmented_cached(node, tree, leftmost,
+ &wakeup_preempt_augmented);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
+ rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
+ &wakeup_preempt_augmented);
}
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
@@ -666,44 +719,23 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
}
#endif
-/**************************************************************
- * Scheduling class tree data structure manipulation methods:
- * for latency
- */
-
-static inline bool latency_before(struct sched_entity *a,
- struct sched_entity *b)
-{
- return (s64)(a->vruntime + a->latency_offset - b->vruntime - b->latency_offset) < 0;
-}
-
-#define __latency_node_2_se(node) \
- rb_entry((node), struct sched_entity, latency_node)
-
-static inline bool __latency_less(struct rb_node *a, const struct rb_node *b)
-{
- return latency_before(__latency_node_2_se(a), __latency_node_2_se(b));
-}
-
/*
- * Enqueue an entity into the latency rb-tree:
+ * Compute the wakeup vruntime that can be used to preempt current.
*/
-static void __enqueue_latency(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+static void __enqueue_latency(struct sched_entity *se, int flags)
{
+ se->wakeup_vruntime = se->vruntime;
- /* Only latency sensitive entity can be added to the list */
+ /* Only a latency sensitive entity gets a special preemption vruntime */
if (se->latency_offset >= 0)
return;
- if (!RB_EMPTY_NODE(&se->latency_node))
- return;
-
/*
* An execution time less than sysctl_sched_min_granularity means that
* the entity has been preempted by a higher sched class or an entity
* with higher latency constraint.
- * Put it back in the list so it gets a chance to run 1st during the
- * next slice.
+ * Keep the preemption property for such a task so it gets a chance to
+ * run first during the next slice.
*/
if (!(flags & ENQUEUE_WAKEUP)) {
u64 delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
@@ -712,25 +744,61 @@ static void __enqueue_latency(struct cfs_rq *cfs_rq, struct sched_entity *se, in
return;
}
- rb_add_cached(&se->latency_node, &cfs_rq->latency_timeline, __latency_less);
+ /* A waking task gets its wakeup vruntime shifted by the latency offset */
+ se->wakeup_vruntime += se->latency_offset;
}
-static void __dequeue_latency(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- if (!RB_EMPTY_NODE(&se->latency_node)) {
- rb_erase_cached(&se->latency_node, &cfs_rq->latency_timeline);
- RB_CLEAR_NODE(&se->latency_node);
- }
-}
+/*
+ * Search for an entity with a wakeup vruntime earlier than the given vruntime.
+ */
-static struct sched_entity *__pick_first_latency(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_first_latency(struct cfs_rq *cfs_rq, u64 vruntime)
{
- struct rb_node *left = rb_first_cached(&cfs_rq->latency_timeline);
+ struct rb_root_cached *tree = &cfs_rq->tasks_timeline;
+ struct rb_node *node = tree->rb_root.rb_node;
+ struct sched_entity *se;
- if (!left)
+ if (!node)
return NULL;
- return __latency_node_2_se(left);
+ se = rb_entry(node, struct sched_entity, run_node);
+
+ /* There is no pending latency sensitive task */
+ if (se->min_vruntime >= vruntime)
+ return NULL;
+
+ /* There is at least one wakeup vruntime before the target vruntime */
+ while (true) {
+ if (se->run_node.rb_left) {
+ struct sched_entity *left = rb_entry(se->run_node.rb_left,
+ struct sched_entity, run_node);
+
+ if (left->min_vruntime <= se->min_vruntime) {
+ /* left has a lower wakeup_vruntime */
+ se = left;
+ continue;
+ }
+ }
+
+ /* This node has the lowest wakeup_vruntime */
+ if (se->wakeup_vruntime <= se->min_vruntime)
+ return se;
+
+ if (se->run_node.rb_right) {
+ struct sched_entity *right = rb_entry(se->run_node.rb_right,
+ struct sched_entity, run_node);
+
+ if (right->min_vruntime <= se->min_vruntime) {
+ /* right has a lower wakeup_vruntime */
+ se = right;
+ continue;
+ }
+ }
+
+ /* Should never happen */
+ SCHED_WARN_ON(true);
+ return NULL;
+ }
}
#ifdef CONFIG_SCHED_DEBUG
@@ -4509,8 +4577,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_stats_enqueue_fair(cfs_rq, se, flags);
check_spread(cfs_rq, se);
if (!curr) {
+ __enqueue_latency(se, flags);
__enqueue_entity(cfs_rq, se);
- __enqueue_latency(cfs_rq, se, flags);
}
se->on_rq = 1;
@@ -4597,10 +4665,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
clear_buddies(cfs_rq, se);
- if (se != cfs_rq->curr) {
+ if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
- __dequeue_latency(cfs_rq, se);
- }
+
se->on_rq = 0;
account_entity_dequeue(cfs_rq, se);
@@ -4689,7 +4756,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
update_stats_wait_end_fair(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
- __dequeue_latency(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
}
@@ -4771,7 +4837,7 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
}
/* Check for latency sensitive entity waiting for running */
- latency = __pick_first_latency(cfs_rq);
+ latency = __pick_first_latency(cfs_rq, left->wakeup_vruntime);
if (latency && (latency != se) &&
wakeup_preempt_entity(latency, se) < 1)
se = latency;
@@ -4798,8 +4864,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
if (prev->on_rq) {
update_stats_wait_start_fair(cfs_rq, prev);
/* Put 'current' back into the tree. */
+ __enqueue_latency(prev, 0);
__enqueue_entity(cfs_rq, prev);
- __enqueue_latency(cfs_rq, prev, 0);
/* in !on_rq case, update occurred at dequeue */
update_load_avg(cfs_rq, prev, 0);
}
@@ -11763,7 +11829,6 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
- cfs_rq->latency_timeline = RB_ROOT_CACHED;
u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20)));
#ifdef CONFIG_SMP
raw_spin_lock_init(&cfs_rq->removed.lock);
@@ -12077,7 +12142,6 @@ int sched_group_set_latency(struct task_group *tg, s64 latency)
rq_lock_irqsave(rq, &rf);
- __dequeue_latency(se->cfs_rq, se);
WRITE_ONCE(se->latency_offset, latency);
rq_unlock_irqrestore(rq, &rf);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 91ec36c1158b..95d4be4f3af6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -599,7 +599,6 @@ struct cfs_rq {
#endif
struct rb_root_cached tasks_timeline;
- struct rb_root_cached latency_timeline;
/*
* 'curr' points to currently running entity on this cfs_rq.