diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/sched/core.c | 2 | ||||
-rw-r--r-- | kernel/sched/fair.c | 126 | ||||
-rw-r--r-- | kernel/sched/features.h | 7 | ||||
-rw-r--r-- | kernel/sysctl.c | 7 |
4 files changed, 138 insertions, 4 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 22acf9e269d..440e2dc8057 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1689,7 +1689,7 @@ static void __sched_fork(struct task_struct *p) p->node_stamp = 0ULL; p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; - p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; + p->numa_migrate_seq = 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_work.next = &p->numa_work; #endif /* CONFIG_NUMA_BALANCING */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9bb1be49eb6..8b7c8a280eb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -800,6 +800,60 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; +static unsigned int task_nr_scan_windows(struct task_struct *p) +{ + unsigned long rss = 0; + unsigned long nr_scan_pages; + + /* + * Calculations based on RSS as non-present and empty pages are skipped + * by the PTE scanner and NUMA hinting faults should be trapped based + * on resident pages + */ + nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT); + rss = get_mm_rss(p->mm); + if (!rss) + rss = nr_scan_pages; + + rss = round_up(rss, nr_scan_pages); + return rss / nr_scan_pages; +} + +/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */ +#define MAX_SCAN_WINDOW 2560 + +static unsigned int task_scan_min(struct task_struct *p) +{ + unsigned int scan, floor; + unsigned int windows = 1; + + if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) + windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; + floor = 1000 / windows; + + scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); + return max_t(unsigned int, floor, scan); +} + +static unsigned int task_scan_max(struct task_struct *p) +{ + unsigned int smin = task_scan_min(p); + unsigned int smax; + + /* Watch for min being lower than max due to floor calculations */ + smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); + return max(smin, smax); +} + +/* + * Once a preferred node is selected the scheduler balancer will prefer moving + * a task to that node for sysctl_numa_balancing_settle_count number of PTE + * scans. This will give the process the chance to accumulate more faults on + * the preferred node but still allow the scheduler to move the task again if + * the nodes CPUs are overloaded. + */ +unsigned int sysctl_numa_balancing_settle_count __read_mostly = 3; + static void task_numa_placement(struct task_struct *p) { int seq; @@ -810,8 +864,30 @@ static void task_numa_placement(struct task_struct *p) if (p->numa_scan_seq == seq) return; p->numa_scan_seq = seq; + p->numa_migrate_seq++; + p->numa_scan_period_max = task_scan_max(p); + + /* Find the node with the highest number of faults */ + for_each_online_node(nid) { + unsigned long faults; + + /* Decay existing window and copy faults since last scan */ + p->numa_faults[nid] >>= 1; + p->numa_faults[nid] += p->numa_faults_buffer[nid]; + p->numa_faults_buffer[nid] = 0; + + faults = p->numa_faults[nid]; + if (faults > max_faults) { + max_faults = faults; + max_nid = nid; + } + } - /* FIXME: Scheduling placement policy hints go here */ + /* Update the tasks preferred node if necessary */ + if (max_faults && max_nid != p->numa_preferred_nid) { + p->numa_preferred_nid = max_nid; + p->numa_migrate_seq = 0; + } } /* @@ -4093,6 +4169,38 @@ static int task_hot(struct task_struct *p, struct lb_env *env) return delta < (s64)sysctl_sched_migration_cost; } +#ifdef CONFIG_NUMA_BALANCING +/* Returns true if the destination node has incurred more faults */ +static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) +{ + int src_nid, dst_nid; + + if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || + !(env->sd->flags & SD_NUMA)) { + return false; + } + + src_nid = cpu_to_node(env->src_cpu); + dst_nid = cpu_to_node(env->dst_cpu); + + if (src_nid == dst_nid || + p->numa_migrate_seq >= sysctl_numa_balancing_settle_count) + return false; + + if (dst_nid == p->numa_preferred_nid || + p->numa_faults[dst_nid] > p->numa_faults[src_nid]) + return true; + + return false; +} +#else +static inline bool migrate_improves_locality(struct task_struct *p, + struct lb_env *env) +{ + return false; +} +#endif + /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ @@ -4150,10 +4258,22 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * Aggressive migration if: - * 1) task is cache cold, or - * 2) too many balance attempts have failed. + * 1) destination numa is preferred + * 2) task is cache cold, or + * 3) too many balance attempts have failed. */ tsk_cache_hot = task_hot(p, env); + + if (migrate_improves_locality(p, env)) { +#ifdef CONFIG_SCHEDSTATS + if (tsk_cache_hot) { + schedstat_inc(env->sd, lb_hot_gained[env->idle]); + schedstat_inc(p, se.statistics.nr_forced_migrations); + } +#endif + return 1; + } + if (!tsk_cache_hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 66d2267e4cc..9e835a4cb11 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -69,4 +69,11 @@ SCHED_FEAT(LB_MIN, false) #ifdef CONFIG_NUMA_BALANCING SCHED_FEAT(NUMA, false) SCHED_FEAT(NUMA_FORCE, false) + +/* + * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a + * higher number of hinting faults are recorded during active load + * balancing. + */ +SCHED_FEAT(NUMA_FAVOUR_HIGHER, true) #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9469f4c61a3..98e9465fc45 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -397,6 +397,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "numa_balancing_settle_count", + .data = &sysctl_numa_balancing_settle_count, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ { |