diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 3 | ||||
-rw-r--r-- | kernel/cpu.c | 10 | ||||
-rw-r--r-- | kernel/kthread.c | 273 | ||||
-rw-r--r-- | kernel/rcutree.c | 12 | ||||
-rw-r--r-- | kernel/rcutree.h | 15 | ||||
-rw-r--r-- | kernel/rcutree_plugin.h | 403 | ||||
-rw-r--r-- | kernel/rcutree_trace.c | 3 | ||||
-rw-r--r-- | kernel/sched/core.c | 5 | ||||
-rw-r--r-- | kernel/sched/debug.c | 39 | ||||
-rw-r--r-- | kernel/sched/fair.c | 1099 | ||||
-rw-r--r-- | kernel/sched/sched.h | 60 | ||||
-rw-r--r-- | kernel/smpboot.c | 233 | ||||
-rw-r--r-- | kernel/smpboot.h | 4 | ||||
-rw-r--r-- | kernel/softirq.c | 108 | ||||
-rw-r--r-- | kernel/watchdog.c | 263 | ||||
-rw-r--r-- | kernel/workqueue.c | 1144 |
16 files changed, 2199 insertions, 1475 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index c0cc67ad764..e5602d32acb 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o cred.o \ - async.o range.o groups.o lglock.o + async.o range.o groups.o lglock.o smpboot.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files @@ -46,7 +46,6 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += smp.o -obj-$(CONFIG_SMP) += smpboot.o ifneq ($(CONFIG_SMP),y) obj-y += up.o endif diff --git a/kernel/cpu.c b/kernel/cpu.c index a4eb5227a19..8ab33aeea0e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -280,12 +280,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) __func__, cpu); goto out_release; } + smpboot_park_threads(cpu); err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ + smpboot_unpark_threads(cpu); cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); - goto out_release; } BUG_ON(cpu_online(cpu)); @@ -354,6 +355,10 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) goto out; } + ret = smpboot_create_threads(cpu); + if (ret) + goto out; + ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); if (ret) { nr_calls--; @@ -368,6 +373,9 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) goto out_notify; BUG_ON(!cpu_online(cpu)); + /* Wake the per cpu threads */ + smpboot_unpark_threads(cpu); + /* Now call notifier in preparation. */ cpu_notify(CPU_ONLINE | mod, hcpu); diff --git a/kernel/kthread.c b/kernel/kthread.c index 3d3de633702..146a6fa9682 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -37,11 +37,20 @@ struct kthread_create_info }; struct kthread { - int should_stop; + unsigned long flags; + unsigned int cpu; void *data; + struct completion parked; struct completion exited; }; +enum KTHREAD_BITS { + KTHREAD_IS_PER_CPU = 0, + KTHREAD_SHOULD_STOP, + KTHREAD_SHOULD_PARK, + KTHREAD_IS_PARKED, +}; + #define to_kthread(tsk) \ container_of((tsk)->vfork_done, struct kthread, exited) @@ -52,13 +61,29 @@ struct kthread { * and this will return true. You should then return, and your return * value will be passed through to kthread_stop(). */ -int kthread_should_stop(void) +bool kthread_should_stop(void) { - return to_kthread(current)->should_stop; + return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags); } EXPORT_SYMBOL(kthread_should_stop); /** + * kthread_should_park - should this kthread park now? + * + * When someone calls kthread_park() on your kthread, it will be woken + * and this will return true. You should then do the necessary + * cleanup and call kthread_parkme() + * + * Similar to kthread_should_stop(), but this keeps the thread alive + * and in a park position. kthread_unpark() "restarts" the thread and + * calls the thread function again. + */ +bool kthread_should_park(void) +{ + return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags); +} + +/** * kthread_freezable_should_stop - should this freezable kthread return now? * @was_frozen: optional out parameter, indicates whether %current was frozen * @@ -96,6 +121,24 @@ void *kthread_data(struct task_struct *task) return to_kthread(task)->data; } +static void __kthread_parkme(struct kthread *self) +{ + __set_current_state(TASK_INTERRUPTIBLE); + while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) { + if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags)) + complete(&self->parked); + schedule(); + __set_current_state(TASK_INTERRUPTIBLE); + } + clear_bit(KTHREAD_IS_PARKED, &self->flags); + __set_current_state(TASK_RUNNING); +} + +void kthread_parkme(void) +{ + __kthread_parkme(to_kthread(current)); +} + static int kthread(void *_create) { /* Copy data: it's on kthread's stack */ @@ -105,9 +148,10 @@ static int kthread(void *_create) struct kthread self; int ret; - self.should_stop = 0; + self.flags = 0; self.data = data; init_completion(&self.exited); + init_completion(&self.parked); current->vfork_done = &self.exited; /* OK, tell user we're spawned, wait for stop or wakeup */ @@ -117,9 +161,11 @@ static int kthread(void *_create) schedule(); ret = -EINTR; - if (!self.should_stop) - ret = threadfn(data); + if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) { + __kthread_parkme(&self); + ret = threadfn(data); + } /* we can't just return, we must preserve "self" on stack */ do_exit(ret); } @@ -172,8 +218,7 @@ static void create_kthread(struct kthread_create_info *create) * Returns a task_struct or ERR_PTR(-ENOMEM). */ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), - void *data, - int node, + void *data, int node, const char namefmt[], ...) { @@ -210,6 +255,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), } EXPORT_SYMBOL(kthread_create_on_node); +static void __kthread_bind(struct task_struct *p, unsigned int cpu) +{ + /* It's safe because the task is inactive. */ + do_set_cpus_allowed(p, cpumask_of(cpu)); + p->flags |= PF_THREAD_BOUND; +} + /** * kthread_bind - bind a just-created kthread to a cpu. * @p: thread created by kthread_create(). @@ -226,14 +278,112 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) WARN_ON(1); return; } - - /* It's safe because the task is inactive. */ - do_set_cpus_allowed(p, cpumask_of(cpu)); - p->flags |= PF_THREAD_BOUND; + __kthread_bind(p, cpu); } EXPORT_SYMBOL(kthread_bind); /** + * kthread_create_on_cpu - Create a cpu bound kthread + * @threadfn: the function to run until signal_pending(current). + * @data: data ptr for @threadfn. + * @cpu: The cpu on which the thread should be bound, + * @namefmt: printf-style name for the thread. Format is restricted + * to "name.*%u". Code fills in cpu number. + * + * Description: This helper function creates and names a kernel thread + * The thread will be woken and put into park mode. + */ +struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), + void *data, unsigned int cpu, + const char *namefmt) +{ + struct task_struct *p; + + p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, + cpu); + if (IS_ERR(p)) + return p; + set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); + to_kthread(p)->cpu = cpu; + /* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */ + kthread_park(p); + return p; +} + +static struct kthread *task_get_live_kthread(struct task_struct *k) +{ + struct kthread *kthread; + + get_task_struct(k); + kthread = to_kthread(k); + /* It might have exited */ + barrier(); + if (k->vfork_done != NULL) + return kthread; + return NULL; +} + +/** + * kthread_unpark - unpark a thread created by kthread_create(). + * @k: thread created by kthread_create(). + * + * Sets kthread_should_park() for @k to return false, wakes it, and + * waits for it to return. If the thread is marked percpu then its + * bound to the cpu again. + */ +void kthread_unpark(struct task_struct *k) +{ + struct kthread *kthread = task_get_live_kthread(k); + + if (kthread) { + clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + /* + * We clear the IS_PARKED bit here as we don't wait + * until the task has left the park code. So if we'd + * park before that happens we'd see the IS_PARKED bit + * which might be about to be cleared. + */ + if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { + if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) + __kthread_bind(k, kthread->cpu); + wake_up_process(k); + } + } + put_task_struct(k); +} + +/** + * kthread_park - park a thread created by kthread_create(). + * @k: thread created by kthread_create(). + * + * Sets kthread_should_park() for @k to return true, wakes it, and + * waits for it to return. This can also be called after kthread_create() + * instead of calling wake_up_process(): the thread will park without + * calling threadfn(). + * + * Returns 0 if the thread is parked, -ENOSYS if the thread exited. + * If called by the kthread itself just the park bit is set. + */ +int kthread_park(struct task_struct *k) +{ + struct kthread *kthread = task_get_live_kthread(k); + int ret = -ENOSYS; + + if (kthread) { + if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) { + set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + if (k != current) { + wake_up_process(k); + wait_for_completion(&kthread->parked); + } + } + ret = 0; + } + put_task_struct(k); + return ret; +} + +/** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). * @@ -250,16 +400,13 @@ EXPORT_SYMBOL(kthread_bind); */ int kthread_stop(struct task_struct *k) { - struct kthread *kthread; + struct kthread *kthread = task_get_live_kthread(k); int ret; trace_sched_kthread_stop(k); - get_task_struct(k); - - kthread = to_kthread(k); - barrier(); /* it might have exited */ - if (k->vfork_done != NULL) { - kthread->should_stop = 1; + if (kthread) { + set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); + clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); wake_up_process(k); wait_for_completion(&kthread->exited); } @@ -360,16 +507,12 @@ repeat: struct kthread_work, node); list_del_init(&work->node); } + worker->current_work = work; spin_unlock_irq(&worker->lock); if (work) { __set_current_state(TASK_RUNNING); work->func(work); - smp_wmb(); /* wmb worker-b0 paired with flush-b1 */ - work->done_seq = work->queue_seq; - smp_mb(); /* mb worker-b1 paired with flush-b0 */ - if (atomic_read(&work->flushing)) - wake_up_all(&work->done); } else if (!freezing(current)) schedule(); @@ -378,6 +521,19 @@ repeat: } EXPORT_SYMBOL_GPL(kthread_worker_fn); +/* insert @work before @pos in @worker */ +static void insert_kthread_work(struct kthread_worker *worker, + struct kthread_work *work, + struct list_head *pos) +{ + lockdep_assert_held(&worker->lock); + + list_add_tail(&work->node, pos); + work->worker = worker; + if (likely(worker->task)) + wake_up_process(worker->task); +} + /** * queue_kthread_work - queue a kthread_work * @worker: target kthread_worker @@ -395,10 +551,7 @@ bool queue_kthread_work(struct kthread_worker *worker, spin_lock_irqsave(&worker->lock, flags); if (list_empty(&work->node)) { - list_add_tail(&work->node, &worker->work_list); - work->queue_seq++; - if (likely(worker->task)) - wake_up_process(worker->task); + insert_kthread_work(worker, work, &worker->work_list); ret = true; } spin_unlock_irqrestore(&worker->lock, flags); @@ -406,6 +559,18 @@ bool queue_kthread_work(struct kthread_worker *worker, } EXPORT_SYMBOL_GPL(queue_kthread_work); +struct kthread_flush_work { + struct kthread_work work; + struct completion done; +}; + +static void kthread_flush_work_fn(struct kthread_work *work) +{ + struct kthread_flush_work *fwork = + container_of(work, struct kthread_flush_work, work); + complete(&fwork->done); +} + /** * flush_kthread_work - flush a kthread_work * @work: work to flush @@ -414,39 +579,37 @@ EXPORT_SYMBOL_GPL(queue_kthread_work); */ void flush_kthread_work(struct kthread_work *work) { - int seq = work->queue_seq; - - atomic_inc(&work->flushing); + struct kthread_flush_work fwork = { + KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), + COMPLETION_INITIALIZER_ONSTACK(fwork.done), + }; + struct kthread_worker *worker; + bool noop = false; - /* - * mb flush-b0 paired with worker-b1, to make sure either - * worker sees the above increment or we see done_seq update. - */ - smp_mb__after_atomic_inc(); +retry: + worker = work->worker; + if (!worker) + return; - /* A - B <= 0 tests whether B is in front of A regardless of overflow */ - wait_event(work->done, seq - work->done_seq <= 0); - atomic_dec(&work->flushing); + spin_lock_irq(&worker->lock); + if (work->worker != worker) { + spin_unlock_irq(&worker->lock); + goto retry; + } - /* - * rmb flush-b1 paired with worker-b0, to make sure our caller - * sees every change made by work->func(). - */ - smp_mb__after_atomic_dec(); -} -EXPORT_SYMBOL_GPL(flush_kthread_work); + if (!list_empty(&work->node)) + insert_kthread_work(worker, &fwork.work, work->node.next); + else if (worker->current_work == work) + insert_kthread_work(worker, &fwork.work, worker->work_list.next); + else + noop = true; -struct kthread_flush_work { - struct kthread_work work; - struct completion done; -}; + spin_unlock_irq(&worker->lock); -static void kthread_flush_work_fn(struct kthread_work *work) -{ - struct kthread_flush_work *fwork = - container_of(work, struct kthread_flush_work, work); - complete(&fwork->done); + if (!noop) + wait_for_completion(&fwork.done); } +EXPORT_SYMBOL_GPL(flush_kthread_work); /** * flush_kthread_worker - flush all current works on a kthread_worker diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4b97bba7396..a8ba38369ef 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -125,13 +125,12 @@ static int rcu_scheduler_fully_active __read_mostly; */ static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu); DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); DEFINE_PER_CPU(char, rcu_cpu_has_work); #endif /* #ifdef CONFIG_RCU_BOOST */ -static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); @@ -1461,8 +1460,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ /* Adjust any no-longer-needed kthreads. */ - rcu_stop_cpu_kthread(cpu); - rcu_node_kthread_setaffinity(rnp, -1); + rcu_boost_kthread_setaffinity(rnp, -1); /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ @@ -2516,12 +2514,10 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, break; case CPU_ONLINE: case CPU_DOWN_FAILED: - rcu_node_kthread_setaffinity(rnp, -1); - rcu_cpu_kthread_setrt(cpu, 1); + rcu_boost_kthread_setaffinity(rnp, -1); break; case CPU_DOWN_PREPARE: - rcu_node_kthread_setaffinity(rnp, cpu); - rcu_cpu_kthread_setrt(cpu, 0); + rcu_boost_kthread_setaffinity(rnp, cpu); break; case CPU_DYING: case CPU_DYING_FROZEN: diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 19b61ac1079..1989438c68f 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -192,12 +192,6 @@ struct rcu_node { /* Refused to boost: not sure why, though. */ /* This can happen due to race conditions. */ #endif /* #ifdef CONFIG_RCU_BOOST */ - struct task_struct *node_kthread_task; - /* kthread that takes care of this rcu_node */ - /* structure, for example, awakening the */ - /* per-CPU kthreads as needed. */ - unsigned int node_kthread_status; - /* State of node_kthread_task for tracing. */ } ____cacheline_internodealigned_in_smp; /* @@ -449,7 +443,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags); -static void rcu_stop_cpu_kthread(int cpu); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_print_detail_task_stall(struct rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); @@ -479,15 +472,9 @@ static void invoke_rcu_callbacks_kthread(void); static bool rcu_is_callbacks_kthread(void); #ifdef CONFIG_RCU_BOOST static void rcu_preempt_do_callbacks(void); -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, - cpumask_var_t cm); static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, - struct rcu_node *rnp, - int rnp_index); -static void invoke_rcu_node_kthread(struct rcu_node *rnp); -static void rcu_yield(void (*f)(unsigned long), unsigned long arg); + struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_BOOST */ -static void rcu_cpu_kthread_setrt(int cpu, int to_rt); static void __cpuinit rcu_prepare_kthreads(int cpu); static void rcu_prepare_for_idle_init(int cpu); static void rcu_cleanup_after_idle(int cpu); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3e4899459f3..a0871b3cb3d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -25,6 +25,7 @@ */ #include <linux/delay.h> +#include <linux/smpboot.h> #define RCU_KTHREAD_PRIO 1 @@ -1225,6 +1226,16 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp) #endif /* #else #ifdef CONFIG_RCU_TRACE */ +static void rcu_wake_cond(struct task_struct *t, int status) +{ + /* + * If the thread is yielding, only wake it when this + * is invoked from idle + */ + if (status != RCU_KTHREAD_YIELDING || is_idle_task(current)) + wake_up_process(t); +} + /* * Carry out RCU priority boosting on the task indicated by ->exp_tasks * or ->boost_tasks, advancing the pointer to the next task in the @@ -1297,17 +1308,6 @@ static int rcu_boost(struct rcu_node *rnp) } /* - * Timer handler to initiate waking up of boost kthreads that - * have yielded the CPU due to excessive numbers of tasks to - * boost. We wake up the per-rcu_node kthread, which in turn - * will wake up the booster kthread. - */ -static void rcu_boost_kthread_timer(unsigned long arg) -{ - invoke_rcu_node_kthread((struct rcu_node *)arg); -} - -/* * Priority-boosting kthread. One per leaf rcu_node and one for the * root rcu_node. */ @@ -1330,8 +1330,9 @@ static int rcu_boost_kthread(void *arg) else spincnt = 0; if (spincnt > 10) { + rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; trace_rcu_utilization("End boost kthread@rcu_yield"); - rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); + schedule_timeout_interruptible(2); trace_rcu_utilization("Start boost kthread@rcu_yield"); spincnt = 0; } @@ -1369,8 +1370,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) rnp->boost_tasks = rnp->gp_tasks; raw_spin_unlock_irqrestore(&rnp->lock, flags); t = rnp->boost_kthread_task; - if (t != NULL) - wake_up_process(t); + if (t) + rcu_wake_cond(t, rnp->boost_kthread_status); } else { rcu_initiate_boost_trace(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -1387,8 +1388,10 @@ static void invoke_rcu_callbacks_kthread(void) local_irq_save(flags); __this_cpu_write(rcu_cpu_has_work, 1); if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && - current != __this_cpu_read(rcu_cpu_kthread_task)) - wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); + current != __this_cpu_read(rcu_cpu_kthread_task)) { + rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task), + __this_cpu_read(rcu_cpu_kthread_status)); + } local_irq_restore(flags); } @@ -1401,21 +1404,6 @@ static bool rcu_is_callbacks_kthread(void) return __get_cpu_var(rcu_cpu_kthread_task) == current; } -/* - * Set the affinity of the boost kthread. The CPU-hotplug locks are - * held, so no one should be messing with the existence of the boost - * kthread. - */ -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, - cpumask_var_t cm) -{ - struct task_struct *t; - - t = rnp->boost_kthread_task; - if (t != NULL) - set_cpus_allowed_ptr(rnp->boost_kthread_task, cm); -} - #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) /* @@ -1432,15 +1420,19 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) * Returns zero if all is well, a negated errno otherwise. */ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, - struct rcu_node *rnp, - int rnp_index) + struct rcu_node *rnp) { + int rnp_index = rnp - &rsp->node[0]; unsigned long flags; struct sched_param sp; struct task_struct *t; if (&rcu_preempt_state != rsp) return 0; + + if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0) + return 0; + rsp->boost = 1; if (rnp->boost_kthread_task != NULL) return 0; @@ -1457,25 +1449,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, return 0; } -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Stop the RCU's per-CPU kthread when its CPU goes offline,. - */ -static void rcu_stop_cpu_kthread(int cpu) -{ - struct task_struct *t; - - /* Stop the CPU's kthread. */ - t = per_cpu(rcu_cpu_kthread_task, cpu); - if (t != NULL) { - per_cpu(rcu_cpu_kthread_task, cpu) = NULL; - kthread_stop(t); - } -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - static void rcu_kthread_do_work(void) { rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); @@ -1483,112 +1456,22 @@ static void rcu_kthread_do_work(void) rcu_preempt_do_callbacks(); } -/* - * Wake up the specified per-rcu_node-structure kthread. - * Because the per-rcu_node kthreads are immortal, we don't need - * to do anything to keep them alive. - */ -static void invoke_rcu_node_kthread(struct rcu_node *rnp) +static void rcu_cpu_kthread_setup(unsigned int cpu) { - struct task_struct *t; - - t = rnp->node_kthread_task; - if (t != NULL) - wake_up_process(t); -} - -/* - * Set the specified CPU's kthread to run RT or not, as specified by - * the to_rt argument. The CPU-hotplug locks are held, so the task - * is not going away. - */ -static void rcu_cpu_kthread_setrt(int cpu, int to_rt) -{ - int policy; struct sched_param sp; - struct task_struct *t; - - t = per_cpu(rcu_cpu_kthread_task, cpu); - if (t == NULL) - return; - if (to_rt) { - policy = SCHED_FIFO; - sp.sched_priority = RCU_KTHREAD_PRIO; - } else { - policy = SCHED_NORMAL; - sp.sched_priority = 0; - } - sched_setscheduler_nocheck(t, policy, &sp); -} - -/* - * Timer handler to initiate the waking up of per-CPU kthreads that - * have yielded the CPU due to excess numbers of RCU callbacks. - * We wake up the per-rcu_node kthread, which in turn will wake up - * the booster kthread. - */ -static void rcu_cpu_kthread_timer(unsigned long arg) -{ - struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg); - struct rcu_node *rnp = rdp->mynode; - atomic_or(rdp->grpmask, &rnp->wakemask); - invoke_rcu_node_kthread(rnp); + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); } -/* - * Drop to non-real-time priority and yield, but only after posting a - * timer that will cause us to regain our real-time priority if we - * remain preempted. Either way, we restore our real-time priority - * before returning. - */ -static void rcu_yield(void (*f)(unsigned long), unsigned long arg) +static void rcu_cpu_kthread_park(unsigned int cpu) { - struct sched_param sp; - struct timer_list yield_timer; - int prio = current->rt_priority; - - setup_timer_on_stack(&yield_timer, f, arg); - mod_timer(&yield_timer, jiffies + 2); - sp.sched_priority = 0; - sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); - set_user_nice(current, 19); - schedule(); - set_user_nice(current, 0); - sp.sched_priority = prio; - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); - del_timer(&yield_timer); + per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; } -/* - * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU. - * This can happen while the corresponding CPU is either coming online - * or going offline. We cannot wait until the CPU is fully online - * before starting the kthread, because the various notifier functions - * can wait for RCU grace periods. So we park rcu_cpu_kthread() until - * the corresponding CPU is online. - * - * Return 1 if the kthread needs to stop, 0 otherwise. - * - * Caller must disable bh. This function can momentarily enable it. - */ -static int rcu_cpu_kthread_should_stop(int cpu) +static int rcu_cpu_kthread_should_run(unsigned int cpu) { - while (cpu_is_offline(cpu) || - !cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu)) || - smp_processor_id() != cpu) { - if (kthread_should_stop()) - return 1; - per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; - per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id(); - local_bh_enable(); - schedule_timeout_uninterruptible(1); - if (!cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu))) - set_cpus_allowed_ptr(current, cpumask_of(cpu)); - local_bh_disable(); - } - per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; - return 0; + return __get_cpu_var(rcu_cpu_has_work); } /* @@ -1596,138 +1479,35 @@ static int rcu_cpu_kthread_should_stop(int cpu) * RCU softirq used in flavors and configurations of RCU that do not * support RCU priority boosting. */ -static int rcu_cpu_kthread(void *arg) +static void rcu_cpu_kthread(unsigned int cpu) { - int cpu = (int)(long)arg; - unsigned long flags; - int spincnt = 0; - unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu); - char work; - char *workp = &per_cpu(rcu_cpu_has_work, cpu); + unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status); + char work, *workp = &__get_cpu_var(rcu_cpu_has_work); + int spincnt; - trace_rcu_utilization("Start CPU kthread@init"); - for (;;) { - *statusp = RCU_KTHREAD_WAITING; - trace_rcu_utilization("End CPU kthread@rcu_wait"); - rcu_wait(*workp != 0 || kthread_should_stop()); + for (spincnt = 0; spincnt < 10; spincnt++) { trace_rcu_utilization("Start CPU kthread@rcu_wait"); local_bh_disable(); - if (rcu_cpu_kthread_should_stop(cpu)) { - local_bh_enable(); - break; - } *statusp = RCU_KTHREAD_RUNNING; - per_cpu(rcu_cpu_kthread_loops, cpu)++; - local_irq_save(flags); + this_cpu_inc(rcu_cpu_kthread_loops); + local_irq_disable(); work = *workp; *workp = 0; - local_irq_restore(flags); + local_irq_enable(); if (work) rcu_kthread_do_work(); local_bh_enable(); - if (*workp != 0) - spincnt++; - else - spincnt = 0; - if (spincnt > 10) { - *statusp = RCU_KTHREAD_YIELDING; - trace_rcu_utilization("End CPU kthread@rcu_yield"); - rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); - trace_rcu_utilization("Start CPU kthread@rcu_yield"); - spincnt = 0; - } - } - *statusp = RCU_KTHREAD_STOPPED; - trace_rcu_utilization("End CPU kthread@term"); - return 0; -} - -/* - * Spawn a per-CPU kthread, setting up affinity and priority. - * Because the CPU hotplug lock is held, no other CPU will be attempting - * to manipulate rcu_cpu_kthread_task. There might be another CPU - * attempting to access it during boot, but the locking in kthread_bind() - * will enforce sufficient ordering. - * - * Please note that we cannot simply refuse to wake up the per-CPU - * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state, - * which can result in softlockup complaints if the task ends up being - * idle for more than a couple of minutes. - * - * However, please note also that we cannot bind the per-CPU kthread to its - * CPU until that CPU is fully online. We also cannot wait until the - * CPU is fully online before we create its per-CPU kthread, as this would - * deadlock the system when CPU notifiers tried waiting for grace - * periods. So we bind the per-CPU kthread to its CPU only if the CPU - * is online. If its CPU is not yet fully online, then the code in - * rcu_cpu_kthread() will wait until it is fully online, and then do - * the binding. - */ -static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) -{ - struct sched_param sp; - struct task_struct *t; - - if (!rcu_scheduler_fully_active || - per_cpu(rcu_cpu_kthread_task, cpu) != NULL) - return 0; - t = kthread_create_on_node(rcu_cpu_kthread, - (void *)(long)cpu, - cpu_to_node(cpu), - "rcuc/%d", cpu); - if (IS_ERR(t)) - return PTR_ERR(t); - if (cpu_online(cpu)) - kthread_bind(t, cpu); - per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; - WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); - sp.sched_priority = RCU_KTHREAD_PRIO; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - per_cpu(rcu_cpu_kthread_task, cpu) = t; - wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */ - return 0; -} - -/* - * Per-rcu_node kthread, which is in charge of waking up the per-CPU - * kthreads when needed. We ignore requests to wake up kthreads - * for offline CPUs, which is OK because force_quiescent_state() - * takes care of this case. - */ -static int rcu_node_kthread(void *arg) -{ - int cpu; - unsigned long flags; - unsigned long mask; - struct rcu_node *rnp = (struct rcu_node *)arg; - struct sched_param sp; - struct task_struct *t; - - for (;;) { - rnp->node_kthread_status = RCU_KTHREAD_WAITING; - rcu_wait(atomic_read(&rnp->wakemask) != 0); - rnp->node_kthread_status = RCU_KTHREAD_RUNNING; - raw_spin_lock_irqsave(&rnp->lock, flags); - mask = atomic_xchg(&rnp->wakemask, 0); - rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) { - if ((mask & 0x1) == 0) - continue; - preempt_disable(); - t = per_cpu(rcu_cpu_kthread_task, cpu); - if (!cpu_online(cpu) || t == NULL) { - preempt_enable(); - continue; - } - per_cpu(rcu_cpu_has_work, cpu) = 1; - sp.sched_priority = RCU_KTHREAD_PRIO; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - preempt_enable(); + if (*workp == 0) { + trace_rcu_utilization("End CPU kthread@rcu_wait"); + *statusp = RCU_KTHREAD_WAITING; + return; } } - /* NOTREACHED */ - rnp->node_kthread_status = RCU_KTHREAD_STOPPED; - return 0; + *statusp = RCU_KTHREAD_YIELDING; + trace_rcu_utilization("Start CPU kthread@rcu_yield"); + schedule_timeout_interruptible(2); + trace_rcu_utilization("End CPU kthread@rcu_yield"); + *statusp = RCU_KTHREAD_WAITING; } /* @@ -1739,17 +1519,17 @@ static int rcu_node_kthread(void *arg) * no outgoing CPU. If there are no CPUs left in the affinity set, * this function allows the kthread to execute on any CPU. */ -static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { + struct task_struct *t = rnp->boost_kthread_task; + unsigned long mask = rnp->qsmaskinit; cpumask_var_t cm; int cpu; - unsigned long mask = rnp->qsmaskinit; - if (rnp->node_kthread_task == NULL) + if (!t) return; - if (!alloc_cpumask_var(&cm, GFP_KERNEL)) + if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) return; - cpumask_clear(cm); for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) if ((mask & 0x1) && cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); @@ -1759,62 +1539,36 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) cpumask_clear_cpu(cpu, cm); WARN_ON_ONCE(cpumask_weight(cm) == 0); } - set_cpus_allowed_ptr(rnp->node_kthread_task, cm); - rcu_boost_kthread_setaffinity(rnp, cm); + set_cpus_allowed_ptr(t, cm); free_cpumask_var(cm); } -/* - * Spawn a per-rcu_node kthread, setting priority and affinity. - * Called during boot before online/offline can happen, or, if - * during runtime, with the main CPU-hotplug locks held. So only - * one of these can be executing at a time. - */ -static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, - struct rcu_node *rnp) -{ - unsigned long flags; - int rnp_index = rnp - &rsp->node[0]; - struct sched_param sp; - struct task_struct *t; - - if (!rcu_scheduler_fully_active || - rnp->qsmaskinit == 0) - return 0; - if (rnp->node_kthread_task == NULL) { - t = kthread_create(rcu_node_kthread, (void *)rnp, - "rcun/%d", rnp_index); - if (IS_ERR(t)) - return PTR_ERR(t); - raw_spin_lock_irqsave(&rnp->lock, flags); - rnp->node_kthread_task = t; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - sp.sched_priority = 99; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ - } - return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index); -} +static struct smp_hotplug_thread rcu_cpu_thread_spec = { + .store = &rcu_cpu_kthread_task, + .thread_should_run = rcu_cpu_kthread_should_run, + .thread_fn = rcu_cpu_kthread, + .thread_comm = "rcuc/%u", + .setup = rcu_cpu_kthread_setup, + .park = rcu_cpu_kthread_park, +}; /* * Spawn all kthreads -- called as soon as the scheduler is running. */ static int __init rcu_spawn_kthreads(void) { - int cpu; struct rcu_node *rnp; + int cpu; rcu_scheduler_fully_active = 1; - for_each_possible_cpu(cpu) { + for_each_possible_cpu(cpu) per_cpu(rcu_cpu_has_work, cpu) = 0; - if (cpu_online(cpu)) - (void)rcu_spawn_one_cpu_kthread(cpu); - } + BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); rnp = rcu_get_root(rcu_state); - (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); if (NUM_RCU_NODES > 1) { rcu_for_each_leaf_node(rcu_state, rnp) - (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); } return 0; } @@ -1826,11 +1580,8 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) struct rcu_node *rnp = rdp->mynode; /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ - if (rcu_scheduler_fully_active) { - (void)rcu_spawn_one_cpu_kthread(cpu); - if (rnp->node_kthread_task == NULL) - (void)rcu_spawn_one_node_kthread(rcu_state, rnp); - } + if (rcu_scheduler_fully_active) + (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); } #else /* #ifdef CONFIG_RCU_BOOST */ @@ -1854,19 +1605,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) { } -#ifdef CONFIG_HOTPLUG_CPU - -static void rcu_stop_cpu_kthread(int cpu) -{ -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - -static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) -{ -} - -static void rcu_cpu_kthread_setrt(int cpu, int to_rt) +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { } diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index d4bc16ddd1d..6b4c76ba352 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -83,11 +83,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->nxttail[RCU_WAIT_TAIL]], ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); #ifdef CONFIG_RCU_BOOST - seq_printf(m, " kt=%d/%c/%d ktl=%x", + seq_printf(m, " kt=%d/%c ktl=%x", per_cpu(rcu_cpu_has_work, rdp->cpu), convert_kthread_status(per_cpu(rcu_cpu_kthread_status, rdp->cpu)), - per_cpu(rcu_cpu_kthread_cpu, rdp->cpu), per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); #endif /* #ifdef CONFIG_RCU_BOOST */ seq_printf(m, " b=%ld", rdp->blimit); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 468bdd44c1b..9df64ea993f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1109,6 +1109,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) trace_sched_migrate_task(p, new_cpu); if (task_cpu(p) != new_cpu) { + if (p->sched_class->migrate_task_rq) + p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); } @@ -1713,6 +1715,9 @@ static void __sched_fork(struct task_struct *p) p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) + p->se.avg.decay_count = 0; +#endif #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 6f79596e0ea..b9d54d0d7bb 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -61,14 +61,20 @@ static unsigned long nsec_low(unsigned long long nsec) static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { struct sched_entity *se = tg->se[cpu]; - if (!se) - return; #define P(F) \ SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) #define PN(F) \ SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) + if (!se) { + struct sched_avg *avg = &cpu_rq(cpu)->avg; + P(avg->runnable_avg_sum); + P(avg->runnable_avg_period); + return; + } + + PN(se->exec_start); PN(se->vruntime); PN(se->sum_exec_runtime); @@ -85,6 +91,13 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P(se->statistics.wait_count); #endif P(se->load.weight); +#ifdef CONFIG_SMP + P(se->avg.runnable_avg_sum); + P(se->avg.runnable_avg_period); + P(se->avg.usage_avg_sum); + P(se->avg.load_avg_contrib); + P(se->avg.decay_count); +#endif #undef PN #undef P } @@ -206,14 +219,20 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", - SPLIT_NS(cfs_rq->load_avg)); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", - SPLIT_NS(cfs_rq->load_period)); - SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", - cfs_rq->load_contribution); - SEQ_printf(m, " .%-30s: %d\n", "load_tg", - atomic_read(&cfs_rq->tg->load_weight)); + SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg", + cfs_rq->runnable_load_avg); + SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", + cfs_rq->blocked_load_avg); + SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", + atomic64_read(&cfs_rq->tg->load_avg)); + SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib", + cfs_rq->tg_load_contrib); + SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", + cfs_rq->tg_runnable_contrib); + SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", + atomic_read(&cfs_rq->tg->runnable_avg)); + SEQ_printf(m, " .%-30s: %d\n", "tg->usage_avg", + atomic_read(&cfs_rq->tg->usage_avg)); #endif print_cfs_group_stats(m, cpu, cfs_rq->tg); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c099cc6eebe..e05636144c6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -653,9 +653,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) return calc_delta_fair(sched_slice(cfs_rq, se), se); } -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); -static void update_cfs_shares(struct cfs_rq *cfs_rq); - /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -675,10 +672,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, curr->vruntime += delta_exec_weighted; update_min_vruntime(cfs_rq); - -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED - cfs_rq->load_unacc_exec_time += delta_exec; -#endif } static void update_curr(struct cfs_rq *cfs_rq) @@ -801,72 +794,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_FAIR_GROUP_SCHED -/* we need this in update_cfs_load and load-balance functions below */ -static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); # ifdef CONFIG_SMP -static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, - int global_update) -{ - struct task_group *tg = cfs_rq->tg; - long load_avg; - - load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); - load_avg -= cfs_rq->load_contribution; - - if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) { - atomic_add(load_avg, &tg->load_weight); - cfs_rq->load_contribution += load_avg; - } -} - -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) -{ - u64 period = sysctl_sched_shares_window; - u64 now, delta; - unsigned long load = cfs_rq->load.weight; - - if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq)) - return; - - now = rq_of(cfs_rq)->clock_task; - delta = now - cfs_rq->load_stamp; - - /* truncate load history at 4 idle periods */ - if (cfs_rq->load_stamp > cfs_rq->load_last && - now - cfs_rq->load_last > 4 * period) { - cfs_rq->load_period = 0; - cfs_rq->load_avg = 0; - delta = period - 1; - } - - cfs_rq->load_stamp = now; - cfs_rq->load_unacc_exec_time = 0; - cfs_rq->load_period += delta; - if (load) { - cfs_rq->load_last = now; - cfs_rq->load_avg += delta * load; - } - - /* consider updating load contribution on each fold or truncate */ - if (global_update || cfs_rq->load_period > period - || !cfs_rq->load_period) - update_cfs_rq_load_contribution(cfs_rq, global_update); - - while (cfs_rq->load_period > period) { - /* - * Inline assembly required to prevent the compiler - * optimising this loop into a divmod call. - * See __iter_div_u64_rem() for another example of this. - */ - asm("" : "+rm" (cfs_rq->load_period)); - cfs_rq->load_period /= 2; - cfs_rq->load_avg /= 2; - } - - if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg) - list_del_leaf_cfs_rq(cfs_rq); -} - static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) { long tg_weight; @@ -876,8 +804,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) * to gain a more accurate current total weight. See * update_cfs_rq_load_contribution(). */ - tg_weight = atomic_read(&tg->load_weight); - tg_weight -= cfs_rq->load_contribution; + tg_weight = atomic64_read(&tg->load_avg); + tg_weight -= cfs_rq->tg_load_contrib; tg_weight += cfs_rq->load.weight; return tg_weight; @@ -901,27 +829,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) return shares; } - -static void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ - if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { - update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq); - } -} # else /* CONFIG_SMP */ -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) -{ -} - static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { return tg->shares; } - -static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ -} # endif /* CONFIG_SMP */ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) @@ -939,6 +851,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, account_entity_enqueue(cfs_rq, se); } +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); + static void update_cfs_shares(struct cfs_rq *cfs_rq) { struct task_group *tg; @@ -958,18 +872,491 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) reweight_entity(cfs_rq_of(se), se, shares); } #else /* CONFIG_FAIR_GROUP_SCHED */ -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) +static inline void update_cfs_shares(struct cfs_rq *cfs_rq) { } +#endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */ +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) +/* + * We choose a half-life close to 1 scheduling period. + * Note: The tables below are dependent on this value. + */ +#define LOAD_AVG_PERIOD 32 +#define LOAD_AVG_MAX 46742 /* maximum possible load avg */ +#define LOAD_AVG_MAX_N 516 /* number of full periods it takes to produce max */ + +/* Precomputed fixed inverse multiplies for multiplication by y^n */ +static const u32 runnable_avg_yN_inv[] = { + 0xffffffff, 0xfa83b2db, 0xf5257d15, 0xefe4b99b, 0xeac0c6e7, 0xe5b906e7, + 0xe0ccdeec, 0xdbfbb797, 0xd744fcca, 0xd2a81d91, 0xce248c15, 0xc9b9bd86, + 0xc5672a11, 0xc12c4cca, 0xbd08a39f, 0xb8fbaf47, 0xb504f333, 0xb123f581, + 0xad583eea, 0xa9a15ab4, 0xa5fed6a9, 0xa2704303, 0x9ef53260, 0x9b8d39b9, + 0x9837f051, 0x94f4efa8, 0x91c3d373, 0x8ea4398b, 0x8b95c1e3, 0x88980e80, + 0x85aac367, 0x82cd8698, +}; + +/* Precomputed \Sum y^k { 1<=k<=n } */ +static const u32 runnable_avg_yN_sum[] = { + 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103, + 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082, + 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371, +}; + +/* + * Approximate: + * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) + */ +static __always_inline u64 decay_load(u64 val, u64 n) +{ + int local_n; + if (!n) + return val; + else if (unlikely(n > LOAD_AVG_PERIOD * 63)) + return 0; + + /* will be 32 bits if that's desirable */ + local_n = n; + + /* + * As y^PERIOD = 1/2, we can combine + * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) + * With a look-up table which covers k^n (n<PERIOD) + * + * To achieve constant time decay_load. + */ + if (unlikely(local_n >= LOAD_AVG_PERIOD)) { + val >>= local_n / LOAD_AVG_PERIOD; + n %= LOAD_AVG_PERIOD; + } + + val *= runnable_avg_yN_inv[local_n]; + return SRR(val, 32); +} + +/* + * For updates fully spanning n periods, the contribution to runnable + * average will be: \Sum 1024*y^n + * + * We can compute this reasonably efficiently by combining: + * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD} + */ +static u32 __compute_runnable_contrib(int n) { + u32 contrib = 0; + + if (likely(n <= LOAD_AVG_PERIOD)) + return runnable_avg_yN_sum[n]; + else if (unlikely(n >= LOAD_AVG_MAX_N)) + return LOAD_AVG_MAX; + + /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */ + do { + contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */ + contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD]; + + n -= LOAD_AVG_PERIOD; + } while (n > LOAD_AVG_PERIOD); + + contrib = decay_load(contrib, n); + return contrib + runnable_avg_yN_sum[n]; } -static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) +/* We can represent the historical contribution to runnable average as the + * coefficients of a geometric series. To do this we sub-divide our runnable + * history into segments of approximately 1ms (1024us); label the segment that + * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. + * + * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ... + * p0 p1 p1 + * (now) (~1ms ago) (~2ms ago) + * + * Let u_i denote the fraction of p_i that the entity was runnable. + * + * We then designate the fractions u_i as our co-efficients, yielding the + * following representation of historical load: + * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ... + * + * We choose y based on the with of a reasonably scheduling period, fixing: + * y^32 = 0.5 + * + * This means that the contribution to load ~32ms ago (u_32) will be weighted + * approximately half as much as the contribution to load within the last ms + * (u_0). + * + * When a period "rolls over" and we have new u_0`, multiplying the previous + * sum again by y is sufficient to update: + * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) + * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1] + */ +static __always_inline int __update_entity_runnable_avg(u64 now, + struct sched_avg *sa, + int runnable, + int running) { + u64 delta, periods; + u32 runnable_contrib; + int delta_w, decayed = 0; + + delta = now - sa->last_runnable_update; + /* + * This should only happen when time goes backwards, which it + * unfortunately does during sched clock init when we swap over to TSC. + */ + if ((s64)delta < 0) { + sa->last_runnable_update = now; + return 0; + } + + /* + * Use 1024ns as the unit of measurement since it's a reasonable + * approximation of 1us and fast to compute. + */ + delta >>= 10; + if (!delta) + return 0; + sa->last_runnable_update = now; + + /* delta_w is the amount already accumulated against our next period */ + delta_w = sa->runnable_avg_period % 1024; + if (delta + delta_w >= 1024) { + /* period roll-over */ + decayed = 1; + + /* + * Now that we know we're crossing a period boundary, figure + * out how much from delta we need to complete the current + * period and accrue it. + */ + delta_w = 1024 - delta_w; + if (runnable) + sa->runnable_avg_sum += delta_w; + if (running) + sa->usage_avg_sum += delta_w; + sa->runnable_avg_period += delta_w; + + delta -= delta_w; + + /* Figure out how many additional periods this update spans */ + periods = delta / 1024; + delta %= 1024; + + sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, + periods + 1); + sa->runnable_avg_period = decay_load(sa->runnable_avg_period, + periods + 1); + sa->usage_avg_sum = decay_load(sa->usage_avg_sum, periods + 1); + + /* Efficiently calculate \sum (1..n_period) 1024*y^i */ + runnable_contrib = __compute_runnable_contrib(periods); + if (runnable) + sa->runnable_avg_sum += runnable_contrib; + if (running) + sa->usage_avg_sum += runnable_contrib; + sa->runnable_avg_period += runnable_contrib; + } + + /* Remainder of delta accrued against u_0` */ + if (runnable) + sa->runnable_avg_sum += delta; + if (running) + sa->usage_avg_sum += delta; + sa->runnable_avg_period += delta; + + return decayed; } -#endif /* CONFIG_FAIR_GROUP_SCHED */ + +/* Synchronize an entity's decay with its parentin cfs_rq.*/ +static inline u64 __synchronize_entity_decay(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 decays = atomic64_read(&cfs_rq->decay_counter); + + decays -= se->avg.decay_count; + if (!decays) + return 0; + + se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); + se->avg.decay_count += decays; + + return decays; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, + int force_update) +{ + struct task_group *tg = cfs_rq->tg; + s64 tg_contrib; + + tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; + tg_contrib -= cfs_rq->tg_load_contrib; + + if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) { + atomic64_add(tg_contrib, &tg->load_avg); + cfs_rq->tg_load_contrib += tg_contrib; + } +} + +/* + * Aggregate cfs_rq runnable averages into an equivalent task_group + * representation for computing load contributions. + */ +static inline void __update_tg_runnable_avg(struct sched_avg *sa, + struct cfs_rq *cfs_rq) +{ + struct task_group *tg = cfs_rq->tg; + long contrib, usage_contrib; + + contrib = div_u64(sa->runnable_avg_sum << 12, + sa->runnable_avg_period + 1); + contrib -= cfs_rq->tg_runnable_contrib; + + usage_contrib = div_u64(sa->usage_avg_sum << 12, + sa->runnable_avg_period + 1); + usage_contrib -= cfs_rq->tg_usage_contrib; + + /* + * contrib/usage at this point represent deltas, only update if they + * are substantive. + */ + if ((abs(contrib) > cfs_rq->tg_runnable_contrib / 64) || + (abs(usage_contrib) > cfs_rq->tg_usage_contrib / 64)) { + atomic_add(contrib, &tg->runnable_avg); + cfs_rq->tg_runnable_contrib += contrib; + + atomic_add(usage_contrib, &tg->usage_avg); + cfs_rq->tg_usage_contrib += usage_contrib; + } +} + +static inline void __update_group_entity_contrib(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = group_cfs_rq(se); + struct task_group *tg = cfs_rq->tg; + int runnable_avg; + + u64 contrib; + + contrib = cfs_rq->tg_load_contrib * tg->shares; + se->avg.load_avg_contrib = div64_u64(contrib, + atomic64_read(&tg->load_avg) + 1); + + /* + * Unlike a task-entity, a group entity may be using >=1 cpu globally. + * However, in the case that it's using <1 cpu we need to form a + * correction term so that we contribute the same load as a task of + * equal weight. (Global runnable time is taken as a fraction over + * 2^12.) + */ + runnable_avg = atomic_read(&tg->runnable_avg); + if (runnable_avg < (1<<12)) { + se->avg.load_avg_contrib *= runnable_avg; + se->avg.load_avg_contrib /= (1<<12); + } +} +#else +static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, + int force_update) {} +static inline void __update_tg_runnable_avg(struct sched_avg *sa, + struct cfs_rq *cfs_rq) {} +static inline void __update_group_entity_contrib(struct sched_entity *se) {} +#endif + +static inline void __update_task_entity_contrib(struct sched_entity *se) +{ + u32 contrib; + + /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ + contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); + contrib /= (se->avg.runnable_avg_period + 1); + se->avg.load_avg_contrib = scale_load(contrib); + trace_sched_task_load_contrib(task_of(se), se->avg.load_avg_contrib); + contrib = se->avg.runnable_avg_sum * scale_load_down(1024); + contrib /= (se->avg.runnable_avg_period + 1); + se->avg.load_avg_ratio = scale_load(contrib); + trace_sched_task_runnable_ratio(task_of(se), se->avg.load_avg_ratio); + contrib = se->avg.usage_avg_sum * scale_load_down(1024); + contrib /= (se->avg.runnable_avg_period + 1); + trace_sched_task_usage_ratio(task_of(se), scale_load(contrib)); +} + +/* Compute the current contribution to load_avg by se, return any delta */ +static long __update_entity_load_avg_contrib(struct sched_entity *se) +{ + long old_contrib = se->avg.load_avg_contrib; + + if (entity_is_task(se)) { + __update_task_entity_contrib(se); + } else { + __update_tg_runnable_avg(&se->avg, group_cfs_rq(se)); + __update_group_entity_contrib(se); + } + + return se->avg.load_avg_contrib - old_contrib; +} + +static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, + long load_contrib) +{ + if (likely(load_contrib < cfs_rq->blocked_load_avg)) + cfs_rq->blocked_load_avg -= load_contrib; + else + cfs_rq->blocked_load_avg = 0; +} + +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); + +/* Update a sched_entity's runnable average */ +static inline void update_entity_load_avg(struct sched_entity *se, + int update_cfs_rq) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + long contrib_delta; + u64 now; + + /* + * For a group entity we need to use their owned cfs_rq_clock_task() in + * case they are the parent of a throttled hierarchy. + */ + if (entity_is_task(se)) + now = cfs_rq_clock_task(cfs_rq); + else + now = cfs_rq_clock_task(group_cfs_rq(se)); + + if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq, + cfs_rq->curr == se)) + return; + + contrib_delta = __update_entity_load_avg_contrib(se); + + if (!update_cfs_rq) + return; + + if (se->on_rq) + cfs_rq->runnable_load_avg += contrib_delta; + else + subtract_blocked_load_contrib(cfs_rq, -contrib_delta); +} + +/* + * Decay the load contributed by all blocked children and account this so that + * they their contribution may appropriately discounted when they wake up. + */ +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) +{ + u64 now = cfs_rq_clock_task(cfs_rq) >> 20; + u64 decays; + + decays = now - cfs_rq->last_decay; + if (!decays && !force_update) + return; + + if (atomic64_read(&cfs_rq->removed_load)) { + u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0); + subtract_blocked_load_contrib(cfs_rq, removed_load); + } + + if (decays) { + cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, + decays); + atomic64_add(decays, &cfs_rq->decay_counter); + cfs_rq->last_decay = now; + } + + __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); + update_cfs_shares(cfs_rq); +} + +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) +{ + u32 contrib; + __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable, + runnable); + __update_tg_runnable_avg(&rq->avg, &rq->cfs); + contrib = rq->avg.runnable_avg_sum * scale_load_down(1024); + contrib /= (rq->avg.runnable_avg_period + 1); + trace_sched_rq_runnable_ratio(cpu_of(rq), scale_load(contrib)); + trace_sched_rq_runnable_load(cpu_of(rq), rq->cfs.runnable_load_avg); +} + +/* Add the load generated by se into cfs_rq's child load-average */ +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se, + int wakeup) +{ + /* + * We track migrations using entity decay_count <= 0, on a wake-up + * migration we use a negative decay count to track the remote decays + * accumulated while sleeping. + */ + if (unlikely(se->avg.decay_count <= 0)) { + se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; + if (se->avg.decay_count) { + /* + * In a wake-up migration we have to approximate the + * time sleeping. This is because we can't synchronize + * clock_task between the two cpus, and it is not + * guaranteed to be read-safe. Instead, we can + * approximate this using our carried decays, which are + * explicitly atomically readable. + */ + se->avg.last_runnable_update -= (-se->avg.decay_count) + << 20; + update_entity_load_avg(se, 0); + } + se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); + wakeup = 0; + } else { + __synchronize_entity_decay(se); + } + + /* migrated tasks did not contribute to our blocked load */ + if (wakeup) { + subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); + update_entity_load_avg(se, 0); + } + + cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; + /* we force update consideration on load-balancer moves */ + update_cfs_rq_blocked_load(cfs_rq, !wakeup); +} + +/* + * Remove se's load from this cfs_rq child load-average, if the entity is + * transitioning to a blocked state we track its projected decay using + * blocked_load_avg. + */ +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se, + int sleep) +{ + update_entity_load_avg(se, 1); + /* we force update consideration on load-balancer moves */ + update_cfs_rq_blocked_load(cfs_rq, !sleep); + + cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; + if (sleep) { + cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; + se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); + } else { + se->avg.decay_count = 0; + } +} +#else +static inline void update_entity_load_avg(struct sched_entity *se, + int update_cfs_rq) {} +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se, + int wakeup) {} +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se, + int sleep) {} +static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, + int force_update) {} +#endif static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -1096,9 +1483,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - update_cfs_load(cfs_rq, 0); account_entity_enqueue(cfs_rq, se); - update_cfs_shares(cfs_rq); + enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); @@ -1190,9 +1576,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); - se->on_rq = 0; - update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); + dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); /* * Normalize the entity after updating the min_vruntime because the @@ -1206,7 +1591,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) return_cfs_rq_runtime(cfs_rq); update_min_vruntime(cfs_rq); - update_cfs_shares(cfs_rq); + se->on_rq = 0; } /* @@ -1261,6 +1646,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); + update_entity_load_avg(se, 1); } update_stats_curr_start(cfs_rq, se); @@ -1340,6 +1726,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); + /* in !on_rq case, update occurred at dequeue */ + update_entity_load_avg(prev, 1); } cfs_rq->curr = NULL; } @@ -1353,9 +1741,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) update_curr(cfs_rq); /* - * Update share accounting for long-running entities. + * Ensure that runnable average is periodically updated. */ - update_entity_shares_tick(cfs_rq); + update_entity_load_avg(curr, 1); + update_cfs_rq_blocked_load(cfs_rq, 1); #ifdef CONFIG_SCHED_HRTICK /* @@ -1448,6 +1837,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) return &tg->cfs_bandwidth; } +/* rq->task_clock normalized against any time this cfs_rq has spent throttled */ +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) +{ + if (unlikely(cfs_rq->throttle_count)) + return cfs_rq->throttled_clock_task; + + return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time; +} + /* returns 0 on failure to allocate runtime */ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { @@ -1592,14 +1990,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttle_count--; #ifdef CONFIG_SMP if (!cfs_rq->throttle_count) { - u64 delta = rq->clock_task - cfs_rq->load_stamp; - - /* leaving throttled state, advance shares averaging windows */ - cfs_rq->load_stamp += delta; - cfs_rq->load_last += delta; - - /* update entity weight now that we are on_rq again */ - update_cfs_shares(cfs_rq); + /* adjust cfs_rq_clock_task() */ + cfs_rq->throttled_clock_task_time += rq->clock_task - + cfs_rq->throttled_clock_task; } #endif @@ -1611,9 +2004,9 @@ static int tg_throttle_down(struct task_group *tg, void *data) struct rq *rq = data; struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - /* group is entering throttled state, record last load */ + /* group is entering throttled state, stop time */ if (!cfs_rq->throttle_count) - update_cfs_load(cfs_rq, 0); + cfs_rq->throttled_clock_task = rq->clock_task; cfs_rq->throttle_count++; return 0; @@ -1628,7 +2021,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; - /* account load preceding throttle */ + /* freeze hierarchy runnable averages while throttled */ rcu_read_lock(); walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); @@ -1652,7 +2045,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) rq->nr_running -= task_delta; cfs_rq->throttled = 1; - cfs_rq->throttled_timestamp = rq->clock; + cfs_rq->throttled_clock = rq->clock; raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); raw_spin_unlock(&cfs_b->lock); @@ -1670,10 +2063,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 0; raw_spin_lock(&cfs_b->lock); - cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; + cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock; list_del_rcu(&cfs_rq->throttled_list); raw_spin_unlock(&cfs_b->lock); - cfs_rq->throttled_timestamp = 0; update_rq_clock(rq); /* update hierarchical throttle state */ @@ -2073,8 +2465,13 @@ void unthrottle_offline_cfs_rqs(struct rq *rq) } #else /* CONFIG_CFS_BANDWIDTH */ -static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) +{ + return rq_of(cfs_rq)->clock_task; +} + +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, + unsigned long delta_exec) {} static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -2207,12 +2604,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq); + update_entity_load_avg(se, 1); + update_cfs_rq_blocked_load(cfs_rq, 0); } - if (!se) + if (!se) { + update_rq_runnable_avg(rq, rq->nr_running); inc_nr_running(rq); + } hrtick_update(rq); } @@ -2266,12 +2665,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq); + update_entity_load_avg(se, 1); + update_cfs_rq_blocked_load(cfs_rq, 0); } - if (!se) + if (!se) { dec_nr_running(rq); + update_rq_runnable_avg(rq, 1); + } hrtick_update(rq); } @@ -2681,6 +3082,67 @@ done: return target; } +#ifdef CONFIG_SCHED_HMP +/* Heterogenous multiprocessor (HMP) optimizations + * We need to know which cpus that are fast and slow. */ +static struct cpumask hmp_fast_cpu_mask; +static struct cpumask hmp_slow_cpu_mask; + +extern void __init arch_get_fast_and_slow_cpus(struct cpumask *fast, + struct cpumask *slow); + +/* Setup fast and slow cpumasks. */ +static int __init hmp_cpu_mask_setup(void) +{ + char buf[64]; + + arch_get_fast_and_slow_cpus(&hmp_fast_cpu_mask, &hmp_slow_cpu_mask); + + printk(KERN_DEBUG "Initializing HMP scheduler:\n"); + cpulist_scnprintf(buf, 64, &hmp_fast_cpu_mask); + printk(KERN_DEBUG " fast cpus: %s\n", buf); + cpulist_scnprintf(buf, 64, &hmp_slow_cpu_mask); + printk(KERN_DEBUG " slow cpus: %s\n", buf); + + return 1; +} +early_initcall(hmp_cpu_mask_setup); + +/* Migration thresholds should be in the range [0..1023] + * hmp_up_threshold: min. load required for migrating tasks to a fast cpu + * hmp_down_threshold: max. load allowed for tasks migrating to a slow cpu + * hmp_up_prio: min. task prio for tasks migrating to faster cpus */ +unsigned int hmp_up_threshold = 512; +unsigned int hmp_down_threshold = 256; +unsigned int hmp_up_prio = 125; +static unsigned int hmp_up_migration(int cpu, struct sched_entity *se); +static unsigned int hmp_down_migration(int cpu, struct sched_entity *se); + +static unsigned int hmp_cpu_is_fast(int cpu) +{ + return cpumask_test_cpu(cpu, &hmp_fast_cpu_mask); +} + +static unsigned int hmp_cpu_is_slow(int cpu) +{ + return cpumask_test_cpu(cpu, &hmp_slow_cpu_mask); +} + +/* Select target cpu for HMP migration to fast cpu + * returns target >= nr_cpu_ids if no fast cpus in affinity mask */ +static inline unsigned int hmp_select_fast_cpu(struct task_struct *tsk) +{ + return cpumask_any_and(&hmp_fast_cpu_mask, tsk_cpus_allowed(tsk)); +} + +/* Select target cpu for HMP migration to slow cpu + * returns target >= nr_cpu_ids if no slow cpus in affinity mask */ +static inline unsigned int hmp_select_slow_cpu(struct task_struct *tsk) +{ + return cpumask_any_and(&hmp_slow_cpu_mask, tsk_cpus_allowed(tsk)); +} +#endif /* CONFIG_SCHED_HMP */ + /* * sched_balance_self: balance the current task (running on cpu) in domains * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and @@ -2807,8 +3269,52 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) unlock: rcu_read_unlock(); +#ifdef CONFIG_SCHED_HMP + if (hmp_up_migration(new_cpu, &p->se)) { + cpu = hmp_select_fast_cpu(p); + if (cpu < nr_cpu_ids) + return cpu; + } + if (hmp_down_migration(new_cpu, &p->se)) { + cpu = hmp_select_slow_cpu(p); + if (cpu < nr_cpu_ids) + return cpu; + } +#endif + return new_cpu; } + +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */ +#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * Called immediately before a task is migrated to a new cpu; task_cpu(p) and + * cfs_rq_of(p) references at time of call are still valid and identify the + * previous cpu. However, the caller only guarantees p->pi_lock is held; no + * other assumptions, including rq->lock state, should be made. + * Caller guarantees p->pi_lock held, but nothing else. + */ +static void +migrate_task_rq_fair(struct task_struct *p, int next_cpu) { + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* + * Load tracking: accumulate removed load so that it can be processed + * when we next update owning cfs_rq under rq->lock. Tasks contribute + * to blocked load iff they have a non-zero decay-count. + */ + if (se->avg.decay_count) { + se->avg.decay_count = -__synchronize_entity_decay(se); + atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); + } +} +#endif + #endif /* CONFIG_SMP */ static unsigned long @@ -3300,51 +3806,65 @@ next: /* * update tg->load_weight by folding this cpu's load_avg */ -static int update_shares_cpu(struct task_group *tg, int cpu) +static void __update_blocked_averages_cpu(struct task_group *tg, int cpu) { - struct cfs_rq *cfs_rq; - unsigned long flags; - struct rq *rq; - - if (!tg->se[cpu]) - return 0; - - rq = cpu_rq(cpu); - cfs_rq = tg->cfs_rq[cpu]; - - raw_spin_lock_irqsave(&rq->lock, flags); - - update_rq_clock(rq); - update_cfs_load(cfs_rq, 1); + struct sched_entity *se = tg->se[cpu]; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; - /* - * We need to update shares after updating tg->load_weight in - * order to adjust the weight of groups with long running tasks. - */ - update_cfs_shares(cfs_rq); + /* throttled entities do not contribute to load */ + if (throttled_hierarchy(cfs_rq)) + return; - raw_spin_unlock_irqrestore(&rq->lock, flags); + update_cfs_rq_blocked_load(cfs_rq, 1); + if (se) + update_entity_load_avg(se, 1); + else + update_rq_runnable_avg(rq_of(cfs_rq), 1); - return 0; + if (se) { + /* + * We can pivot on the runnable average decaying to zero for + * list removal since the parent average will always be >= + * child. + */ + if (se->avg.runnable_avg_sum) + update_cfs_shares(cfs_rq); + else + list_del_leaf_cfs_rq(cfs_rq); + } } -static void update_shares(int cpu) +static void update_blocked_averages(int cpu) { - struct cfs_rq *cfs_rq; struct rq *rq = cpu_rq(cpu); + struct cfs_rq *cfs_rq; + + unsigned long flags; + int num_updates = 0; rcu_read_lock(); + raw_spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); /* * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ for_each_leaf_cfs_rq(rq, cfs_rq) { - /* throttled entities do not contribute to load */ - if (throttled_hierarchy(cfs_rq)) - continue; + __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu); - update_shares_cpu(cfs_rq->tg, cpu); + /* + * Periodically release the lock so that a cfs_rq with many + * children cannot hold it for an arbitrary period of time. + */ + if (num_updates++ % 20 == 0) { + raw_spin_unlock_irqrestore(&rq->lock, flags); + cpu_relax(); + raw_spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); + } } + + raw_spin_unlock_irqrestore(&rq->lock, flags); rcu_read_unlock(); } @@ -3389,7 +3909,7 @@ static unsigned long task_h_load(struct task_struct *p) return load; } #else -static inline void update_shares(int cpu) +static inline void update_blocked_averages(int cpu) { } @@ -4409,12 +4929,14 @@ void idle_balance(int this_cpu, struct rq *this_rq) if (this_rq->avg_idle < sysctl_sched_migration_cost) return; + update_rq_runnable_avg(this_rq, 1); + /* * Drop the rq->lock, but keep IRQ/preempt disabled. */ raw_spin_unlock(&this_rq->lock); - update_shares(this_cpu); + update_blocked_averages(this_cpu); rcu_read_lock(); for_each_domain(this_cpu, sd) { unsigned long interval; @@ -4674,7 +5196,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) int update_next_balance = 0; int need_serialize; - update_shares(cpu); + update_blocked_averages(cpu); rcu_read_lock(); for_each_domain(cpu, sd) { @@ -4842,6 +5364,223 @@ need_kick: static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } #endif +#ifdef CONFIG_SCHED_HMP +/* Check if task should migrate to a faster core */ +static unsigned int hmp_up_migration(int cpu, struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + if (p->prio < hmp_up_prio && p->prio > 100 + && hmp_cpu_is_slow(cpu) + && se->avg.load_avg_ratio > hmp_up_threshold) { + return 1; + } + return 0; +} + +/* Check if task should migrate to a slower core */ +static unsigned int hmp_down_migration(int cpu, struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + if (p->prio >= hmp_up_prio || (hmp_cpu_is_fast(cpu) + && se->avg.load_avg_ratio < hmp_down_threshold)) { + return 1; + } + return 0; +} + +/* + * hmp_can_migrate_task - may task p from runqueue rq be migrated to this_cpu? + * Ideally this function should be merged with can_migrate_task() to avoid + * redundant code. + */ +static int hmp_can_migrate_task(struct task_struct *p, struct lb_env *env) +{ + int tsk_cache_hot = 0; + /* + * We do not migrate tasks that are: + * 1) running (obviously), or + * 2) cannot be migrated to this CPU due to cpus_allowed + */ + if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { + schedstat_inc(p, se.statistics.nr_failed_migrations_affine); + return 0; + } + env->flags &= ~LBF_ALL_PINNED; + + if (task_running(env->src_rq, p)) { + schedstat_inc(p, se.statistics.nr_failed_migrations_running); + return 0; + } + + /* + * Aggressive migration if: + * 1) task is cache cold, or + * 2) too many balance attempts have failed. + */ + + tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); + if (!tsk_cache_hot || + env->sd->nr_balance_failed > env->sd->cache_nice_tries) { +#ifdef CONFIG_SCHEDSTATS + if (tsk_cache_hot) { + schedstat_inc(env->sd, lb_hot_gained[env->idle]); + schedstat_inc(p, se.statistics.nr_forced_migrations); + } +#endif + return 1; + } + + return 1; +} + +/* + * move_specific_task tries to move a specific task. + * Returns 1 if successful and 0 otherwise. + * Called with both runqueues locked. + */ +static int move_specific_task(struct lb_env *env, struct task_struct *pm) +{ + struct task_struct *p, *n; + + list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { + if (throttled_lb_pair(task_group(p), env->src_rq->cpu, + env->dst_cpu)) + continue; + + if (!hmp_can_migrate_task(p, env)) + continue; + /* Check if we found the right task */ + if (p != pm) + continue; + + move_task(p, env); + /* + * Right now, this is only the third place move_task() + * is called, so we can safely collect move_task() + * stats here rather than inside move_task(). + */ + schedstat_inc(env->sd, lb_gained[env->idle]); + return 1; + } + return 0; +} + +/* + * hmp_active_task_migration_cpu_stop is run by cpu stopper and used to + * migrate a specific task from one runqueue to another. + * hmp_force_up_migration uses this to push a currently running task + * off a runqueue. + * Based on active_load_balance_stop_cpu and can potentially be merged. + */ +static int hmp_active_task_migration_cpu_stop(void *data) +{ + struct rq *busiest_rq = data; + struct task_struct *p = busiest_rq->migrate_task; + int busiest_cpu = cpu_of(busiest_rq); + int target_cpu = busiest_rq->push_cpu; + struct rq *target_rq = cpu_rq(target_cpu); + struct sched_domain *sd; + + raw_spin_lock_irq(&busiest_rq->lock); + /* make sure the requested cpu hasn't gone down in the meantime */ + if (unlikely(busiest_cpu != smp_processor_id() || + !busiest_rq->active_balance)) { + goto out_unlock; + } + /* Is there any task to move? */ + if (busiest_rq->nr_running <= 1) + goto out_unlock; + /* Task has migrated meanwhile, abort forced migration */ + if (task_rq(p) != busiest_rq) + goto out_unlock; + /* + * This condition is "impossible", if it occurs + * we need to fix it. Originally reported by + * Bjorn Helgaas on a 128-cpu setup. + */ + BUG_ON(busiest_rq == target_rq); + + /* move a task from busiest_rq to target_rq */ + double_lock_balance(busiest_rq, target_rq); + + /* Search for an sd spanning us and the target CPU. */ + rcu_read_lock(); + for_each_domain(target_cpu, sd) { + if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) + break; + } + + if (likely(sd)) { + struct lb_env env = { + .sd = sd, + .dst_cpu = target_cpu, + .dst_rq = target_rq, + .src_cpu = busiest_rq->cpu, + .src_rq = busiest_rq, + .idle = CPU_IDLE, + }; + + schedstat_inc(sd, alb_count); + + if (move_specific_task(&env, p)) + schedstat_inc(sd, alb_pushed); + else + schedstat_inc(sd, alb_failed); + } + rcu_read_unlock(); + double_unlock_balance(busiest_rq, target_rq); +out_unlock: + busiest_rq->active_balance = 0; + raw_spin_unlock_irq(&busiest_rq->lock); + return 0; +} + +static DEFINE_SPINLOCK(hmp_force_migration); + +/* hmp_force_up_migration checks runqueues for tasks that need to + * be actively migrated to a faster cpu. */ +static void hmp_force_up_migration(int this_cpu) +{ + int i; + struct sched_entity *curr; + struct rq *target; + unsigned long flags; + unsigned int force; + struct task_struct *p; + + if (!spin_trylock(&hmp_force_migration)) + return; + for_each_cpu(i, &hmp_slow_cpu_mask) { + force = 0; + target = cpu_rq(i); + raw_spin_lock_irqsave(&target->lock, flags); + curr = target->cfs.curr; + if (!curr || !entity_is_task(curr)) { + raw_spin_unlock_irqrestore(&target->lock, flags); + continue; + } + p = task_of(curr); + if (hmp_up_migration(i, curr)) { + if (!target->active_balance) { + target->active_balance = 1; + target->push_cpu = hmp_select_fast_cpu(p); + target->migrate_task = p; + force = 1; + trace_sched_hmp_migrate(p, 1); + } + } + raw_spin_unlock_irqrestore(&target->lock, flags); + if (force) + stop_one_cpu_nowait(cpu_of(target), + hmp_active_task_migration_cpu_stop, + target, &target->active_balance_work); + } + spin_unlock(&hmp_force_migration); +} +#else +static void hmp_force_up_migration(int this_cpu) { } +#endif /* CONFIG_SCHED_HMP */ + /* * run_rebalance_domains is triggered when needed from the scheduler tick. * Also triggered for nohz idle balancing (with nohz_balancing_kick set). @@ -4853,6 +5592,8 @@ static void run_rebalance_domains(struct softirq_action *h) enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE; + hmp_force_up_migration(this_cpu); + rebalance_domains(this_cpu, idle); /* @@ -4907,6 +5648,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) cfs_rq = cfs_rq_of(se); entity_tick(cfs_rq, se, queued); } + + update_rq_runnable_avg(rq, 1); } /* @@ -4999,6 +5742,21 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) place_entity(cfs_rq, se, 0); se->vruntime -= cfs_rq->min_vruntime; } + +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) + /* + * Remove our load from contribution when we leave sched_fair + * and ensure we don't carry in an old decay_count if we + * switch back. + */ + if (p->se.avg.decay_count) { + struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); + __synchronize_entity_decay(&p->se); + subtract_blocked_load_contrib(cfs_rq, + p->se.avg.load_avg_contrib); + p->se.avg.decay_count = 0; + } +#endif } /* @@ -5045,11 +5803,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) + atomic64_set(&cfs_rq->decay_counter, 1); + atomic64_set(&cfs_rq->removed_load, 0); +#endif } #ifdef CONFIG_FAIR_GROUP_SCHED static void task_move_group_fair(struct task_struct *p, int on_rq) { + struct cfs_rq *cfs_rq; /* * If the task was not on the rq at the time of this cgroup movement * it must have been asleep, sleeping tasks keep their ->vruntime @@ -5081,8 +5844,19 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) if (!on_rq) p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; set_task_rq(p, task_cpu(p)); - if (!on_rq) - p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; + if (!on_rq) { + cfs_rq = cfs_rq_of(&p->se); + p->se.vruntime += cfs_rq->min_vruntime; +#ifdef CONFIG_SMP + /* + * set_task_rq will() have removed our previous contribution, + * but we must synchronize explicitly against further decay + * here. + */ + p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); + cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; +#endif + } } void free_fair_sched_group(struct task_group *tg) @@ -5167,10 +5941,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, cfs_rq->tg = tg; cfs_rq->rq = rq; -#ifdef CONFIG_SMP - /* allow initial update_cfs_load() to truncate */ - cfs_rq->load_stamp = 1; -#endif init_cfs_rq_runtime(cfs_rq); tg->cfs_rq[cpu] = cfs_rq; @@ -5217,8 +5987,11 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) se = tg->se[i]; /* Propagate contribution to hierarchy */ raw_spin_lock_irqsave(&rq->lock, flags); - for_each_sched_entity(se) + for_each_sched_entity(se) { update_cfs_shares(group_cfs_rq(se)); + /* update contribution to parent */ + update_entity_load_avg(se, 1); + } raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -5272,7 +6045,9 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_fair, - +#ifdef CONFIG_FAIR_GROUP_SCHED + .migrate_task_rq = migrate_task_rq_fair, +#endif .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 55844f24435..0dc9bd108f5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -112,6 +112,8 @@ struct task_group { unsigned long shares; atomic_t load_weight; + atomic64_t load_avg; + atomic_t runnable_avg, usage_avg; #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -222,22 +224,29 @@ struct cfs_rq { unsigned int nr_spread_over; #endif +#ifdef CONFIG_SMP +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */ #ifdef CONFIG_FAIR_GROUP_SCHED - struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ - /* - * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in - * a hierarchy). Non-leaf lrqs hold other higher schedulable entities - * (like users, containers etc.) - * - * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This - * list is used during load balance. + * CFS Load tracking + * Under CFS, load is tracked on a per-entity basis and aggregated up. + * This allows for the description of both thread and group usage (in + * the FAIR_GROUP_SCHED case). */ - int on_list; - struct list_head leaf_cfs_rq_list; - struct task_group *tg; /* group that "owns" this runqueue */ + u64 runnable_load_avg, blocked_load_avg; + atomic64_t decay_counter, removed_load; + u64 last_decay; +#endif /* CONFIG_FAIR_GROUP_SCHED */ +/* These always depend on CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_FAIR_GROUP_SCHED + u32 tg_runnable_contrib, tg_usage_contrib; + u64 tg_load_contrib; +#endif /* CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_SMP /* * h_load = weight * f(tg) * @@ -245,26 +254,30 @@ struct cfs_rq { * this group. */ unsigned long h_load; +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_FAIR_GROUP_SCHED + struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ /* - * Maintaining per-cpu shares distribution for group scheduling + * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in + * a hierarchy). Non-leaf lrqs hold other higher schedulable entities + * (like users, containers etc.) * - * load_stamp is the last time we updated the load average - * load_last is the last time we updated the load average and saw load - * load_unacc_exec_time is currently unaccounted execution time + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This + * list is used during load balance. */ - u64 load_avg; - u64 load_period; - u64 load_stamp, load_last, load_unacc_exec_time; + int on_list; + struct list_head leaf_cfs_rq_list; + struct task_group *tg; /* group that "owns" this runqueue */ - unsigned long load_contribution; -#endif /* CONFIG_SMP */ #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; u64 runtime_expires; s64 runtime_remaining; - u64 throttled_timestamp; + u64 throttled_clock, throttled_clock_task; + u64 throttled_clock_task_time; int throttled, throttle_count; struct list_head throttled_list; #endif /* CONFIG_CFS_BANDWIDTH */ @@ -408,6 +421,7 @@ struct rq { int active_balance; int push_cpu; struct cpu_stop_work active_balance_work; + struct task_struct *migrate_task; /* cpu of this runqueue: */ int cpu; int online; @@ -463,6 +477,8 @@ struct rq { #ifdef CONFIG_SMP struct llist_head wake_list; #endif + + struct sched_avg avg; }; static inline int cpu_of(struct rq *rq) diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 98f60c5caa1..0a49ee70054 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -1,14 +1,22 @@ /* * Common SMP CPU bringup/teardown functions */ +#include <linux/cpu.h> #include <linux/err.h> #include <linux/smp.h> #include <linux/init.h> +#include <linux/list.h> +#include <linux/slab.h> #include <linux/sched.h> +#include <linux/export.h> #include <linux/percpu.h> +#include <linux/kthread.h> +#include <linux/smpboot.h> #include "smpboot.h" +#ifdef CONFIG_SMP + #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD /* * For the hotplug case we keep the task structs around and reuse @@ -65,3 +73,228 @@ void __init idle_threads_init(void) } } #endif + +#endif /* #ifdef CONFIG_SMP */ + +static LIST_HEAD(hotplug_threads); +static DEFINE_MUTEX(smpboot_threads_lock); + +struct smpboot_thread_data { + unsigned int cpu; + unsigned int status; + struct smp_hotplug_thread *ht; +}; + +enum { + HP_THREAD_NONE = 0, + HP_THREAD_ACTIVE, + HP_THREAD_PARKED, +}; + +/** + * smpboot_thread_fn - percpu hotplug thread loop function + * @void: thread data pointer + * + * Checks for thread stop and park conditions. Calls the necessary + * setup, cleanup, park and unpark functions for the registered + * thread. + * + * Returns 1 when the thread should exit, 0 otherwise. + */ +static int smpboot_thread_fn(void *data) +{ + struct smpboot_thread_data *td = data; + struct smp_hotplug_thread *ht = td->ht; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + preempt_disable(); + if (kthread_should_stop()) { + set_current_state(TASK_RUNNING); + preempt_enable(); + if (ht->cleanup) + ht->cleanup(td->cpu, cpu_online(td->cpu)); + kfree(td); + return 0; + } + + if (kthread_should_park()) { + __set_current_state(TASK_RUNNING); + preempt_enable(); + if (ht->park && td->status == HP_THREAD_ACTIVE) { + BUG_ON(td->cpu != smp_processor_id()); + ht->park(td->cpu); + td->status = HP_THREAD_PARKED; + } + kthread_parkme(); + /* We might have been woken for stop */ + continue; + } + + BUG_ON(td->cpu != smp_processor_id()); + + /* Check for state change setup */ + switch (td->status) { + case HP_THREAD_NONE: + preempt_enable(); + if (ht->setup) + ht->setup(td->cpu); + td->status = HP_THREAD_ACTIVE; + preempt_disable(); + break; + case HP_THREAD_PARKED: + preempt_enable(); + if (ht->unpark) + ht->unpark(td->cpu); + td->status = HP_THREAD_ACTIVE; + preempt_disable(); + break; + } + + if (!ht->thread_should_run(td->cpu)) { + preempt_enable(); + schedule(); + } else { + set_current_state(TASK_RUNNING); + preempt_enable(); + ht->thread_fn(td->cpu); + } + } +} + +static int +__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) +{ + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); + struct smpboot_thread_data *td; + + if (tsk) + return 0; + + td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu)); + if (!td) + return -ENOMEM; + td->cpu = cpu; + td->ht = ht; + + tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu, + ht->thread_comm); + if (IS_ERR(tsk)) { + kfree(td); + return PTR_ERR(tsk); + } + + get_task_struct(tsk); + *per_cpu_ptr(ht->store, cpu) = tsk; + return 0; +} + +int smpboot_create_threads(unsigned int cpu) +{ + struct smp_hotplug_thread *cur; + int ret = 0; + + mutex_lock(&smpboot_threads_lock); + list_for_each_entry(cur, &hotplug_threads, list) { + ret = __smpboot_create_thread(cur, cpu); + if (ret) + break; + } + mutex_unlock(&smpboot_threads_lock); + return ret; +} + +static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cpu) +{ + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); + + kthread_unpark(tsk); +} + +void smpboot_unpark_threads(unsigned int cpu) +{ + struct smp_hotplug_thread *cur; + + mutex_lock(&smpboot_threads_lock); + list_for_each_entry(cur, &hotplug_threads, list) + smpboot_unpark_thread(cur, cpu); + mutex_unlock(&smpboot_threads_lock); +} + +static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu) +{ + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); + + if (tsk) + kthread_park(tsk); +} + +void smpboot_park_threads(unsigned int cpu) +{ + struct smp_hotplug_thread *cur; + + mutex_lock(&smpboot_threads_lock); + list_for_each_entry_reverse(cur, &hotplug_threads, list) + smpboot_park_thread(cur, cpu); + mutex_unlock(&smpboot_threads_lock); +} + +static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) +{ + unsigned int cpu; + + /* We need to destroy also the parked threads of offline cpus */ + for_each_possible_cpu(cpu) { + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); + + if (tsk) { + kthread_stop(tsk); + put_task_struct(tsk); + *per_cpu_ptr(ht->store, cpu) = NULL; + } + } +} + +/** + * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug + * @plug_thread: Hotplug thread descriptor + * + * Creates and starts the threads on all online cpus. + */ +int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) +{ + unsigned int cpu; + int ret = 0; + + mutex_lock(&smpboot_threads_lock); + for_each_online_cpu(cpu) { + ret = __smpboot_create_thread(plug_thread, cpu); + if (ret) { + smpboot_destroy_threads(plug_thread); + goto out; + } + smpboot_unpark_thread(plug_thread, cpu); + } + list_add(&plug_thread->list, &hotplug_threads); +out: + mutex_unlock(&smpboot_threads_lock); + return ret; +} +EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); + +/** + * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug + * @plug_thread: Hotplug thread descriptor + * + * Stops all threads on all possible cpus. + */ +void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) +{ + get_online_cpus(); + mutex_lock(&smpboot_threads_lock); + list_del(&plug_thread->list); + smpboot_destroy_threads(plug_thread); + mutex_unlock(&smpboot_threads_lock); + put_online_cpus(); +} +EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); diff --git a/kernel/smpboot.h b/kernel/smpboot.h index 80c0acfb847..9e26228edf6 100644 --- a/kernel/smpboot.h +++ b/kernel/smpboot.h @@ -15,4 +15,8 @@ static inline void idle_thread_set_boot_cpu(void) { } static inline void idle_threads_init(void) { } #endif +int smpboot_create_threads(unsigned int cpu); +void smpboot_park_threads(unsigned int cpu); +void smpboot_unpark_threads(unsigned int cpu); + #endif diff --git a/kernel/softirq.c b/kernel/softirq.c index 671f9594e36..090e1b92053 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -23,6 +23,7 @@ #include <linux/rcupdate.h> #include <linux/ftrace.h> #include <linux/smp.h> +#include <linux/smpboot.h> #include <linux/tick.h> #define CREATE_TRACE_POINTS @@ -733,49 +734,22 @@ void __init softirq_init(void) open_softirq(HI_SOFTIRQ, tasklet_hi_action); } -static int run_ksoftirqd(void * __bind_cpu) +static int ksoftirqd_should_run(unsigned int cpu) { - set_current_state(TASK_INTERRUPTIBLE); - - while (!kthread_should_stop()) { - preempt_disable(); - if (!local_softirq_pending()) { - schedule_preempt_disabled(); - } - - __set_current_state(TASK_RUNNING); - - while (local_softirq_pending()) { - /* Preempt disable stops cpu going offline. - If already offline, we'll be on wrong CPU: - don't process */ - if (cpu_is_offline((long)__bind_cpu)) - goto wait_to_die; - local_irq_disable(); - if (local_softirq_pending()) - __do_softirq(); - local_irq_enable(); - sched_preempt_enable_no_resched(); - cond_resched(); - preempt_disable(); - rcu_note_context_switch((long)__bind_cpu); - } - preempt_enable(); - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); - return 0; + return local_softirq_pending(); +} -wait_to_die: - preempt_enable(); - /* Wait for kthread_stop */ - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - schedule(); - set_current_state(TASK_INTERRUPTIBLE); +static void run_ksoftirqd(unsigned int cpu) +{ + local_irq_disable(); + if (local_softirq_pending()) { + __do_softirq(); + rcu_note_context_switch(cpu); + local_irq_enable(); + cond_resched(); + return; } - __set_current_state(TASK_RUNNING); - return 0; + local_irq_enable(); } #ifdef CONFIG_HOTPLUG_CPU @@ -841,50 +815,17 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - int hotcpu = (unsigned long)hcpu; - struct task_struct *p; - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - p = kthread_create_on_node(run_ksoftirqd, - hcpu, - cpu_to_node(hotcpu), - "ksoftirqd/%d", hotcpu); - if (IS_ERR(p)) { - printk("ksoftirqd for %i failed\n", hotcpu); - return notifier_from_errno(PTR_ERR(p)); - } - kthread_bind(p, hotcpu); - per_cpu(ksoftirqd, hotcpu) = p; - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - wake_up_process(per_cpu(ksoftirqd, hotcpu)); - break; #ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - if (!per_cpu(ksoftirqd, hotcpu)) - break; - /* Unbind so it can run. Fall thru. */ - kthread_bind(per_cpu(ksoftirqd, hotcpu), - cpumask_any(cpu_online_mask)); case CPU_DEAD: case CPU_DEAD_FROZEN: { - static const struct sched_param param = { - .sched_priority = MAX_RT_PRIO-1 - }; - - p = per_cpu(ksoftirqd, hotcpu); - per_cpu(ksoftirqd, hotcpu) = NULL; - sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); - kthread_stop(p); + int hotcpu = (unsigned long)hcpu; + takeover_tasklets(hotcpu); break; } #endif /* CONFIG_HOTPLUG_CPU */ - } + } return NOTIFY_OK; } @@ -892,14 +833,19 @@ static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; +static struct smp_hotplug_thread softirq_threads = { + .store = &ksoftirqd, + .thread_should_run = ksoftirqd_should_run, + .thread_fn = run_ksoftirqd, + .thread_comm = "ksoftirqd/%u", +}; + static __init int spawn_ksoftirqd(void) { - void *cpu = (void *)(long)smp_processor_id(); - int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - - BUG_ON(err != NOTIFY_OK); - cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); + + BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); + return 0; } early_initcall(spawn_ksoftirqd); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 4b1dfba70f7..9d4c8d5a1f5 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -22,6 +22,7 @@ #include <linux/notifier.h> #include <linux/module.h> #include <linux/sysctl.h> +#include <linux/smpboot.h> #include <asm/irq_regs.h> #include <linux/kvm_para.h> @@ -29,16 +30,18 @@ int watchdog_enabled = 1; int __read_mostly watchdog_thresh = 10; +static int __read_mostly watchdog_disabled; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(bool, soft_watchdog_warn); +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); +static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); #ifdef CONFIG_HARDLOCKUP_DETECTOR static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); -static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); #endif @@ -248,13 +251,15 @@ static void watchdog_overflow_callback(struct perf_event *event, __this_cpu_write(hard_watchdog_warn, false); return; } +#endif /* CONFIG_HARDLOCKUP_DETECTOR */ + static void watchdog_interrupt_count(void) { __this_cpu_inc(hrtimer_interrupts); } -#else -static inline void watchdog_interrupt_count(void) { return; } -#endif /* CONFIG_HARDLOCKUP_DETECTOR */ + +static int watchdog_nmi_enable(unsigned int cpu); +static void watchdog_nmi_disable(unsigned int cpu); /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) @@ -327,49 +332,68 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) return HRTIMER_RESTART; } +static void watchdog_set_prio(unsigned int policy, unsigned int prio) +{ + struct sched_param param = { .sched_priority = prio }; -/* - * The watchdog thread - touches the timestamp. - */ -static int watchdog(void *unused) + sched_setscheduler(current, policy, ¶m); +} + +static void watchdog_enable(unsigned int cpu) { - struct sched_param param = { .sched_priority = 0 }; struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); - /* initialize timestamp */ - __touch_watchdog(); + if (!watchdog_enabled) { + kthread_park(current); + return; + } + + /* Enable the perf event */ + watchdog_nmi_enable(cpu); /* kick off the timer for the hardlockup detector */ + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer->function = watchdog_timer_fn; + /* done here because hrtimer_start can only pin to smp_processor_id() */ hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), HRTIMER_MODE_REL_PINNED); - set_current_state(TASK_INTERRUPTIBLE); - /* - * Run briefly (kicked by the hrtimer callback function) once every - * get_sample_period() seconds (4 seconds by default) to reset the - * softlockup timestamp. If this gets delayed for more than - * 2*watchdog_thresh seconds then the debug-printout triggers in - * watchdog_timer_fn(). - */ - while (!kthread_should_stop()) { - __touch_watchdog(); - schedule(); + /* initialize timestamp */ + watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); + __touch_watchdog(); +} - if (kthread_should_stop()) - break; +static void watchdog_disable(unsigned int cpu) +{ + struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); - set_current_state(TASK_INTERRUPTIBLE); - } - /* - * Drop the policy/priority elevation during thread exit to avoid a - * scheduling latency spike. - */ - __set_current_state(TASK_RUNNING); - sched_setscheduler(current, SCHED_NORMAL, ¶m); - return 0; + watchdog_set_prio(SCHED_NORMAL, 0); + hrtimer_cancel(hrtimer); + /* disable the perf event */ + watchdog_nmi_disable(cpu); } +static int watchdog_should_run(unsigned int cpu) +{ + return __this_cpu_read(hrtimer_interrupts) != + __this_cpu_read(soft_lockup_hrtimer_cnt); +} + +/* + * The watchdog thread function - touches the timestamp. + * + * It only runs once every get_sample_period() seconds (4 seconds by + * default) to reset the softlockup timestamp. If this gets delayed + * for more than 2*watchdog_thresh seconds then the debug-printout + * triggers in watchdog_timer_fn(). + */ +static void watchdog(unsigned int cpu) +{ + __this_cpu_write(soft_lockup_hrtimer_cnt, + __this_cpu_read(hrtimer_interrupts)); + __touch_watchdog(); +} #ifdef CONFIG_HARDLOCKUP_DETECTOR /* @@ -379,7 +403,7 @@ static int watchdog(void *unused) */ static unsigned long cpu0_err; -static int watchdog_nmi_enable(int cpu) +static int watchdog_nmi_enable(unsigned int cpu) { struct perf_event_attr *wd_attr; struct perf_event *event = per_cpu(watchdog_ev, cpu); @@ -433,7 +457,7 @@ out: return 0; } -static void watchdog_nmi_disable(int cpu) +static void watchdog_nmi_disable(unsigned int cpu) { struct perf_event *event = per_cpu(watchdog_ev, cpu); @@ -447,107 +471,35 @@ static void watchdog_nmi_disable(int cpu) return; } #else -static int watchdog_nmi_enable(int cpu) { return 0; } -static void watchdog_nmi_disable(int cpu) { return; } +static int watchdog_nmi_enable(unsigned int cpu) { return 0; } +static void watchdog_nmi_disable(unsigned int cpu) { return; } #endif /* CONFIG_HARDLOCKUP_DETECTOR */ /* prepare/enable/disable routines */ -static void watchdog_prepare_cpu(int cpu) -{ - struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); - - WARN_ON(per_cpu(softlockup_watchdog, cpu)); - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtimer->function = watchdog_timer_fn; -} - -static int watchdog_enable(int cpu) -{ - struct task_struct *p = per_cpu(softlockup_watchdog, cpu); - int err = 0; - - /* enable the perf event */ - err = watchdog_nmi_enable(cpu); - - /* Regardless of err above, fall through and start softlockup */ - - /* create the watchdog thread */ - if (!p) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); - if (IS_ERR(p)) { - pr_err("softlockup watchdog for %i failed\n", cpu); - if (!err) { - /* if hardlockup hasn't already set this */ - err = PTR_ERR(p); - /* and disable the perf event */ - watchdog_nmi_disable(cpu); - } - goto out; - } - sched_setscheduler(p, SCHED_FIFO, ¶m); - kthread_bind(p, cpu); - per_cpu(watchdog_touch_ts, cpu) = 0; - per_cpu(softlockup_watchdog, cpu) = p; - wake_up_process(p); - } - -out: - return err; -} - -static void watchdog_disable(int cpu) -{ - struct task_struct *p = per_cpu(softlockup_watchdog, cpu); - struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); - - /* - * cancel the timer first to stop incrementing the stats - * and waking up the kthread - */ - hrtimer_cancel(hrtimer); - - /* disable the perf event */ - watchdog_nmi_disable(cpu); - - /* stop the watchdog thread */ - if (p) { - per_cpu(softlockup_watchdog, cpu) = NULL; - kthread_stop(p); - } -} - /* sysctl functions */ #ifdef CONFIG_SYSCTL static void watchdog_enable_all_cpus(void) { - int cpu; - - watchdog_enabled = 0; - - for_each_online_cpu(cpu) - if (!watchdog_enable(cpu)) - /* if any cpu succeeds, watchdog is considered - enabled for the system */ - watchdog_enabled = 1; - - if (!watchdog_enabled) - pr_err("failed to be enabled on some cpus\n"); + unsigned int cpu; + if (watchdog_disabled) { + watchdog_disabled = 0; + for_each_online_cpu(cpu) + kthread_unpark(per_cpu(softlockup_watchdog, cpu)); + } } static void watchdog_disable_all_cpus(void) { - int cpu; - - for_each_online_cpu(cpu) - watchdog_disable(cpu); + unsigned int cpu; - /* if all watchdogs are disabled, then they are disabled for the system */ - watchdog_enabled = 0; + if (!watchdog_disabled) { + watchdog_disabled = 1; + for_each_online_cpu(cpu) + kthread_park(per_cpu(softlockup_watchdog, cpu)); + } } - /* * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh */ @@ -557,73 +509,36 @@ int proc_dowatchdog(struct ctl_table *table, int write, { int ret; + if (watchdog_disabled < 0) + return -ENODEV; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) - goto out; + return ret; if (watchdog_enabled && watchdog_thresh) watchdog_enable_all_cpus(); else watchdog_disable_all_cpus(); -out: return ret; } #endif /* CONFIG_SYSCTL */ - -/* - * Create/destroy watchdog threads as CPUs come and go: - */ -static int __cpuinit -cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int hotcpu = (unsigned long)hcpu; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - watchdog_prepare_cpu(hotcpu); - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - if (watchdog_enabled) - watchdog_enable(hotcpu); - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - watchdog_disable(hotcpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - watchdog_disable(hotcpu); - break; -#endif /* CONFIG_HOTPLUG_CPU */ - } - - /* - * hardlockup and softlockup are not important enough - * to block cpu bring up. Just always succeed and - * rely on printk output to flag problems. - */ - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata cpu_nfb = { - .notifier_call = cpu_callback +static struct smp_hotplug_thread watchdog_threads = { + .store = &softlockup_watchdog, + .thread_should_run = watchdog_should_run, + .thread_fn = watchdog, + .thread_comm = "watchdog/%u", + .setup = watchdog_enable, + .park = watchdog_disable, + .unpark = watchdog_enable, }; void __init lockup_detector_init(void) { - void *cpu = (void *)(long)smp_processor_id(); - int err; - - err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - WARN_ON(notifier_to_errno(err)); - - cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); - register_cpu_notifier(&cpu_nfb); - - return; + if (smpboot_register_percpu_thread(&watchdog_threads)) { + pr_err("Failed to create watchdog threads, disabled\n"); + watchdog_disabled = -ENODEV; + } } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9a3128dc67d..692d97628a1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -45,32 +45,41 @@ #include "workqueue_sched.h" enum { - /* global_cwq flags */ - GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ - GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */ - GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ - GCWQ_FREEZING = 1 << 3, /* freeze in progress */ - GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */ + /* + * global_cwq flags + * + * A bound gcwq is either associated or disassociated with its CPU. + * While associated (!DISASSOCIATED), all workers are bound to the + * CPU and none has %WORKER_UNBOUND set and concurrency management + * is in effect. + * + * While DISASSOCIATED, the cpu may be offline and all workers have + * %WORKER_UNBOUND set and concurrency management disabled, and may + * be executing on any CPU. The gcwq behaves as an unbound one. + * + * Note that DISASSOCIATED can be flipped only while holding + * managership of all pools on the gcwq to avoid changing binding + * state while create_worker() is in progress. + */ + GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */ + GCWQ_FREEZING = 1 << 1, /* freeze in progress */ + + /* pool flags */ + POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ /* worker flags */ WORKER_STARTED = 1 << 0, /* started */ WORKER_DIE = 1 << 1, /* die die die */ WORKER_IDLE = 1 << 2, /* is idle */ WORKER_PREP = 1 << 3, /* preparing to run works */ - WORKER_ROGUE = 1 << 4, /* not bound to any cpu */ WORKER_REBIND = 1 << 5, /* mom is home, come back */ WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ WORKER_UNBOUND = 1 << 7, /* worker is unbound */ - WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | - WORKER_CPU_INTENSIVE | WORKER_UNBOUND, + WORKER_NOT_RUNNING = WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND | + WORKER_CPU_INTENSIVE, - /* gcwq->trustee_state */ - TRUSTEE_START = 0, /* start */ - TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */ - TRUSTEE_BUTCHER = 2, /* butcher workers */ - TRUSTEE_RELEASE = 3, /* release workers */ - TRUSTEE_DONE = 4, /* trustee is done */ + NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, @@ -84,13 +93,13 @@ enum { (min two ticks) */ MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ CREATE_COOLDOWN = HZ, /* time to breath after fail */ - TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ /* * Rescue workers are used only on emergencies and shared by * all cpus. Give -20. */ RESCUER_NICE_LEVEL = -20, + HIGHPRI_NICE_LEVEL = -20, }; /* @@ -115,6 +124,8 @@ enum { */ struct global_cwq; +struct worker_pool; +struct idle_rebind; /* * The poor guys doing the actual heavy lifting. All on-duty workers @@ -131,12 +142,31 @@ struct worker { struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ struct list_head scheduled; /* L: scheduled works */ struct task_struct *task; /* I: worker task */ - struct global_cwq *gcwq; /* I: the associated gcwq */ + struct worker_pool *pool; /* I: the associated pool */ /* 64 bytes boundary on 64bit, 32 on 32bit */ unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ int id; /* I: worker id */ - struct work_struct rebind_work; /* L: rebind worker to cpu */ + + /* for rebinding worker to CPU */ + struct idle_rebind *idle_rebind; /* L: for idle worker */ + struct work_struct rebind_work; /* L: for busy worker */ +}; + +struct worker_pool { + struct global_cwq *gcwq; /* I: the owning gcwq */ + unsigned int flags; /* X: flags */ + + struct list_head worklist; /* L: list of pending works */ + int nr_workers; /* L: total number of workers */ + int nr_idle; /* L: currently idle ones */ + + struct list_head idle_list; /* X: list of idle workers */ + struct timer_list idle_timer; /* L: worker idle timeout */ + struct timer_list mayday_timer; /* L: SOS timer for workers */ + + struct mutex manager_mutex; /* mutex manager should hold */ + struct ida worker_ida; /* L: for worker IDs */ }; /* @@ -146,27 +176,16 @@ struct worker { */ struct global_cwq { spinlock_t lock; /* the gcwq lock */ - struct list_head worklist; /* L: list of pending works */ unsigned int cpu; /* I: the associated cpu */ unsigned int flags; /* L: GCWQ_* flags */ - int nr_workers; /* L: total number of workers */ - int nr_idle; /* L: currently idle ones */ - - /* workers are chained either in the idle_list or busy_hash */ - struct list_head idle_list; /* X: list of idle workers */ + /* workers are chained either in busy_hash or pool idle_list */ struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; /* L: hash of busy workers */ - struct timer_list idle_timer; /* L: worker idle timeout */ - struct timer_list mayday_timer; /* L: SOS timer for dworkers */ - - struct ida worker_ida; /* L: for worker IDs */ + struct worker_pool pools[2]; /* normal and highpri pools */ - struct task_struct *trustee; /* L: for gcwq shutdown */ - unsigned int trustee_state; /* L: trustee state */ - wait_queue_head_t trustee_wait; /* trustee wait */ - struct worker *first_idle; /* L: first idle worker */ + wait_queue_head_t rebind_hold; /* rebind hold wait */ } ____cacheline_aligned_in_smp; /* @@ -175,7 +194,7 @@ struct global_cwq { * aligned at two's power of the number of flag bits. */ struct cpu_workqueue_struct { - struct global_cwq *gcwq; /* I: the associated gcwq */ + struct worker_pool *pool; /* I: the associated pool */ struct workqueue_struct *wq; /* I: the owning workqueue */ int work_color; /* L: current color */ int flush_color; /* L: flushing color */ @@ -264,6 +283,10 @@ EXPORT_SYMBOL_GPL(system_nrt_freezable_wq); #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> +#define for_each_worker_pool(pool, gcwq) \ + for ((pool) = &(gcwq)->pools[0]; \ + (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++) + #define for_each_busy_worker(worker, i, pos, gcwq) \ for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) @@ -444,7 +467,7 @@ static bool workqueue_freezing; /* W: have wqs started freezing? */ * try_to_wake_up(). Put it in a separate cacheline. */ static DEFINE_PER_CPU(struct global_cwq, global_cwq); -static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); +static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]); /* * Global cpu workqueue and nr_running counter for unbound gcwq. The @@ -452,10 +475,17 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); * workers have WORKER_UNBOUND set. */ static struct global_cwq unbound_global_cwq; -static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */ +static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = { + [0 ... NR_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */ +}; static int worker_thread(void *__worker); +static int worker_pool_pri(struct worker_pool *pool) +{ + return pool - pool->gcwq->pools; +} + static struct global_cwq *get_gcwq(unsigned int cpu) { if (cpu != WORK_CPU_UNBOUND) @@ -464,12 +494,15 @@ static struct global_cwq *get_gcwq(unsigned int cpu) return &unbound_global_cwq; } -static atomic_t *get_gcwq_nr_running(unsigned int cpu) +static atomic_t *get_pool_nr_running(struct worker_pool *pool) { + int cpu = pool->gcwq->cpu; + int idx = worker_pool_pri(pool); + if (cpu != WORK_CPU_UNBOUND) - return &per_cpu(gcwq_nr_running, cpu); + return &per_cpu(pool_nr_running, cpu)[idx]; else - return &unbound_gcwq_nr_running; + return &unbound_pool_nr_running[idx]; } static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, @@ -555,7 +588,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) if (data & WORK_STRUCT_CWQ) return ((struct cpu_workqueue_struct *) - (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq; + (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; cpu = data >> WORK_STRUCT_FLAG_BITS; if (cpu == WORK_CPU_NONE) @@ -566,60 +599,62 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) } /* - * Policy functions. These define the policies on how the global - * worker pool is managed. Unless noted otherwise, these functions - * assume that they're being called with gcwq->lock held. + * Policy functions. These define the policies on how the global worker + * pools are managed. Unless noted otherwise, these functions assume that + * they're being called with gcwq->lock held. */ -static bool __need_more_worker(struct global_cwq *gcwq) +static bool __need_more_worker(struct worker_pool *pool) { - return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) || - gcwq->flags & GCWQ_HIGHPRI_PENDING; + return !atomic_read(get_pool_nr_running(pool)); } /* * Need to wake up a worker? Called from anything but currently * running workers. + * + * Note that, because unbound workers never contribute to nr_running, this + * function will always return %true for unbound gcwq as long as the + * worklist isn't empty. */ -static bool need_more_worker(struct global_cwq *gcwq) +static bool need_more_worker(struct worker_pool *pool) { - return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq); + return !list_empty(&pool->worklist) && __need_more_worker(pool); } /* Can I start working? Called from busy but !running workers. */ -static bool may_start_working(struct global_cwq *gcwq) +static bool may_start_working(struct worker_pool *pool) { - return gcwq->nr_idle; + return pool->nr_idle; } /* Do I need to keep working? Called from currently running workers. */ -static bool keep_working(struct global_cwq *gcwq) +static bool keep_working(struct worker_pool *pool) { - atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); + atomic_t *nr_running = get_pool_nr_running(pool); - return !list_empty(&gcwq->worklist) && - (atomic_read(nr_running) <= 1 || - gcwq->flags & GCWQ_HIGHPRI_PENDING); + return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1; } /* Do we need a new worker? Called from manager. */ -static bool need_to_create_worker(struct global_cwq *gcwq) +static bool need_to_create_worker(struct worker_pool *pool) { - return need_more_worker(gcwq) && !may_start_working(gcwq); + return need_more_worker(pool) && !may_start_working(pool); } /* Do I need to be the manager? */ -static bool need_to_manage_workers(struct global_cwq *gcwq) +static bool need_to_manage_workers(struct worker_pool *pool) { - return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS; + return need_to_create_worker(pool) || + (pool->flags & POOL_MANAGE_WORKERS); } /* Do we have too many workers and should some go away? */ -static bool too_many_workers(struct global_cwq *gcwq) +static bool too_many_workers(struct worker_pool *pool) { - bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS; - int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */ - int nr_busy = gcwq->nr_workers - nr_idle; + bool managing = mutex_is_locked(&pool->manager_mutex); + int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ + int nr_busy = pool->nr_workers - nr_idle; return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } @@ -629,26 +664,26 @@ static bool too_many_workers(struct global_cwq *gcwq) */ /* Return the first worker. Safe with preemption disabled */ -static struct worker *first_worker(struct global_cwq *gcwq) +static struct worker *first_worker(struct worker_pool *pool) { - if (unlikely(list_empty(&gcwq->idle_list))) + if (unlikely(list_empty(&pool->idle_list))) return NULL; - return list_first_entry(&gcwq->idle_list, struct worker, entry); + return list_first_entry(&pool->idle_list, struct worker, entry); } /** * wake_up_worker - wake up an idle worker - * @gcwq: gcwq to wake worker for + * @pool: worker pool to wake worker from * - * Wake up the first idle worker of @gcwq. + * Wake up the first idle worker of @pool. * * CONTEXT: * spin_lock_irq(gcwq->lock). */ -static void wake_up_worker(struct global_cwq *gcwq) +static void wake_up_worker(struct worker_pool *pool) { - struct worker *worker = first_worker(gcwq); + struct worker *worker = first_worker(pool); if (likely(worker)) wake_up_process(worker->task); @@ -670,7 +705,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) struct worker *worker = kthread_data(task); if (!(worker->flags & WORKER_NOT_RUNNING)) - atomic_inc(get_gcwq_nr_running(cpu)); + atomic_inc(get_pool_nr_running(worker->pool)); } /** @@ -692,8 +727,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, unsigned int cpu) { struct worker *worker = kthread_data(task), *to_wakeup = NULL; - struct global_cwq *gcwq = get_gcwq(cpu); - atomic_t *nr_running = get_gcwq_nr_running(cpu); + struct worker_pool *pool = worker->pool; + atomic_t *nr_running = get_pool_nr_running(pool); if (worker->flags & WORKER_NOT_RUNNING) return NULL; @@ -706,14 +741,14 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, * worklist not empty test sequence is in insert_work(). * Please read comment there. * - * NOT_RUNNING is clear. This means that trustee is not in - * charge and we're running on the local cpu w/ rq lock held - * and preemption disabled, which in turn means that none else - * could be manipulating idle_list, so dereferencing idle_list - * without gcwq lock is safe. + * NOT_RUNNING is clear. This means that we're bound to and + * running on the local cpu w/ rq lock held and preemption + * disabled, which in turn means that none else could be + * manipulating idle_list, so dereferencing idle_list without gcwq + * lock is safe. */ - if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist)) - to_wakeup = first_worker(gcwq); + if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) + to_wakeup = first_worker(pool); return to_wakeup ? to_wakeup->task : NULL; } @@ -733,7 +768,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, static inline void worker_set_flags(struct worker *worker, unsigned int flags, bool wakeup) { - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; WARN_ON_ONCE(worker->task != current); @@ -744,12 +779,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags, */ if ((flags & WORKER_NOT_RUNNING) && !(worker->flags & WORKER_NOT_RUNNING)) { - atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); + atomic_t *nr_running = get_pool_nr_running(pool); if (wakeup) { if (atomic_dec_and_test(nr_running) && - !list_empty(&gcwq->worklist)) - wake_up_worker(gcwq); + !list_empty(&pool->worklist)) + wake_up_worker(pool); } else atomic_dec(nr_running); } @@ -769,7 +804,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags, */ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) { - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; unsigned int oflags = worker->flags; WARN_ON_ONCE(worker->task != current); @@ -783,7 +818,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) */ if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) if (!(worker->flags & WORKER_NOT_RUNNING)) - atomic_inc(get_gcwq_nr_running(gcwq->cpu)); + atomic_inc(get_pool_nr_running(pool)); } /** @@ -867,43 +902,6 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq, } /** - * gcwq_determine_ins_pos - find insertion position - * @gcwq: gcwq of interest - * @cwq: cwq a work is being queued for - * - * A work for @cwq is about to be queued on @gcwq, determine insertion - * position for the work. If @cwq is for HIGHPRI wq, the work is - * queued at the head of the queue but in FIFO order with respect to - * other HIGHPRI works; otherwise, at the end of the queue. This - * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that - * there are HIGHPRI works pending. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - * - * RETURNS: - * Pointer to inserstion position. - */ -static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq, - struct cpu_workqueue_struct *cwq) -{ - struct work_struct *twork; - - if (likely(!(cwq->wq->flags & WQ_HIGHPRI))) - return &gcwq->worklist; - - list_for_each_entry(twork, &gcwq->worklist, entry) { - struct cpu_workqueue_struct *tcwq = get_work_cwq(twork); - - if (!(tcwq->wq->flags & WQ_HIGHPRI)) - break; - } - - gcwq->flags |= GCWQ_HIGHPRI_PENDING; - return &twork->entry; -} - -/** * insert_work - insert a work into gcwq * @cwq: cwq @work belongs to * @work: work to insert @@ -920,7 +918,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq, struct work_struct *work, struct list_head *head, unsigned int extra_flags) { - struct global_cwq *gcwq = cwq->gcwq; + struct worker_pool *pool = cwq->pool; /* we own @work, set data and link */ set_work_cwq(work, cwq, extra_flags); @@ -940,8 +938,8 @@ static void insert_work(struct cpu_workqueue_struct *cwq, */ smp_mb(); - if (__need_more_worker(gcwq)) - wake_up_worker(gcwq); + if (__need_more_worker(pool)) + wake_up_worker(pool); } /* @@ -1043,7 +1041,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, if (likely(cwq->nr_active < cwq->max_active)) { trace_workqueue_activate_work(work); cwq->nr_active++; - worklist = gcwq_determine_ins_pos(gcwq, cwq); + worklist = &cwq->pool->worklist; } else { work_flags |= WORK_STRUCT_DELAYED; worklist = &cwq->delayed_works; @@ -1192,7 +1190,8 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on); */ static void worker_enter_idle(struct worker *worker) { - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; + struct global_cwq *gcwq = pool->gcwq; BUG_ON(worker->flags & WORKER_IDLE); BUG_ON(!list_empty(&worker->entry) && @@ -1200,27 +1199,24 @@ static void worker_enter_idle(struct worker *worker) /* can't use worker_set_flags(), also called from start_worker() */ worker->flags |= WORKER_IDLE; - gcwq->nr_idle++; + pool->nr_idle++; worker->last_active = jiffies; /* idle_list is LIFO */ - list_add(&worker->entry, &gcwq->idle_list); + list_add(&worker->entry, &pool->idle_list); - if (likely(!(worker->flags & WORKER_ROGUE))) { - if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) - mod_timer(&gcwq->idle_timer, - jiffies + IDLE_WORKER_TIMEOUT); - } else - wake_up_all(&gcwq->trustee_wait); + if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) + mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); /* - * Sanity check nr_running. Because trustee releases gcwq->lock - * between setting %WORKER_ROGUE and zapping nr_running, the - * warning may trigger spuriously. Check iff trustee is idle. + * Sanity check nr_running. Because gcwq_unbind_fn() releases + * gcwq->lock between setting %WORKER_UNBOUND and zapping + * nr_running, the warning may trigger spuriously. Check iff + * unbind is not in progress. */ - WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && - gcwq->nr_workers == gcwq->nr_idle && - atomic_read(get_gcwq_nr_running(gcwq->cpu))); + WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) && + pool->nr_workers == pool->nr_idle && + atomic_read(get_pool_nr_running(pool))); } /** @@ -1234,11 +1230,11 @@ static void worker_enter_idle(struct worker *worker) */ static void worker_leave_idle(struct worker *worker) { - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; BUG_ON(!(worker->flags & WORKER_IDLE)); worker_clr_flags(worker, WORKER_IDLE); - gcwq->nr_idle--; + pool->nr_idle--; list_del_init(&worker->entry); } @@ -1258,11 +1254,11 @@ static void worker_leave_idle(struct worker *worker) * verbatim as it's best effort and blocking and gcwq may be * [dis]associated in the meantime. * - * This function tries set_cpus_allowed() and locks gcwq and verifies - * the binding against GCWQ_DISASSOCIATED which is set during - * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters - * idle state or fetches works without dropping lock, it can guarantee - * the scheduling requirement described in the first paragraph. + * This function tries set_cpus_allowed() and locks gcwq and verifies the + * binding against %GCWQ_DISASSOCIATED which is set during + * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker + * enters idle state or fetches works without dropping lock, it can + * guarantee the scheduling requirement described in the first paragraph. * * CONTEXT: * Might sleep. Called without any lock but returns with gcwq->lock @@ -1275,7 +1271,7 @@ static void worker_leave_idle(struct worker *worker) static bool worker_maybe_bind_and_lock(struct worker *worker) __acquires(&gcwq->lock) { - struct global_cwq *gcwq = worker->gcwq; + struct global_cwq *gcwq = worker->pool->gcwq; struct task_struct *task = worker->task; while (true) { @@ -1308,16 +1304,40 @@ __acquires(&gcwq->lock) } } +struct idle_rebind { + int cnt; /* # workers to be rebound */ + struct completion done; /* all workers rebound */ +}; + +/* + * Rebind an idle @worker to its CPU. During CPU onlining, this has to + * happen synchronously for idle workers. worker_thread() will test + * %WORKER_REBIND before leaving idle and call this function. + */ +static void idle_worker_rebind(struct worker *worker) +{ + struct global_cwq *gcwq = worker->pool->gcwq; + + /* CPU must be online at this point */ + WARN_ON(!worker_maybe_bind_and_lock(worker)); + if (!--worker->idle_rebind->cnt) + complete(&worker->idle_rebind->done); + spin_unlock_irq(&worker->pool->gcwq->lock); + + /* we did our part, wait for rebind_workers() to finish up */ + wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); +} + /* - * Function for worker->rebind_work used to rebind rogue busy workers - * to the associated cpu which is coming back online. This is - * scheduled by cpu up but can race with other cpu hotplug operations - * and may be executed twice without intervening cpu down. + * Function for @worker->rebind.work used to rebind unbound busy workers to + * the associated cpu which is coming back online. This is scheduled by + * cpu up but can race with other cpu hotplug operations and may be + * executed twice without intervening cpu down. */ -static void worker_rebind_fn(struct work_struct *work) +static void busy_worker_rebind_fn(struct work_struct *work) { struct worker *worker = container_of(work, struct worker, rebind_work); - struct global_cwq *gcwq = worker->gcwq; + struct global_cwq *gcwq = worker->pool->gcwq; if (worker_maybe_bind_and_lock(worker)) worker_clr_flags(worker, WORKER_REBIND); @@ -1325,6 +1345,112 @@ static void worker_rebind_fn(struct work_struct *work) spin_unlock_irq(&gcwq->lock); } +/** + * rebind_workers - rebind all workers of a gcwq to the associated CPU + * @gcwq: gcwq of interest + * + * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding + * is different for idle and busy ones. + * + * The idle ones should be rebound synchronously and idle rebinding should + * be complete before any worker starts executing work items with + * concurrency management enabled; otherwise, scheduler may oops trying to + * wake up non-local idle worker from wq_worker_sleeping(). + * + * This is achieved by repeatedly requesting rebinding until all idle + * workers are known to have been rebound under @gcwq->lock and holding all + * idle workers from becoming busy until idle rebinding is complete. + * + * Once idle workers are rebound, busy workers can be rebound as they + * finish executing their current work items. Queueing the rebind work at + * the head of their scheduled lists is enough. Note that nr_running will + * be properbly bumped as busy workers rebind. + * + * On return, all workers are guaranteed to either be bound or have rebind + * work item scheduled. + */ +static void rebind_workers(struct global_cwq *gcwq) + __releases(&gcwq->lock) __acquires(&gcwq->lock) +{ + struct idle_rebind idle_rebind; + struct worker_pool *pool; + struct worker *worker; + struct hlist_node *pos; + int i; + + lockdep_assert_held(&gcwq->lock); + + for_each_worker_pool(pool, gcwq) + lockdep_assert_held(&pool->manager_mutex); + + /* + * Rebind idle workers. Interlocked both ways. We wait for + * workers to rebind via @idle_rebind.done. Workers will wait for + * us to finish up by watching %WORKER_REBIND. + */ + init_completion(&idle_rebind.done); +retry: + idle_rebind.cnt = 1; + INIT_COMPLETION(idle_rebind.done); + + /* set REBIND and kick idle ones, we'll wait for these later */ + for_each_worker_pool(pool, gcwq) { + list_for_each_entry(worker, &pool->idle_list, entry) { + if (worker->flags & WORKER_REBIND) + continue; + + /* morph UNBOUND to REBIND */ + worker->flags &= ~WORKER_UNBOUND; + worker->flags |= WORKER_REBIND; + + idle_rebind.cnt++; + worker->idle_rebind = &idle_rebind; + + /* worker_thread() will call idle_worker_rebind() */ + wake_up_process(worker->task); + } + } + + if (--idle_rebind.cnt) { + spin_unlock_irq(&gcwq->lock); + wait_for_completion(&idle_rebind.done); + spin_lock_irq(&gcwq->lock); + /* busy ones might have become idle while waiting, retry */ + goto retry; + } + + /* + * All idle workers are rebound and waiting for %WORKER_REBIND to + * be cleared inside idle_worker_rebind(). Clear and release. + * Clearing %WORKER_REBIND from this foreign context is safe + * because these workers are still guaranteed to be idle. + */ + for_each_worker_pool(pool, gcwq) + list_for_each_entry(worker, &pool->idle_list, entry) + worker->flags &= ~WORKER_REBIND; + + wake_up_all(&gcwq->rebind_hold); + + /* rebind busy workers */ + for_each_busy_worker(worker, i, pos, gcwq) { + struct work_struct *rebind_work = &worker->rebind_work; + + /* morph UNBOUND to REBIND */ + worker->flags &= ~WORKER_UNBOUND; + worker->flags |= WORKER_REBIND; + + if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, + work_data_bits(rebind_work))) + continue; + + /* wq doesn't matter, use the default one */ + debug_work_activate(rebind_work); + insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, + worker->scheduled.next, + work_color_to_flags(WORK_NO_COLOR)); + } +} + static struct worker *alloc_worker(void) { struct worker *worker; @@ -1333,7 +1459,7 @@ static struct worker *alloc_worker(void) if (worker) { INIT_LIST_HEAD(&worker->entry); INIT_LIST_HEAD(&worker->scheduled); - INIT_WORK(&worker->rebind_work, worker_rebind_fn); + INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn); /* on creation a worker is in !idle && prep state */ worker->flags = WORKER_PREP; } @@ -1342,10 +1468,9 @@ static struct worker *alloc_worker(void) /** * create_worker - create a new workqueue worker - * @gcwq: gcwq the new worker will belong to - * @bind: whether to set affinity to @cpu or not + * @pool: pool the new worker will belong to * - * Create a new worker which is bound to @gcwq. The returned worker + * Create a new worker which is bound to @pool. The returned worker * can be started by calling start_worker() or destroyed using * destroy_worker(). * @@ -1355,16 +1480,17 @@ static struct worker *alloc_worker(void) * RETURNS: * Pointer to the newly created worker. */ -static struct worker *create_worker(struct global_cwq *gcwq, bool bind) +static struct worker *create_worker(struct worker_pool *pool) { - bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; + struct global_cwq *gcwq = pool->gcwq; + const char *pri = worker_pool_pri(pool) ? "H" : ""; struct worker *worker = NULL; int id = -1; spin_lock_irq(&gcwq->lock); - while (ida_get_new(&gcwq->worker_ida, &id)) { + while (ida_get_new(&pool->worker_ida, &id)) { spin_unlock_irq(&gcwq->lock); - if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL)) + if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) goto fail; spin_lock_irq(&gcwq->lock); } @@ -1374,38 +1500,43 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind) if (!worker) goto fail; - worker->gcwq = gcwq; + worker->pool = pool; worker->id = id; - if (!on_unbound_cpu) + if (gcwq->cpu != WORK_CPU_UNBOUND) worker->task = kthread_create_on_node(worker_thread, - worker, - cpu_to_node(gcwq->cpu), - "kworker/%u:%d", gcwq->cpu, id); + worker, cpu_to_node(gcwq->cpu), + "kworker/%u:%d%s", gcwq->cpu, id, pri); else worker->task = kthread_create(worker_thread, worker, - "kworker/u:%d", id); + "kworker/u:%d%s", id, pri); if (IS_ERR(worker->task)) goto fail; + if (worker_pool_pri(pool)) + set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); + /* - * A rogue worker will become a regular one if CPU comes - * online later on. Make sure every worker has - * PF_THREAD_BOUND set. + * Determine CPU binding of the new worker depending on + * %GCWQ_DISASSOCIATED. The caller is responsible for ensuring the + * flag remains stable across this function. See the comments + * above the flag definition for details. + * + * As an unbound worker may later become a regular one if CPU comes + * online, make sure every worker has %PF_THREAD_BOUND set. */ - if (bind && !on_unbound_cpu) + if (!(gcwq->flags & GCWQ_DISASSOCIATED)) { kthread_bind(worker->task, gcwq->cpu); - else { + } else { worker->task->flags |= PF_THREAD_BOUND; - if (on_unbound_cpu) - worker->flags |= WORKER_UNBOUND; + worker->flags |= WORKER_UNBOUND; } return worker; fail: if (id >= 0) { spin_lock_irq(&gcwq->lock); - ida_remove(&gcwq->worker_ida, id); + ida_remove(&pool->worker_ida, id); spin_unlock_irq(&gcwq->lock); } kfree(worker); @@ -1424,7 +1555,7 @@ fail: static void start_worker(struct worker *worker) { worker->flags |= WORKER_STARTED; - worker->gcwq->nr_workers++; + worker->pool->nr_workers++; worker_enter_idle(worker); wake_up_process(worker->task); } @@ -1440,7 +1571,8 @@ static void start_worker(struct worker *worker) */ static void destroy_worker(struct worker *worker) { - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; + struct global_cwq *gcwq = pool->gcwq; int id = worker->id; /* sanity check frenzy */ @@ -1448,9 +1580,9 @@ static void destroy_worker(struct worker *worker) BUG_ON(!list_empty(&worker->scheduled)); if (worker->flags & WORKER_STARTED) - gcwq->nr_workers--; + pool->nr_workers--; if (worker->flags & WORKER_IDLE) - gcwq->nr_idle--; + pool->nr_idle--; list_del_init(&worker->entry); worker->flags |= WORKER_DIE; @@ -1461,29 +1593,30 @@ static void destroy_worker(struct worker *worker) kfree(worker); spin_lock_irq(&gcwq->lock); - ida_remove(&gcwq->worker_ida, id); + ida_remove(&pool->worker_ida, id); } -static void idle_worker_timeout(unsigned long __gcwq) +static void idle_worker_timeout(unsigned long __pool) { - struct global_cwq *gcwq = (void *)__gcwq; + struct worker_pool *pool = (void *)__pool; + struct global_cwq *gcwq = pool->gcwq; spin_lock_irq(&gcwq->lock); - if (too_many_workers(gcwq)) { + if (too_many_workers(pool)) { struct worker *worker; unsigned long expires; /* idle_list is kept in LIFO order, check the last one */ - worker = list_entry(gcwq->idle_list.prev, struct worker, entry); + worker = list_entry(pool->idle_list.prev, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; if (time_before(jiffies, expires)) - mod_timer(&gcwq->idle_timer, expires); + mod_timer(&pool->idle_timer, expires); else { /* it's been idle for too long, wake up manager */ - gcwq->flags |= GCWQ_MANAGE_WORKERS; - wake_up_worker(gcwq); + pool->flags |= POOL_MANAGE_WORKERS; + wake_up_worker(pool); } } @@ -1500,7 +1633,7 @@ static bool send_mayday(struct work_struct *work) return false; /* mayday mayday mayday */ - cpu = cwq->gcwq->cpu; + cpu = cwq->pool->gcwq->cpu; /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ if (cpu == WORK_CPU_UNBOUND) cpu = 0; @@ -1509,37 +1642,38 @@ static bool send_mayday(struct work_struct *work) return true; } -static void gcwq_mayday_timeout(unsigned long __gcwq) +static void gcwq_mayday_timeout(unsigned long __pool) { - struct global_cwq *gcwq = (void *)__gcwq; + struct worker_pool *pool = (void *)__pool; + struct global_cwq *gcwq = pool->gcwq; struct work_struct *work; spin_lock_irq(&gcwq->lock); - if (need_to_create_worker(gcwq)) { + if (need_to_create_worker(pool)) { /* * We've been trying to create a new worker but * haven't been successful. We might be hitting an * allocation deadlock. Send distress signals to * rescuers. */ - list_for_each_entry(work, &gcwq->worklist, entry) + list_for_each_entry(work, &pool->worklist, entry) send_mayday(work); } spin_unlock_irq(&gcwq->lock); - mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL); + mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); } /** * maybe_create_worker - create a new worker if necessary - * @gcwq: gcwq to create a new worker for + * @pool: pool to create a new worker for * - * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to + * Create a new worker for @pool if necessary. @pool is guaranteed to * have at least one idle worker on return from this function. If * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is - * sent to all rescuers with works scheduled on @gcwq to resolve + * sent to all rescuers with works scheduled on @pool to resolve * possible allocation deadlock. * * On return, need_to_create_worker() is guaranteed to be false and @@ -1554,52 +1688,54 @@ static void gcwq_mayday_timeout(unsigned long __gcwq) * false if no action was taken and gcwq->lock stayed locked, true * otherwise. */ -static bool maybe_create_worker(struct global_cwq *gcwq) +static bool maybe_create_worker(struct worker_pool *pool) __releases(&gcwq->lock) __acquires(&gcwq->lock) { - if (!need_to_create_worker(gcwq)) + struct global_cwq *gcwq = pool->gcwq; + + if (!need_to_create_worker(pool)) return false; restart: spin_unlock_irq(&gcwq->lock); /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ - mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); + mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); while (true) { struct worker *worker; - worker = create_worker(gcwq, true); + worker = create_worker(pool); if (worker) { - del_timer_sync(&gcwq->mayday_timer); + del_timer_sync(&pool->mayday_timer); spin_lock_irq(&gcwq->lock); start_worker(worker); - BUG_ON(need_to_create_worker(gcwq)); + BUG_ON(need_to_create_worker(pool)); return true; } - if (!need_to_create_worker(gcwq)) + if (!need_to_create_worker(pool)) break; __set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(CREATE_COOLDOWN); - if (!need_to_create_worker(gcwq)) + if (!need_to_create_worker(pool)) break; } - del_timer_sync(&gcwq->mayday_timer); + del_timer_sync(&pool->mayday_timer); spin_lock_irq(&gcwq->lock); - if (need_to_create_worker(gcwq)) + if (need_to_create_worker(pool)) goto restart; return true; } /** * maybe_destroy_worker - destroy workers which have been idle for a while - * @gcwq: gcwq to destroy workers for + * @pool: pool to destroy workers for * - * Destroy @gcwq workers which have been idle for longer than + * Destroy @pool workers which have been idle for longer than * IDLE_WORKER_TIMEOUT. * * LOCKING: @@ -1610,19 +1746,19 @@ restart: * false if no action was taken and gcwq->lock stayed locked, true * otherwise. */ -static bool maybe_destroy_workers(struct global_cwq *gcwq) +static bool maybe_destroy_workers(struct worker_pool *pool) { bool ret = false; - while (too_many_workers(gcwq)) { + while (too_many_workers(pool)) { struct worker *worker; unsigned long expires; - worker = list_entry(gcwq->idle_list.prev, struct worker, entry); + worker = list_entry(pool->idle_list.prev, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; if (time_before(jiffies, expires)) { - mod_timer(&gcwq->idle_timer, expires); + mod_timer(&pool->idle_timer, expires); break; } @@ -1655,31 +1791,22 @@ static bool maybe_destroy_workers(struct global_cwq *gcwq) */ static bool manage_workers(struct worker *worker) { - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; bool ret = false; - if (gcwq->flags & GCWQ_MANAGING_WORKERS) + if (!mutex_trylock(&pool->manager_mutex)) return ret; - gcwq->flags &= ~GCWQ_MANAGE_WORKERS; - gcwq->flags |= GCWQ_MANAGING_WORKERS; + pool->flags &= ~POOL_MANAGE_WORKERS; /* * Destroy and then create so that may_start_working() is true * on return. */ - ret |= maybe_destroy_workers(gcwq); - ret |= maybe_create_worker(gcwq); - - gcwq->flags &= ~GCWQ_MANAGING_WORKERS; - - /* - * The trustee might be waiting to take over the manager - * position, tell it we're done. - */ - if (unlikely(gcwq->trustee)) - wake_up_all(&gcwq->trustee_wait); + ret |= maybe_destroy_workers(pool); + ret |= maybe_create_worker(pool); + mutex_unlock(&pool->manager_mutex); return ret; } @@ -1728,10 +1855,9 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) { struct work_struct *work = list_first_entry(&cwq->delayed_works, struct work_struct, entry); - struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); trace_workqueue_activate_work(work); - move_linked_works(work, pos, NULL); + move_linked_works(work, &cwq->pool->worklist, NULL); __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); cwq->nr_active++; } @@ -1804,7 +1930,8 @@ __releases(&gcwq->lock) __acquires(&gcwq->lock) { struct cpu_workqueue_struct *cwq = get_work_cwq(work); - struct global_cwq *gcwq = cwq->gcwq; + struct worker_pool *pool = worker->pool; + struct global_cwq *gcwq = pool->gcwq; struct hlist_head *bwh = busy_worker_head(gcwq, work); bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; work_func_t f = work->func; @@ -1823,6 +1950,15 @@ __acquires(&gcwq->lock) lockdep_copy_map(&lockdep_map, &work->lockdep_map); #endif /* + * Ensure we're on the correct CPU. DISASSOCIATED test is + * necessary to avoid spurious warnings from rescuers servicing the + * unbound or a disassociated gcwq. + */ + WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) && + !(gcwq->flags & GCWQ_DISASSOCIATED) && + raw_smp_processor_id() != gcwq->cpu); + + /* * A single work shouldn't be executed concurrently by * multiple workers on a single cpu. Check whether anyone is * already processing the work. If so, defer the work to the @@ -1846,27 +1982,19 @@ __acquires(&gcwq->lock) list_del_init(&work->entry); /* - * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI, - * wake up another worker; otherwise, clear HIGHPRI_PENDING. - */ - if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) { - struct work_struct *nwork = list_first_entry(&gcwq->worklist, - struct work_struct, entry); - - if (!list_empty(&gcwq->worklist) && - get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI) - wake_up_worker(gcwq); - else - gcwq->flags &= ~GCWQ_HIGHPRI_PENDING; - } - - /* * CPU intensive works don't participate in concurrency * management. They're the scheduler's responsibility. */ if (unlikely(cpu_intensive)) worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); + /* + * Unbound gcwq isn't concurrency managed and work items should be + * executed ASAP. Wake up another worker if necessary. + */ + if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) + wake_up_worker(pool); + spin_unlock_irq(&gcwq->lock); work_clear_pending(work); @@ -1939,28 +2067,38 @@ static void process_scheduled_works(struct worker *worker) static int worker_thread(void *__worker) { struct worker *worker = __worker; - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; + struct global_cwq *gcwq = pool->gcwq; /* tell the scheduler that this is a workqueue worker */ worker->task->flags |= PF_WQ_WORKER; woke_up: spin_lock_irq(&gcwq->lock); - /* DIE can be set only while we're idle, checking here is enough */ - if (worker->flags & WORKER_DIE) { + /* + * DIE can be set only while idle and REBIND set while busy has + * @worker->rebind_work scheduled. Checking here is enough. + */ + if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) { spin_unlock_irq(&gcwq->lock); - worker->task->flags &= ~PF_WQ_WORKER; - return 0; + + if (worker->flags & WORKER_DIE) { + worker->task->flags &= ~PF_WQ_WORKER; + return 0; + } + + idle_worker_rebind(worker); + goto woke_up; } worker_leave_idle(worker); recheck: /* no more worker necessary? */ - if (!need_more_worker(gcwq)) + if (!need_more_worker(pool)) goto sleep; /* do we need to manage? */ - if (unlikely(!may_start_working(gcwq)) && manage_workers(worker)) + if (unlikely(!may_start_working(pool)) && manage_workers(worker)) goto recheck; /* @@ -1979,7 +2117,7 @@ recheck: do { struct work_struct *work = - list_first_entry(&gcwq->worklist, + list_first_entry(&pool->worklist, struct work_struct, entry); if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { @@ -1991,11 +2129,11 @@ recheck: move_linked_works(work, &worker->scheduled, NULL); process_scheduled_works(worker); } - } while (keep_working(gcwq)); + } while (keep_working(pool)); worker_set_flags(worker, WORKER_PREP, false); sleep: - if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker)) + if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker)) goto recheck; /* @@ -2053,14 +2191,15 @@ repeat: for_each_mayday_cpu(cpu, wq->mayday_mask) { unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); - struct global_cwq *gcwq = cwq->gcwq; + struct worker_pool *pool = cwq->pool; + struct global_cwq *gcwq = pool->gcwq; struct work_struct *work, *n; __set_current_state(TASK_RUNNING); mayday_clear_cpu(cpu, wq->mayday_mask); /* migrate to the target cpu if possible */ - rescuer->gcwq = gcwq; + rescuer->pool = pool; worker_maybe_bind_and_lock(rescuer); /* @@ -2068,7 +2207,7 @@ repeat: * process'em. */ BUG_ON(!list_empty(&rescuer->scheduled)); - list_for_each_entry_safe(work, n, &gcwq->worklist, entry) + list_for_each_entry_safe(work, n, &pool->worklist, entry) if (get_work_cwq(work) == cwq) move_linked_works(work, scheduled, &n); @@ -2079,8 +2218,8 @@ repeat: * regular worker; otherwise, we end up with 0 concurrency * and stalling the execution. */ - if (keep_working(gcwq)) - wake_up_worker(gcwq); + if (keep_working(pool)) + wake_up_worker(pool); spin_unlock_irq(&gcwq->lock); } @@ -2205,7 +2344,7 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, for_each_cwq_cpu(cpu, wq) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - struct global_cwq *gcwq = cwq->gcwq; + struct global_cwq *gcwq = cwq->pool->gcwq; spin_lock_irq(&gcwq->lock); @@ -2421,9 +2560,9 @@ reflush: struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); bool drained; - spin_lock_irq(&cwq->gcwq->lock); + spin_lock_irq(&cwq->pool->gcwq->lock); drained = !cwq->nr_active && list_empty(&cwq->delayed_works); - spin_unlock_irq(&cwq->gcwq->lock); + spin_unlock_irq(&cwq->pool->gcwq->lock); if (drained) continue; @@ -2463,7 +2602,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, */ smp_rmb(); cwq = get_work_cwq(work); - if (unlikely(!cwq || gcwq != cwq->gcwq)) + if (unlikely(!cwq || gcwq != cwq->pool->gcwq)) goto already_gone; } else if (wait_executing) { worker = find_worker_executing_work(gcwq, work); @@ -2984,13 +3123,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, if (flags & WQ_MEM_RECLAIM) flags |= WQ_RESCUER; - /* - * Unbound workqueues aren't concurrency managed and should be - * dispatched to workers immediately. - */ - if (flags & WQ_UNBOUND) - flags |= WQ_HIGHPRI; - max_active = max_active ?: WQ_DFL_ACTIVE; max_active = wq_clamp_max_active(max_active, flags, wq->name); @@ -3011,9 +3143,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, for_each_cwq_cpu(cpu, wq) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); struct global_cwq *gcwq = get_gcwq(cpu); + int pool_idx = (bool)(flags & WQ_HIGHPRI); BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); - cwq->gcwq = gcwq; + cwq->pool = &gcwq->pools[pool_idx]; cwq->wq = wq; cwq->flush_color = -1; cwq->max_active = max_active; @@ -3225,369 +3358,143 @@ EXPORT_SYMBOL_GPL(work_busy); * gcwqs serve mix of short, long and very long running works making * blocked draining impractical. * - * This is solved by allowing a gcwq to be detached from CPU, running - * it with unbound (rogue) workers and allowing it to be reattached - * later if the cpu comes back online. A separate thread is created - * to govern a gcwq in such state and is called the trustee of the - * gcwq. - * - * Trustee states and their descriptions. - * - * START Command state used on startup. On CPU_DOWN_PREPARE, a - * new trustee is started with this state. - * - * IN_CHARGE Once started, trustee will enter this state after - * assuming the manager role and making all existing - * workers rogue. DOWN_PREPARE waits for trustee to - * enter this state. After reaching IN_CHARGE, trustee - * tries to execute the pending worklist until it's empty - * and the state is set to BUTCHER, or the state is set - * to RELEASE. - * - * BUTCHER Command state which is set by the cpu callback after - * the cpu has went down. Once this state is set trustee - * knows that there will be no new works on the worklist - * and once the worklist is empty it can proceed to - * killing idle workers. - * - * RELEASE Command state which is set by the cpu callback if the - * cpu down has been canceled or it has come online - * again. After recognizing this state, trustee stops - * trying to drain or butcher and clears ROGUE, rebinds - * all remaining workers back to the cpu and releases - * manager role. - * - * DONE Trustee will enter this state after BUTCHER or RELEASE - * is complete. - * - * trustee CPU draining - * took over down complete - * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE - * | | ^ - * | CPU is back online v return workers | - * ----------------> RELEASE -------------- + * This is solved by allowing a gcwq to be disassociated from the CPU + * running as an unbound one and allowing it to be reattached later if the + * cpu comes back online. */ -/** - * trustee_wait_event_timeout - timed event wait for trustee - * @cond: condition to wait for - * @timeout: timeout in jiffies - * - * wait_event_timeout() for trustee to use. Handles locking and - * checks for RELEASE request. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. To be used by trustee. - * - * RETURNS: - * Positive indicating left time if @cond is satisfied, 0 if timed - * out, -1 if canceled. - */ -#define trustee_wait_event_timeout(cond, timeout) ({ \ - long __ret = (timeout); \ - while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \ - __ret) { \ - spin_unlock_irq(&gcwq->lock); \ - __wait_event_timeout(gcwq->trustee_wait, (cond) || \ - (gcwq->trustee_state == TRUSTEE_RELEASE), \ - __ret); \ - spin_lock_irq(&gcwq->lock); \ - } \ - gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \ -}) - -/** - * trustee_wait_event - event wait for trustee - * @cond: condition to wait for - * - * wait_event() for trustee to use. Automatically handles locking and - * checks for CANCEL request. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. To be used by trustee. - * - * RETURNS: - * 0 if @cond is satisfied, -1 if canceled. - */ -#define trustee_wait_event(cond) ({ \ - long __ret1; \ - __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\ - __ret1 < 0 ? -1 : 0; \ -}) - -static int __cpuinit trustee_thread(void *__gcwq) +/* claim manager positions of all pools */ +static void gcwq_claim_management_and_lock(struct global_cwq *gcwq) { - struct global_cwq *gcwq = __gcwq; - struct worker *worker; - struct work_struct *work; - struct hlist_node *pos; - long rc; - int i; - - BUG_ON(gcwq->cpu != smp_processor_id()); + struct worker_pool *pool; + for_each_worker_pool(pool, gcwq) + mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools); spin_lock_irq(&gcwq->lock); - /* - * Claim the manager position and make all workers rogue. - * Trustee must be bound to the target cpu and can't be - * cancelled. - */ - BUG_ON(gcwq->cpu != smp_processor_id()); - rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS)); - BUG_ON(rc < 0); - - gcwq->flags |= GCWQ_MANAGING_WORKERS; - - list_for_each_entry(worker, &gcwq->idle_list, entry) - worker->flags |= WORKER_ROGUE; +} - for_each_busy_worker(worker, i, pos, gcwq) - worker->flags |= WORKER_ROGUE; +/* release manager positions */ +static void gcwq_release_management_and_unlock(struct global_cwq *gcwq) +{ + struct worker_pool *pool; - /* - * Call schedule() so that we cross rq->lock and thus can - * guarantee sched callbacks see the rogue flag. This is - * necessary as scheduler callbacks may be invoked from other - * cpus. - */ spin_unlock_irq(&gcwq->lock); - schedule(); - spin_lock_irq(&gcwq->lock); + for_each_worker_pool(pool, gcwq) + mutex_unlock(&pool->manager_mutex); +} - /* - * Sched callbacks are disabled now. Zap nr_running. After - * this, nr_running stays zero and need_more_worker() and - * keep_working() are always true as long as the worklist is - * not empty. - */ - atomic_set(get_gcwq_nr_running(gcwq->cpu), 0); +static void gcwq_unbind_fn(struct work_struct *work) +{ + struct global_cwq *gcwq = get_gcwq(smp_processor_id()); + struct worker_pool *pool; + struct worker *worker; + struct hlist_node *pos; + int i; - spin_unlock_irq(&gcwq->lock); - del_timer_sync(&gcwq->idle_timer); - spin_lock_irq(&gcwq->lock); + BUG_ON(gcwq->cpu != smp_processor_id()); - /* - * We're now in charge. Notify and proceed to drain. We need - * to keep the gcwq running during the whole CPU down - * procedure as other cpu hotunplug callbacks may need to - * flush currently running tasks. - */ - gcwq->trustee_state = TRUSTEE_IN_CHARGE; - wake_up_all(&gcwq->trustee_wait); + gcwq_claim_management_and_lock(gcwq); /* - * The original cpu is in the process of dying and may go away - * anytime now. When that happens, we and all workers would - * be migrated to other cpus. Try draining any left work. We - * want to get it over with ASAP - spam rescuers, wake up as - * many idlers as necessary and create new ones till the - * worklist is empty. Note that if the gcwq is frozen, there - * may be frozen works in freezable cwqs. Don't declare - * completion while frozen. + * We've claimed all manager positions. Make all workers unbound + * and set DISASSOCIATED. Before this, all workers except for the + * ones which are still executing works from before the last CPU + * down must be on the cpu. After this, they may become diasporas. */ - while (gcwq->nr_workers != gcwq->nr_idle || - gcwq->flags & GCWQ_FREEZING || - gcwq->trustee_state == TRUSTEE_IN_CHARGE) { - int nr_works = 0; - - list_for_each_entry(work, &gcwq->worklist, entry) { - send_mayday(work); - nr_works++; - } + for_each_worker_pool(pool, gcwq) + list_for_each_entry(worker, &pool->idle_list, entry) + worker->flags |= WORKER_UNBOUND; - list_for_each_entry(worker, &gcwq->idle_list, entry) { - if (!nr_works--) - break; - wake_up_process(worker->task); - } + for_each_busy_worker(worker, i, pos, gcwq) + worker->flags |= WORKER_UNBOUND; - if (need_to_create_worker(gcwq)) { - spin_unlock_irq(&gcwq->lock); - worker = create_worker(gcwq, false); - spin_lock_irq(&gcwq->lock); - if (worker) { - worker->flags |= WORKER_ROGUE; - start_worker(worker); - } - } + gcwq->flags |= GCWQ_DISASSOCIATED; - /* give a breather */ - if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0) - break; - } + gcwq_release_management_and_unlock(gcwq); /* - * Either all works have been scheduled and cpu is down, or - * cpu down has already been canceled. Wait for and butcher - * all workers till we're canceled. + * Call schedule() so that we cross rq->lock and thus can guarantee + * sched callbacks see the %WORKER_UNBOUND flag. This is necessary + * as scheduler callbacks may be invoked from other cpus. */ - do { - rc = trustee_wait_event(!list_empty(&gcwq->idle_list)); - while (!list_empty(&gcwq->idle_list)) - destroy_worker(list_first_entry(&gcwq->idle_list, - struct worker, entry)); - } while (gcwq->nr_workers && rc >= 0); + schedule(); /* - * At this point, either draining has completed and no worker - * is left, or cpu down has been canceled or the cpu is being - * brought back up. There shouldn't be any idle one left. - * Tell the remaining busy ones to rebind once it finishes the - * currently scheduled works by scheduling the rebind_work. + * Sched callbacks are disabled now. Zap nr_running. After this, + * nr_running stays zero and need_more_worker() and keep_working() + * are always true as long as the worklist is not empty. @gcwq now + * behaves as unbound (in terms of concurrency management) gcwq + * which is served by workers tied to the CPU. + * + * On return from this function, the current worker would trigger + * unbound chain execution of pending work items if other workers + * didn't already. */ - WARN_ON(!list_empty(&gcwq->idle_list)); - - for_each_busy_worker(worker, i, pos, gcwq) { - struct work_struct *rebind_work = &worker->rebind_work; - - /* - * Rebind_work may race with future cpu hotplug - * operations. Use a separate flag to mark that - * rebinding is scheduled. - */ - worker->flags |= WORKER_REBIND; - worker->flags &= ~WORKER_ROGUE; - - /* queue rebind_work, wq doesn't matter, use the default one */ - if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, - work_data_bits(rebind_work))) - continue; - - debug_work_activate(rebind_work); - insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, - worker->scheduled.next, - work_color_to_flags(WORK_NO_COLOR)); - } - - /* relinquish manager role */ - gcwq->flags &= ~GCWQ_MANAGING_WORKERS; - - /* notify completion */ - gcwq->trustee = NULL; - gcwq->trustee_state = TRUSTEE_DONE; - wake_up_all(&gcwq->trustee_wait); - spin_unlock_irq(&gcwq->lock); - return 0; + for_each_worker_pool(pool, gcwq) + atomic_set(get_pool_nr_running(pool), 0); } -/** - * wait_trustee_state - wait for trustee to enter the specified state - * @gcwq: gcwq the trustee of interest belongs to - * @state: target state to wait for - * - * Wait for the trustee to reach @state. DONE is already matched. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. To be used by cpu_callback. +/* + * Workqueues should be brought up before normal priority CPU notifiers. + * This will be registered high priority CPU notifier. */ -static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) -__releases(&gcwq->lock) -__acquires(&gcwq->lock) -{ - if (!(gcwq->trustee_state == state || - gcwq->trustee_state == TRUSTEE_DONE)) { - spin_unlock_irq(&gcwq->lock); - __wait_event(gcwq->trustee_wait, - gcwq->trustee_state == state || - gcwq->trustee_state == TRUSTEE_DONE); - spin_lock_irq(&gcwq->lock); - } -} - -static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) { unsigned int cpu = (unsigned long)hcpu; struct global_cwq *gcwq = get_gcwq(cpu); - struct task_struct *new_trustee = NULL; - struct worker *uninitialized_var(new_worker); - unsigned long flags; - - action &= ~CPU_TASKS_FROZEN; + struct worker_pool *pool; - switch (action) { - case CPU_DOWN_PREPARE: - new_trustee = kthread_create(trustee_thread, gcwq, - "workqueue_trustee/%d\n", cpu); - if (IS_ERR(new_trustee)) - return notifier_from_errno(PTR_ERR(new_trustee)); - kthread_bind(new_trustee, cpu); - /* fall through */ + switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - BUG_ON(gcwq->first_idle); - new_worker = create_worker(gcwq, false); - if (!new_worker) { - if (new_trustee) - kthread_stop(new_trustee); - return NOTIFY_BAD; - } - } - - /* some are called w/ irq disabled, don't disturb irq status */ - spin_lock_irqsave(&gcwq->lock, flags); + for_each_worker_pool(pool, gcwq) { + struct worker *worker; - switch (action) { - case CPU_DOWN_PREPARE: - /* initialize trustee and tell it to acquire the gcwq */ - BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE); - gcwq->trustee = new_trustee; - gcwq->trustee_state = TRUSTEE_START; - wake_up_process(gcwq->trustee); - wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); - /* fall through */ - case CPU_UP_PREPARE: - BUG_ON(gcwq->first_idle); - gcwq->first_idle = new_worker; - break; + if (pool->nr_workers) + continue; - case CPU_DYING: - /* - * Before this, the trustee and all workers except for - * the ones which are still executing works from - * before the last CPU down must be on the cpu. After - * this, they'll all be diasporas. - */ - gcwq->flags |= GCWQ_DISASSOCIATED; - break; + worker = create_worker(pool); + if (!worker) + return NOTIFY_BAD; - case CPU_POST_DEAD: - gcwq->trustee_state = TRUSTEE_BUTCHER; - /* fall through */ - case CPU_UP_CANCELED: - destroy_worker(gcwq->first_idle); - gcwq->first_idle = NULL; + spin_lock_irq(&gcwq->lock); + start_worker(worker); + spin_unlock_irq(&gcwq->lock); + } break; case CPU_DOWN_FAILED: case CPU_ONLINE: + gcwq_claim_management_and_lock(gcwq); gcwq->flags &= ~GCWQ_DISASSOCIATED; - if (gcwq->trustee_state != TRUSTEE_DONE) { - gcwq->trustee_state = TRUSTEE_RELEASE; - wake_up_process(gcwq->trustee); - wait_trustee_state(gcwq, TRUSTEE_DONE); - } - - /* - * Trustee is done and there might be no worker left. - * Put the first_idle in and request a real manager to - * take a look. - */ - spin_unlock_irq(&gcwq->lock); - kthread_bind(gcwq->first_idle->task, cpu); - spin_lock_irq(&gcwq->lock); - gcwq->flags |= GCWQ_MANAGE_WORKERS; - start_worker(gcwq->first_idle); - gcwq->first_idle = NULL; + rebind_workers(gcwq); + gcwq_release_management_and_unlock(gcwq); break; } + return NOTIFY_OK; +} - spin_unlock_irqrestore(&gcwq->lock, flags); +/* + * Workqueues should be brought down after normal priority CPU notifiers. + * This will be registered as low priority CPU notifier. + */ +static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct work_struct unbind_work; - return notifier_from_errno(0); + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + /* unbinding should happen on the local CPU */ + INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); + schedule_work_on(cpu, &unbind_work); + flush_work(&unbind_work); + break; + } + return NOTIFY_OK; } #ifdef CONFIG_SMP @@ -3746,6 +3653,7 @@ void thaw_workqueues(void) for_each_gcwq_cpu(cpu) { struct global_cwq *gcwq = get_gcwq(cpu); + struct worker_pool *pool; struct workqueue_struct *wq; spin_lock_irq(&gcwq->lock); @@ -3767,7 +3675,8 @@ void thaw_workqueues(void) cwq_activate_first_delayed(cwq); } - wake_up_worker(gcwq); + for_each_worker_pool(pool, gcwq) + wake_up_worker(pool); spin_unlock_irq(&gcwq->lock); } @@ -3783,46 +3692,57 @@ static int __init init_workqueues(void) unsigned int cpu; int i; - cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); + cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); + cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); /* initialize gcwqs */ for_each_gcwq_cpu(cpu) { struct global_cwq *gcwq = get_gcwq(cpu); + struct worker_pool *pool; spin_lock_init(&gcwq->lock); - INIT_LIST_HEAD(&gcwq->worklist); gcwq->cpu = cpu; gcwq->flags |= GCWQ_DISASSOCIATED; - INIT_LIST_HEAD(&gcwq->idle_list); for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) INIT_HLIST_HEAD(&gcwq->busy_hash[i]); - init_timer_deferrable(&gcwq->idle_timer); - gcwq->idle_timer.function = idle_worker_timeout; - gcwq->idle_timer.data = (unsigned long)gcwq; + for_each_worker_pool(pool, gcwq) { + pool->gcwq = gcwq; + INIT_LIST_HEAD(&pool->worklist); + INIT_LIST_HEAD(&pool->idle_list); + + init_timer_deferrable(&pool->idle_timer); + pool->idle_timer.function = idle_worker_timeout; + pool->idle_timer.data = (unsigned long)pool; - setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout, - (unsigned long)gcwq); + setup_timer(&pool->mayday_timer, gcwq_mayday_timeout, + (unsigned long)pool); - ida_init(&gcwq->worker_ida); + mutex_init(&pool->manager_mutex); + ida_init(&pool->worker_ida); + } - gcwq->trustee_state = TRUSTEE_DONE; - init_waitqueue_head(&gcwq->trustee_wait); + init_waitqueue_head(&gcwq->rebind_hold); } /* create the initial worker */ for_each_online_gcwq_cpu(cpu) { struct global_cwq *gcwq = get_gcwq(cpu); - struct worker *worker; + struct worker_pool *pool; if (cpu != WORK_CPU_UNBOUND) gcwq->flags &= ~GCWQ_DISASSOCIATED; - worker = create_worker(gcwq, true); - BUG_ON(!worker); - spin_lock_irq(&gcwq->lock); - start_worker(worker); - spin_unlock_irq(&gcwq->lock); + + for_each_worker_pool(pool, gcwq) { + struct worker *worker; + + worker = create_worker(pool); + BUG_ON(!worker); + spin_lock_irq(&gcwq->lock); + start_worker(worker); + spin_unlock_irq(&gcwq->lock); + } } system_wq = alloc_workqueue("events", 0, 0); |