diff options
Diffstat (limited to 'kernel')
44 files changed, 2448 insertions, 1378 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index c0cc67ad764..e5602d32acb 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o cred.o \ - async.o range.o groups.o lglock.o + async.o range.o groups.o lglock.o smpboot.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files @@ -46,7 +46,6 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += smp.o -obj-$(CONFIG_SMP) += smpboot.o ifneq ($(CONFIG_SMP),y) obj-y += up.o endif diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 3a5ca582ba1..ed206fd88cc 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -250,7 +250,6 @@ static void untag_chunk(struct node *p) spin_unlock(&hash_lock); spin_unlock(&entry->lock); fsnotify_destroy_mark(entry); - fsnotify_put_mark(entry); goto out; } @@ -259,7 +258,7 @@ static void untag_chunk(struct node *p) fsnotify_duplicate_mark(&new->mark, entry); if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { - free_chunk(new); + fsnotify_put_mark(&new->mark); goto Fallback; } @@ -293,7 +292,7 @@ static void untag_chunk(struct node *p) spin_unlock(&hash_lock); spin_unlock(&entry->lock); fsnotify_destroy_mark(entry); - fsnotify_put_mark(entry); + fsnotify_put_mark(&new->mark); /* drop initial reference */ goto out; Fallback: @@ -322,7 +321,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) entry = &chunk->mark; if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) { - free_chunk(chunk); + fsnotify_put_mark(entry); return -ENOSPC; } @@ -347,6 +346,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) insert_hash(chunk); spin_unlock(&hash_lock); spin_unlock(&entry->lock); + fsnotify_put_mark(entry); /* drop initial reference */ return 0; } @@ -396,7 +396,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) fsnotify_duplicate_mark(chunk_entry, old_entry); if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { spin_unlock(&old_entry->lock); - free_chunk(chunk); + fsnotify_put_mark(chunk_entry); fsnotify_put_mark(old_entry); return -ENOSPC; } @@ -444,8 +444,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&chunk_entry->lock); spin_unlock(&old_entry->lock); fsnotify_destroy_mark(old_entry); + fsnotify_put_mark(chunk_entry); /* drop initial reference */ fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ - fsnotify_put_mark(old_entry); /* and kill it */ return 0; } @@ -916,7 +916,12 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); evict_chunk(chunk); - fsnotify_put_mark(entry); + + /* + * We are guaranteed to have at least one reference to the mark from + * either the inode or the caller of fsnotify_destroy_mark(). + */ + BUG_ON(atomic_read(&entry->refcnt) < 1); } static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, diff --git a/kernel/cpu.c b/kernel/cpu.c index 14d32588ccc..e615dfbcf79 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -280,12 +280,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) __func__, cpu); goto out_release; } + smpboot_park_threads(cpu); err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ + smpboot_unpark_threads(cpu); cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); - goto out_release; } BUG_ON(cpu_online(cpu)); @@ -354,6 +355,10 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) goto out; } + ret = smpboot_create_threads(cpu); + if (ret) + goto out; + ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); if (ret) { nr_calls--; @@ -368,6 +373,9 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) goto out_notify; BUG_ON(!cpu_online(cpu)); + /* Wake the per cpu threads */ + smpboot_unpark_threads(cpu); + /* Now call notifier in preparation. */ cpu_notify(CPU_ONLINE | mod, hcpu); diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index 8b68ce78ff1..be7b33b73d3 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -12,6 +12,7 @@ #include <linux/kdb.h> #include <linux/kdebug.h> #include <linux/export.h> +#include <linux/hardirq.h> #include "kdb_private.h" #include "../debug_core.h" @@ -52,6 +53,9 @@ int kdb_stub(struct kgdb_state *ks) if (atomic_read(&kgdb_setting_breakpoint)) reason = KDB_REASON_KEYBOARD; + if (in_nmi()) + reason = KDB_REASON_NMI; + for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { if ((bp->bp_enabled) && (bp->bp_addr == addr)) { reason = KDB_REASON_BREAK; diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index bb9520f0f6f..0a69d2adc4f 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -715,9 +715,6 @@ kdb_printit: /* check for having reached the LINES number of printed lines */ if (kdb_nextline == linecount) { char buf1[16] = ""; -#if defined(CONFIG_SMP) - char buf2[32]; -#endif /* Watch out for recursion here. Any routine that calls * kdb_printf will come back through here. And kdb_read @@ -732,14 +729,6 @@ kdb_printit: if (moreprompt == NULL) moreprompt = "more> "; -#if defined(CONFIG_SMP) - if (strchr(moreprompt, '%')) { - sprintf(buf2, moreprompt, get_cpu()); - put_cpu(); - moreprompt = buf2; - } -#endif - kdb_input_flush(); c = console_drivers; diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 1f91413edb8..31df1706b9a 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -139,11 +139,10 @@ static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t); static char *__env[] = { #if defined(CONFIG_SMP) "PROMPT=[%d]kdb> ", - "MOREPROMPT=[%d]more> ", #else "PROMPT=kdb> ", - "MOREPROMPT=more> ", #endif + "MOREPROMPT=more> ", "RADIX=16", "MDCOUNT=8", /* lines of md output */ KDB_PLATFORM_ENV, @@ -1236,18 +1235,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, *cmdbuf = '\0'; *(cmd_hist[cmd_head]) = '\0'; - if (KDB_FLAG(ONLY_DO_DUMP)) { - /* kdb is off but a catastrophic error requires a dump. - * Take the dump and reboot. - * Turn on logging so the kdb output appears in the log - * buffer in the dump. - */ - const char *setargs[] = { "set", "LOGGING", "1" }; - kdb_set(2, setargs); - kdb_reboot(0, NULL); - /*NOTREACHED*/ - } - do_full_getstr: #if defined(CONFIG_SMP) snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"), diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 6581a040f39..98d4597f43d 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -153,7 +153,8 @@ put_callchain_entry(int rctx) put_recursion_context(__get_cpu_var(callchain_recursion), rctx); } -struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +struct perf_callchain_entry * +perf_callchain(struct perf_event *event, struct pt_regs *regs) { int rctx; struct perf_callchain_entry *entry; @@ -178,6 +179,12 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) } if (regs) { + /* + * Disallow cross-task user callchains. + */ + if (event->ctx->task && event->ctx->task != current) + goto exit_put; + perf_callchain_store(entry, PERF_CONTEXT_USER); perf_callchain_user(entry, regs); } diff --git a/kernel/events/core.c b/kernel/events/core.c index f1cf0edeb39..7fee567153f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1253,7 +1253,7 @@ retry: /* * Cross CPU call to disable a performance event */ -static int __perf_event_disable(void *info) +int __perf_event_disable(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; @@ -2935,12 +2935,12 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); /* * Called when the last reference to the file is gone. */ -static int perf_release(struct inode *inode, struct file *file) +static void put_event(struct perf_event *event) { - struct perf_event *event = file->private_data; struct task_struct *owner; - file->private_data = NULL; + if (!atomic_long_dec_and_test(&event->refcount)) + return; rcu_read_lock(); owner = ACCESS_ONCE(event->owner); @@ -2975,7 +2975,13 @@ static int perf_release(struct inode *inode, struct file *file) put_task_struct(owner); } - return perf_event_release_kernel(event); + perf_event_release_kernel(event); +} + +static int perf_release(struct inode *inode, struct file *file) +{ + put_event(file->private_data); + return 0; } u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) @@ -3227,7 +3233,7 @@ unlock: static const struct file_operations perf_fops; -static struct perf_event *perf_fget_light(int fd, int *fput_needed) +static struct file *perf_fget_light(int fd, int *fput_needed) { struct file *file; @@ -3241,7 +3247,7 @@ static struct perf_event *perf_fget_light(int fd, int *fput_needed) return ERR_PTR(-EBADF); } - return file->private_data; + return file; } static int perf_event_set_output(struct perf_event *event, @@ -3273,19 +3279,21 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_EVENT_IOC_SET_OUTPUT: { + struct file *output_file = NULL; struct perf_event *output_event = NULL; int fput_needed = 0; int ret; if (arg != -1) { - output_event = perf_fget_light(arg, &fput_needed); - if (IS_ERR(output_event)) - return PTR_ERR(output_event); + output_file = perf_fget_light(arg, &fput_needed); + if (IS_ERR(output_file)) + return PTR_ERR(output_file); + output_event = output_file->private_data; } ret = perf_event_set_output(event, output_event); if (output_event) - fput_light(output_event->filp, fput_needed); + fput_light(output_file, fput_needed); return ret; } @@ -4039,7 +4047,7 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; - data->callchain = perf_callchain(regs); + data->callchain = perf_callchain(event, regs); if (data->callchain) size += data->callchain->nr; @@ -5209,7 +5217,8 @@ static int perf_tp_event_match(struct perf_event *event, } void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, - struct pt_regs *regs, struct hlist_head *head, int rctx) + struct pt_regs *regs, struct hlist_head *head, int rctx, + struct task_struct *task) { struct perf_sample_data data; struct perf_event *event; @@ -5228,6 +5237,31 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, perf_swevent_event(event, count, &data, regs); } + /* + * If we got specified a target task, also iterate its context and + * deliver this event there too. + */ + if (task && task != current) { + struct perf_event_context *ctx; + struct trace_entry *entry = record; + + rcu_read_lock(); + ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]); + if (!ctx) + goto unlock; + + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (event->attr.type != PERF_TYPE_TRACEPOINT) + continue; + if (event->attr.config != entry->type) + continue; + if (perf_tp_event_match(event, &data, regs)) + perf_swevent_event(event, count, &data, regs); + } +unlock: + rcu_read_unlock(); + } + perf_swevent_put_recursion_context(rctx); } EXPORT_SYMBOL_GPL(perf_tp_event); @@ -5924,6 +5958,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, mutex_init(&event->mmap_mutex); + atomic_long_set(&event->refcount, 1); event->cpu = cpu; event->attr = *attr; event->group_leader = group_leader; @@ -6234,12 +6269,12 @@ SYSCALL_DEFINE5(perf_event_open, return event_fd; if (group_fd != -1) { - group_leader = perf_fget_light(group_fd, &fput_needed); - if (IS_ERR(group_leader)) { - err = PTR_ERR(group_leader); + group_file = perf_fget_light(group_fd, &fput_needed); + if (IS_ERR(group_file)) { + err = PTR_ERR(group_file); goto err_fd; } - group_file = group_leader->filp; + group_leader = group_file->private_data; if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; if (flags & PERF_FLAG_FD_NO_GROUP) @@ -6376,7 +6411,6 @@ SYSCALL_DEFINE5(perf_event_open, put_ctx(gctx); } - event->filp = event_file; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); @@ -6470,7 +6504,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err_free; } - event->filp = NULL; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); perf_install_in_context(ctx, event, cpu); @@ -6552,7 +6585,7 @@ static void sync_child_event(struct perf_event *child_event, * Release the parent event, if this was the last * reference to it. */ - fput(parent_event->filp); + put_event(parent_event); } static void @@ -6628,9 +6661,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * * __perf_event_exit_task() * sync_child_event() - * fput(parent_event->filp) - * perf_release() - * mutex_lock(&ctx->mutex) + * put_event() + * mutex_lock(&ctx->mutex) * * But since its the parent context it won't be the same instance. */ @@ -6698,7 +6730,7 @@ static void perf_free_event(struct perf_event *event, list_del_init(&event->child_list); mutex_unlock(&parent->child_mutex); - fput(parent->filp); + put_event(parent); perf_group_detach(event); list_del_event(event, ctx); @@ -6778,6 +6810,12 @@ inherit_event(struct perf_event *parent_event, NULL, NULL); if (IS_ERR(child_event)) return child_event; + + if (!atomic_long_inc_not_zero(&parent_event->refcount)) { + free_event(child_event); + return NULL; + } + get_ctx(child_ctx); /* @@ -6819,14 +6857,6 @@ inherit_event(struct perf_event *parent_event, raw_spin_unlock_irqrestore(&child_ctx->lock, flags); /* - * Get a reference to the parent filp - we will fput it - * when the child event exits. This is safe to do because - * we are in the parent and we know that the filp still - * exists and has a nonzero count: - */ - atomic_long_inc(&parent_event->filp->f_count); - - /* * Link this into the parent event's child list */ WARN_ON_ONCE(parent_event->ctx->parent_ctx); diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index bb38c4d3ee1..9a7b487c6fe 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -453,7 +453,16 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att int old_type = bp->attr.bp_type; int err = 0; - perf_event_disable(bp); + /* + * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it + * will not be possible to raise IPIs that invoke __perf_event_disable. + * So call the function directly after making sure we are targeting the + * current task. + */ + if (irqs_disabled() && bp->ctx && bp->ctx->task == current) + __perf_event_disable(bp); + else + perf_event_disable(bp); bp->attr.bp_addr = attr->bp_addr; bp->attr.bp_type = attr->bp_type; diff --git a/kernel/events/internal.h b/kernel/events/internal.h index b0b107f90af..a096c19f2c2 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -101,7 +101,8 @@ __output_copy(struct perf_output_handle *handle, } /* Callchain handling */ -extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); +extern struct perf_callchain_entry * +perf_callchain(struct perf_event *event, struct pt_regs *regs); extern int get_callchain_buffers(void); extern void put_callchain_buffers(void); diff --git a/kernel/fork.c b/kernel/fork.c index 3bd2280d79f..2c8857e1285 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -455,8 +455,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (retval) goto out; - if (file && uprobe_mmap(tmp)) - goto out; + if (file) + uprobe_mmap(tmp); } /* a new mm has just been created */ arch_dup_mmap(oldmm, mm); diff --git a/kernel/futex.c b/kernel/futex.c index e2b0fb9a0b3..3717e7b306e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2231,11 +2231,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * @uaddr2: the pi futex we will take prior to returning to user-space * * The caller will wait on uaddr and will be requeued by futex_requeue() to - * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and - * complete the acquisition of the rt_mutex prior to returning to userspace. - * This ensures the rt_mutex maintains an owner when it has waiters; without - * one, the pi logic wouldn't know which task to boost/deboost, if there was a - * need to. + * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake + * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to + * userspace. This ensures the rt_mutex maintains an owner when it has waiters; + * without one, the pi logic would not know which task to boost/deboost, if + * there was a need to. * * We call schedule in futex_wait_queue_me() when we enqueue and return there * via the following: @@ -2272,6 +2272,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, struct futex_q q = futex_q_init; int res, ret; + if (uaddr == uaddr2) + return -EINVAL; + if (!bitset) return -EINVAL; @@ -2343,7 +2346,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * signal. futex_unlock_pi() will not destroy the lock_ptr nor * the pi_state. */ - WARN_ON(!&q.pi_state); + WARN_ON(!q.pi_state); pi_mutex = &q.pi_state->pi_mutex; ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); debug_rt_mutex_free_waiter(&rt_waiter); @@ -2370,7 +2373,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * fault, unlock the rt_mutex and return the fault to userspace. */ if (ret == -EFAULT) { - if (rt_mutex_owner(pi_mutex) == current) + if (pi_mutex && rt_mutex_owner(pi_mutex) == current) rt_mutex_unlock(pi_mutex); } else if (ret == -EINTR) { /* diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0a8e8f05962..4c69326aa77 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -944,6 +944,18 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } /* + * Drivers are often written to work w/o knowledge about the + * underlying irq chip implementation, so a request for a + * threaded irq without a primary hard irq context handler + * requires the ONESHOT flag to be set. Some irq chips like + * MSI based interrupts are per se one shot safe. Check the + * chip flags, so we can avoid the unmask dance at the end of + * the threaded handler for those. + */ + if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE) + new->flags &= ~IRQF_ONESHOT; + + /* * The following block of code has to be executed atomically */ raw_spin_lock_irqsave(&desc->lock, flags); @@ -1017,7 +1029,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ new->thread_mask = 1 << ffz(thread_mask); - } else if (new->handler == irq_default_primary_handler) { + } else if (new->handler == irq_default_primary_handler && + !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) { /* * The interrupt was requested with handler = NULL, so * we use the default primary handler for it. But it diff --git a/kernel/kthread.c b/kernel/kthread.c index b579af57ea1..146a6fa9682 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -37,11 +37,20 @@ struct kthread_create_info }; struct kthread { - int should_stop; + unsigned long flags; + unsigned int cpu; void *data; + struct completion parked; struct completion exited; }; +enum KTHREAD_BITS { + KTHREAD_IS_PER_CPU = 0, + KTHREAD_SHOULD_STOP, + KTHREAD_SHOULD_PARK, + KTHREAD_IS_PARKED, +}; + #define to_kthread(tsk) \ container_of((tsk)->vfork_done, struct kthread, exited) @@ -52,13 +61,29 @@ struct kthread { * and this will return true. You should then return, and your return * value will be passed through to kthread_stop(). */ -int kthread_should_stop(void) +bool kthread_should_stop(void) { - return to_kthread(current)->should_stop; + return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags); } EXPORT_SYMBOL(kthread_should_stop); /** + * kthread_should_park - should this kthread park now? + * + * When someone calls kthread_park() on your kthread, it will be woken + * and this will return true. You should then do the necessary + * cleanup and call kthread_parkme() + * + * Similar to kthread_should_stop(), but this keeps the thread alive + * and in a park position. kthread_unpark() "restarts" the thread and + * calls the thread function again. + */ +bool kthread_should_park(void) +{ + return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags); +} + +/** * kthread_freezable_should_stop - should this freezable kthread return now? * @was_frozen: optional out parameter, indicates whether %current was frozen * @@ -96,6 +121,24 @@ void *kthread_data(struct task_struct *task) return to_kthread(task)->data; } +static void __kthread_parkme(struct kthread *self) +{ + __set_current_state(TASK_INTERRUPTIBLE); + while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) { + if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags)) + complete(&self->parked); + schedule(); + __set_current_state(TASK_INTERRUPTIBLE); + } + clear_bit(KTHREAD_IS_PARKED, &self->flags); + __set_current_state(TASK_RUNNING); +} + +void kthread_parkme(void) +{ + __kthread_parkme(to_kthread(current)); +} + static int kthread(void *_create) { /* Copy data: it's on kthread's stack */ @@ -105,9 +148,10 @@ static int kthread(void *_create) struct kthread self; int ret; - self.should_stop = 0; + self.flags = 0; self.data = data; init_completion(&self.exited); + init_completion(&self.parked); current->vfork_done = &self.exited; /* OK, tell user we're spawned, wait for stop or wakeup */ @@ -117,9 +161,11 @@ static int kthread(void *_create) schedule(); ret = -EINTR; - if (!self.should_stop) - ret = threadfn(data); + if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) { + __kthread_parkme(&self); + ret = threadfn(data); + } /* we can't just return, we must preserve "self" on stack */ do_exit(ret); } @@ -172,8 +218,7 @@ static void create_kthread(struct kthread_create_info *create) * Returns a task_struct or ERR_PTR(-ENOMEM). */ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), - void *data, - int node, + void *data, int node, const char namefmt[], ...) { @@ -210,6 +255,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), } EXPORT_SYMBOL(kthread_create_on_node); +static void __kthread_bind(struct task_struct *p, unsigned int cpu) +{ + /* It's safe because the task is inactive. */ + do_set_cpus_allowed(p, cpumask_of(cpu)); + p->flags |= PF_THREAD_BOUND; +} + /** * kthread_bind - bind a just-created kthread to a cpu. * @p: thread created by kthread_create(). @@ -226,14 +278,112 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) WARN_ON(1); return; } - - /* It's safe because the task is inactive. */ - do_set_cpus_allowed(p, cpumask_of(cpu)); - p->flags |= PF_THREAD_BOUND; + __kthread_bind(p, cpu); } EXPORT_SYMBOL(kthread_bind); /** + * kthread_create_on_cpu - Create a cpu bound kthread + * @threadfn: the function to run until signal_pending(current). + * @data: data ptr for @threadfn. + * @cpu: The cpu on which the thread should be bound, + * @namefmt: printf-style name for the thread. Format is restricted + * to "name.*%u". Code fills in cpu number. + * + * Description: This helper function creates and names a kernel thread + * The thread will be woken and put into park mode. + */ +struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), + void *data, unsigned int cpu, + const char *namefmt) +{ + struct task_struct *p; + + p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, + cpu); + if (IS_ERR(p)) + return p; + set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); + to_kthread(p)->cpu = cpu; + /* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */ + kthread_park(p); + return p; +} + +static struct kthread *task_get_live_kthread(struct task_struct *k) +{ + struct kthread *kthread; + + get_task_struct(k); + kthread = to_kthread(k); + /* It might have exited */ + barrier(); + if (k->vfork_done != NULL) + return kthread; + return NULL; +} + +/** + * kthread_unpark - unpark a thread created by kthread_create(). + * @k: thread created by kthread_create(). + * + * Sets kthread_should_park() for @k to return false, wakes it, and + * waits for it to return. If the thread is marked percpu then its + * bound to the cpu again. + */ +void kthread_unpark(struct task_struct *k) +{ + struct kthread *kthread = task_get_live_kthread(k); + + if (kthread) { + clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + /* + * We clear the IS_PARKED bit here as we don't wait + * until the task has left the park code. So if we'd + * park before that happens we'd see the IS_PARKED bit + * which might be about to be cleared. + */ + if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { + if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) + __kthread_bind(k, kthread->cpu); + wake_up_process(k); + } + } + put_task_struct(k); +} + +/** + * kthread_park - park a thread created by kthread_create(). + * @k: thread created by kthread_create(). + * + * Sets kthread_should_park() for @k to return true, wakes it, and + * waits for it to return. This can also be called after kthread_create() + * instead of calling wake_up_process(): the thread will park without + * calling threadfn(). + * + * Returns 0 if the thread is parked, -ENOSYS if the thread exited. + * If called by the kthread itself just the park bit is set. + */ +int kthread_park(struct task_struct *k) +{ + struct kthread *kthread = task_get_live_kthread(k); + int ret = -ENOSYS; + + if (kthread) { + if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) { + set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + if (k != current) { + wake_up_process(k); + wait_for_completion(&kthread->parked); + } + } + ret = 0; + } + put_task_struct(k); + return ret; +} + +/** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). * @@ -250,16 +400,13 @@ EXPORT_SYMBOL(kthread_bind); */ int kthread_stop(struct task_struct *k) { - struct kthread *kthread; + struct kthread *kthread = task_get_live_kthread(k); int ret; trace_sched_kthread_stop(k); - get_task_struct(k); - - kthread = to_kthread(k); - barrier(); /* it might have exited */ - if (k->vfork_done != NULL) { - kthread->should_stop = 1; + if (kthread) { + set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); + clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); wake_up_process(k); wait_for_completion(&kthread->exited); } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index b3c7fd55425..6144bab8fd8 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -232,15 +232,19 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, */ tmp.data = ¤t->nsproxy->pid_ns->last_pid; - return proc_dointvec(&tmp, write, buffer, lenp, ppos); + return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); } +extern int pid_max; +static int zero = 0; static struct ctl_table pid_ns_ctl_table[] = { { .procname = "ns_last_pid", .maxlen = sizeof(int), .mode = 0666, /* permissions are checked in the handler */ .proc_handler = pid_ns_ctl_handler, + .extra1 = &zero, + .extra2 = &pid_max, }, { } }; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 1da39ea248f..c8b7446b27d 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -178,9 +178,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) arch_suspend_enable_irqs(); BUG_ON(irqs_disabled()); - /* Kick the lockup detector */ - lockup_detector_bootcpu_resume(); - Enable_cpus: enable_nonboot_cpus(); diff --git a/kernel/printk.c b/kernel/printk.c index 6a76ab9d447..66a2ea37b57 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1034,6 +1034,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) struct log *msg = log_from_idx(idx); len += msg_print_text(msg, prev, true, NULL, 0); + prev = msg->flags; idx = log_next(idx); seq++; } @@ -1046,6 +1047,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) struct log *msg = log_from_idx(idx); len -= msg_print_text(msg, prev, true, NULL, 0); + prev = msg->flags; idx = log_next(idx); seq++; } diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f280e542e3e..f494157be2f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -133,13 +133,12 @@ static int rcu_scheduler_fully_active __read_mostly; */ static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu); DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); DEFINE_PER_CPU(char, rcu_cpu_has_work); #endif /* #ifdef CONFIG_RCU_BOOST */ -static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); @@ -1390,17 +1389,6 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) int i; struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); - /* - * If there is an rcu_barrier() operation in progress, then - * only the task doing that operation is permitted to adopt - * callbacks. To do otherwise breaks rcu_barrier() and friends - * by causing them to fail to wait for the callbacks in the - * orphanage. - */ - if (rsp->rcu_barrier_in_progress && - rsp->rcu_barrier_in_progress != current) - return; - /* Do the accounting first. */ rdp->qlen_lazy += rsp->qlen_lazy; rdp->qlen += rsp->qlen; @@ -1455,9 +1443,8 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) * The CPU has been completely removed, and some other CPU is reporting * this fact from process context. Do the remainder of the cleanup, * including orphaning the outgoing CPU's RCU callbacks, and also - * adopting them, if there is no _rcu_barrier() instance running. - * There can only be one CPU hotplug operation at a time, so no other - * CPU can be attempting to update rcu_cpu_kthread_task. + * adopting them. There can only be one CPU hotplug operation at a time, + * so no other CPU can be attempting to update rcu_cpu_kthread_task. */ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { @@ -1468,8 +1455,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ /* Adjust any no-longer-needed kthreads. */ - rcu_stop_cpu_kthread(cpu); - rcu_node_kthread_setaffinity(rnp, -1); + rcu_boost_kthread_setaffinity(rnp, -1); /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ @@ -1515,14 +1501,13 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", cpu, rdp->qlen, rdp->nxtlist); + init_callback_list(rdp); + /* Disallow further callbacks on this CPU. */ + rdp->nxttail[RCU_NEXT_TAIL] = NULL; } #else /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) -{ -} - static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { } @@ -1941,6 +1926,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rdp = this_cpu_ptr(rsp->rda); /* Add the callback to our list. */ + if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL)) { + /* _call_rcu() is illegal on offline CPU; leak the callback. */ + WARN_ON_ONCE(1); + local_irq_restore(flags); + return; + } ACCESS_ONCE(rdp->qlen)++; if (lazy) rdp->qlen_lazy++; @@ -2326,13 +2317,10 @@ static void rcu_barrier_func(void *type) static void _rcu_barrier(struct rcu_state *rsp) { int cpu; - unsigned long flags; struct rcu_data *rdp; - struct rcu_data rd; unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done); unsigned long snap_done; - init_rcu_head_on_stack(&rd.barrier_head); _rcu_barrier_trace(rsp, "Begin", -1, snap); /* Take mutex to serialize concurrent rcu_barrier() requests. */ @@ -2372,70 +2360,30 @@ static void _rcu_barrier(struct rcu_state *rsp) /* * Initialize the count to one rather than to zero in order to * avoid a too-soon return to zero in case of a short grace period - * (or preemption of this task). Also flag this task as doing - * an rcu_barrier(). This will prevent anyone else from adopting - * orphaned callbacks, which could cause otherwise failure if a - * CPU went offline and quickly came back online. To see this, - * consider the following sequence of events: - * - * 1. We cause CPU 0 to post an rcu_barrier_callback() callback. - * 2. CPU 1 goes offline, orphaning its callbacks. - * 3. CPU 0 adopts CPU 1's orphaned callbacks. - * 4. CPU 1 comes back online. - * 5. We cause CPU 1 to post an rcu_barrier_callback() callback. - * 6. Both rcu_barrier_callback() callbacks are invoked, awakening - * us -- but before CPU 1's orphaned callbacks are invoked!!! + * (or preemption of this task). Exclude CPU-hotplug operations + * to ensure that no offline CPU has callbacks queued. */ init_completion(&rsp->barrier_completion); atomic_set(&rsp->barrier_cpu_count, 1); - raw_spin_lock_irqsave(&rsp->onofflock, flags); - rsp->rcu_barrier_in_progress = current; - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); + get_online_cpus(); /* - * Force every CPU with callbacks to register a new callback - * that will tell us when all the preceding callbacks have - * been invoked. If an offline CPU has callbacks, wait for - * it to either come back online or to finish orphaning those - * callbacks. + * Force each CPU with callbacks to register a new callback. + * When that callback is invoked, we will know that all of the + * corresponding CPU's preceding callbacks have been invoked. */ - for_each_possible_cpu(cpu) { - preempt_disable(); + for_each_online_cpu(cpu) { rdp = per_cpu_ptr(rsp->rda, cpu); - if (cpu_is_offline(cpu)) { - _rcu_barrier_trace(rsp, "Offline", cpu, - rsp->n_barrier_done); - preempt_enable(); - while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) - schedule_timeout_interruptible(1); - } else if (ACCESS_ONCE(rdp->qlen)) { + if (ACCESS_ONCE(rdp->qlen)) { _rcu_barrier_trace(rsp, "OnlineQ", cpu, rsp->n_barrier_done); smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); - preempt_enable(); } else { _rcu_barrier_trace(rsp, "OnlineNQ", cpu, rsp->n_barrier_done); - preempt_enable(); } } - - /* - * Now that all online CPUs have rcu_barrier_callback() callbacks - * posted, we can adopt all of the orphaned callbacks and place - * an rcu_barrier_callback() callback after them. When that is done, - * we are guaranteed to have an rcu_barrier_callback() callback - * following every callback that could possibly have been - * registered before _rcu_barrier() was called. - */ - raw_spin_lock_irqsave(&rsp->onofflock, flags); - rcu_adopt_orphan_cbs(rsp); - rsp->rcu_barrier_in_progress = NULL; - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); - atomic_inc(&rsp->barrier_cpu_count); - smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ - rd.rsp = rsp; - rsp->call(&rd.barrier_head, rcu_barrier_callback); + put_online_cpus(); /* * Now that we have an rcu_barrier_callback() callback on each @@ -2456,8 +2404,6 @@ static void _rcu_barrier(struct rcu_state *rsp) /* Other rcu_barrier() invocations can now safely proceed. */ mutex_unlock(&rsp->barrier_mutex); - - destroy_rcu_head_on_stack(&rd.barrier_head); } /** @@ -2523,6 +2469,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; + init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); @@ -2594,12 +2541,10 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, break; case CPU_ONLINE: case CPU_DOWN_FAILED: - rcu_node_kthread_setaffinity(rnp, -1); - rcu_cpu_kthread_setrt(cpu, 1); + rcu_boost_kthread_setaffinity(rnp, -1); break; case CPU_DOWN_PREPARE: - rcu_node_kthread_setaffinity(rnp, cpu); - rcu_cpu_kthread_setrt(cpu, 0); + rcu_boost_kthread_setaffinity(rnp, cpu); break; case CPU_DYING: case CPU_DYING_FROZEN: diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 4d29169f212..b82c88c1567 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -196,12 +196,6 @@ struct rcu_node { /* Refused to boost: not sure why, though. */ /* This can happen due to race conditions. */ #endif /* #ifdef CONFIG_RCU_BOOST */ - struct task_struct *node_kthread_task; - /* kthread that takes care of this rcu_node */ - /* structure, for example, awakening the */ - /* per-CPU kthreads as needed. */ - unsigned int node_kthread_status; - /* State of node_kthread_task for tracing. */ } ____cacheline_internodealigned_in_smp; /* @@ -398,9 +392,6 @@ struct rcu_state { struct rcu_head **orphan_donetail; /* Tail of above. */ long qlen_lazy; /* Number of lazy callbacks. */ long qlen; /* Total number of callbacks. */ - struct task_struct *rcu_barrier_in_progress; - /* Task doing rcu_barrier(), */ - /* or NULL if no barrier. */ struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ @@ -468,7 +459,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags); -static void rcu_stop_cpu_kthread(int cpu); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_print_detail_task_stall(struct rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); @@ -491,15 +481,9 @@ static void invoke_rcu_callbacks_kthread(void); static bool rcu_is_callbacks_kthread(void); #ifdef CONFIG_RCU_BOOST static void rcu_preempt_do_callbacks(void); -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, - cpumask_var_t cm); static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, - struct rcu_node *rnp, - int rnp_index); -static void invoke_rcu_node_kthread(struct rcu_node *rnp); -static void rcu_yield(void (*f)(unsigned long), unsigned long arg); + struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_BOOST */ -static void rcu_cpu_kthread_setrt(int cpu, int to_rt); static void __cpuinit rcu_prepare_kthreads(int cpu); static void rcu_prepare_for_idle_init(int cpu); static void rcu_cleanup_after_idle(int cpu); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 7f3244c0df0..c1961aed121 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -25,6 +25,7 @@ */ #include <linux/delay.h> +#include <linux/smpboot.h> #define RCU_KTHREAD_PRIO 1 @@ -1069,6 +1070,16 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp) #endif /* #else #ifdef CONFIG_RCU_TRACE */ +static void rcu_wake_cond(struct task_struct *t, int status) +{ + /* + * If the thread is yielding, only wake it when this + * is invoked from idle + */ + if (status != RCU_KTHREAD_YIELDING || is_idle_task(current)) + wake_up_process(t); +} + /* * Carry out RCU priority boosting on the task indicated by ->exp_tasks * or ->boost_tasks, advancing the pointer to the next task in the @@ -1141,17 +1152,6 @@ static int rcu_boost(struct rcu_node *rnp) } /* - * Timer handler to initiate waking up of boost kthreads that - * have yielded the CPU due to excessive numbers of tasks to - * boost. We wake up the per-rcu_node kthread, which in turn - * will wake up the booster kthread. - */ -static void rcu_boost_kthread_timer(unsigned long arg) -{ - invoke_rcu_node_kthread((struct rcu_node *)arg); -} - -/* * Priority-boosting kthread. One per leaf rcu_node and one for the * root rcu_node. */ @@ -1174,8 +1174,9 @@ static int rcu_boost_kthread(void *arg) else spincnt = 0; if (spincnt > 10) { + rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; trace_rcu_utilization("End boost kthread@rcu_yield"); - rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); + schedule_timeout_interruptible(2); trace_rcu_utilization("Start boost kthread@rcu_yield"); spincnt = 0; } @@ -1213,8 +1214,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) rnp->boost_tasks = rnp->gp_tasks; raw_spin_unlock_irqrestore(&rnp->lock, flags); t = rnp->boost_kthread_task; - if (t != NULL) - wake_up_process(t); + if (t) + rcu_wake_cond(t, rnp->boost_kthread_status); } else { rcu_initiate_boost_trace(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -1231,8 +1232,10 @@ static void invoke_rcu_callbacks_kthread(void) local_irq_save(flags); __this_cpu_write(rcu_cpu_has_work, 1); if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && - current != __this_cpu_read(rcu_cpu_kthread_task)) - wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); + current != __this_cpu_read(rcu_cpu_kthread_task)) { + rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task), + __this_cpu_read(rcu_cpu_kthread_status)); + } local_irq_restore(flags); } @@ -1245,21 +1248,6 @@ static bool rcu_is_callbacks_kthread(void) return __get_cpu_var(rcu_cpu_kthread_task) == current; } -/* - * Set the affinity of the boost kthread. The CPU-hotplug locks are - * held, so no one should be messing with the existence of the boost - * kthread. - */ -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, - cpumask_var_t cm) -{ - struct task_struct *t; - - t = rnp->boost_kthread_task; - if (t != NULL) - set_cpus_allowed_ptr(rnp->boost_kthread_task, cm); -} - #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) /* @@ -1276,15 +1264,19 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) * Returns zero if all is well, a negated errno otherwise. */ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, - struct rcu_node *rnp, - int rnp_index) + struct rcu_node *rnp) { + int rnp_index = rnp - &rsp->node[0]; unsigned long flags; struct sched_param sp; struct task_struct *t; if (&rcu_preempt_state != rsp) return 0; + + if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0) + return 0; + rsp->boost = 1; if (rnp->boost_kthread_task != NULL) return 0; @@ -1301,25 +1293,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, return 0; } -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Stop the RCU's per-CPU kthread when its CPU goes offline,. - */ -static void rcu_stop_cpu_kthread(int cpu) -{ - struct task_struct *t; - - /* Stop the CPU's kthread. */ - t = per_cpu(rcu_cpu_kthread_task, cpu); - if (t != NULL) { - per_cpu(rcu_cpu_kthread_task, cpu) = NULL; - kthread_stop(t); - } -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - static void rcu_kthread_do_work(void) { rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); @@ -1327,112 +1300,22 @@ static void rcu_kthread_do_work(void) rcu_preempt_do_callbacks(); } -/* - * Wake up the specified per-rcu_node-structure kthread. - * Because the per-rcu_node kthreads are immortal, we don't need - * to do anything to keep them alive. - */ -static void invoke_rcu_node_kthread(struct rcu_node *rnp) +static void rcu_cpu_kthread_setup(unsigned int cpu) { - struct task_struct *t; - - t = rnp->node_kthread_task; - if (t != NULL) - wake_up_process(t); -} - -/* - * Set the specified CPU's kthread to run RT or not, as specified by - * the to_rt argument. The CPU-hotplug locks are held, so the task - * is not going away. - */ -static void rcu_cpu_kthread_setrt(int cpu, int to_rt) -{ - int policy; struct sched_param sp; - struct task_struct *t; - - t = per_cpu(rcu_cpu_kthread_task, cpu); - if (t == NULL) - return; - if (to_rt) { - policy = SCHED_FIFO; - sp.sched_priority = RCU_KTHREAD_PRIO; - } else { - policy = SCHED_NORMAL; - sp.sched_priority = 0; - } - sched_setscheduler_nocheck(t, policy, &sp); -} - -/* - * Timer handler to initiate the waking up of per-CPU kthreads that - * have yielded the CPU due to excess numbers of RCU callbacks. - * We wake up the per-rcu_node kthread, which in turn will wake up - * the booster kthread. - */ -static void rcu_cpu_kthread_timer(unsigned long arg) -{ - struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg); - struct rcu_node *rnp = rdp->mynode; - atomic_or(rdp->grpmask, &rnp->wakemask); - invoke_rcu_node_kthread(rnp); + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); } -/* - * Drop to non-real-time priority and yield, but only after posting a - * timer that will cause us to regain our real-time priority if we - * remain preempted. Either way, we restore our real-time priority - * before returning. - */ -static void rcu_yield(void (*f)(unsigned long), unsigned long arg) +static void rcu_cpu_kthread_park(unsigned int cpu) { - struct sched_param sp; - struct timer_list yield_timer; - int prio = current->rt_priority; - - setup_timer_on_stack(&yield_timer, f, arg); - mod_timer(&yield_timer, jiffies + 2); - sp.sched_priority = 0; - sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); - set_user_nice(current, 19); - schedule(); - set_user_nice(current, 0); - sp.sched_priority = prio; - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); - del_timer(&yield_timer); + per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; } -/* - * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU. - * This can happen while the corresponding CPU is either coming online - * or going offline. We cannot wait until the CPU is fully online - * before starting the kthread, because the various notifier functions - * can wait for RCU grace periods. So we park rcu_cpu_kthread() until - * the corresponding CPU is online. - * - * Return 1 if the kthread needs to stop, 0 otherwise. - * - * Caller must disable bh. This function can momentarily enable it. - */ -static int rcu_cpu_kthread_should_stop(int cpu) +static int rcu_cpu_kthread_should_run(unsigned int cpu) { - while (cpu_is_offline(cpu) || - !cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu)) || - smp_processor_id() != cpu) { - if (kthread_should_stop()) - return 1; - per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; - per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id(); - local_bh_enable(); - schedule_timeout_uninterruptible(1); - if (!cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu))) - set_cpus_allowed_ptr(current, cpumask_of(cpu)); - local_bh_disable(); - } - per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; - return 0; + return __get_cpu_var(rcu_cpu_has_work); } /* @@ -1440,138 +1323,35 @@ static int rcu_cpu_kthread_should_stop(int cpu) * RCU softirq used in flavors and configurations of RCU that do not * support RCU priority boosting. */ -static int rcu_cpu_kthread(void *arg) +static void rcu_cpu_kthread(unsigned int cpu) { - int cpu = (int)(long)arg; - unsigned long flags; - int spincnt = 0; - unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu); - char work; - char *workp = &per_cpu(rcu_cpu_has_work, cpu); + unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status); + char work, *workp = &__get_cpu_var(rcu_cpu_has_work); + int spincnt; - trace_rcu_utilization("Start CPU kthread@init"); - for (;;) { - *statusp = RCU_KTHREAD_WAITING; - trace_rcu_utilization("End CPU kthread@rcu_wait"); - rcu_wait(*workp != 0 || kthread_should_stop()); + for (spincnt = 0; spincnt < 10; spincnt++) { trace_rcu_utilization("Start CPU kthread@rcu_wait"); local_bh_disable(); - if (rcu_cpu_kthread_should_stop(cpu)) { - local_bh_enable(); - break; - } *statusp = RCU_KTHREAD_RUNNING; - per_cpu(rcu_cpu_kthread_loops, cpu)++; - local_irq_save(flags); + this_cpu_inc(rcu_cpu_kthread_loops); + local_irq_disable(); work = *workp; *workp = 0; - local_irq_restore(flags); + local_irq_enable(); if (work) rcu_kthread_do_work(); local_bh_enable(); - if (*workp != 0) - spincnt++; - else - spincnt = 0; - if (spincnt > 10) { - *statusp = RCU_KTHREAD_YIELDING; - trace_rcu_utilization("End CPU kthread@rcu_yield"); - rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); - trace_rcu_utilization("Start CPU kthread@rcu_yield"); - spincnt = 0; - } - } - *statusp = RCU_KTHREAD_STOPPED; - trace_rcu_utilization("End CPU kthread@term"); - return 0; -} - -/* - * Spawn a per-CPU kthread, setting up affinity and priority. - * Because the CPU hotplug lock is held, no other CPU will be attempting - * to manipulate rcu_cpu_kthread_task. There might be another CPU - * attempting to access it during boot, but the locking in kthread_bind() - * will enforce sufficient ordering. - * - * Please note that we cannot simply refuse to wake up the per-CPU - * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state, - * which can result in softlockup complaints if the task ends up being - * idle for more than a couple of minutes. - * - * However, please note also that we cannot bind the per-CPU kthread to its - * CPU until that CPU is fully online. We also cannot wait until the - * CPU is fully online before we create its per-CPU kthread, as this would - * deadlock the system when CPU notifiers tried waiting for grace - * periods. So we bind the per-CPU kthread to its CPU only if the CPU - * is online. If its CPU is not yet fully online, then the code in - * rcu_cpu_kthread() will wait until it is fully online, and then do - * the binding. - */ -static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) -{ - struct sched_param sp; - struct task_struct *t; - - if (!rcu_scheduler_fully_active || - per_cpu(rcu_cpu_kthread_task, cpu) != NULL) - return 0; - t = kthread_create_on_node(rcu_cpu_kthread, - (void *)(long)cpu, - cpu_to_node(cpu), - "rcuc/%d", cpu); - if (IS_ERR(t)) - return PTR_ERR(t); - if (cpu_online(cpu)) - kthread_bind(t, cpu); - per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; - WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); - sp.sched_priority = RCU_KTHREAD_PRIO; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - per_cpu(rcu_cpu_kthread_task, cpu) = t; - wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */ - return 0; -} - -/* - * Per-rcu_node kthread, which is in charge of waking up the per-CPU - * kthreads when needed. We ignore requests to wake up kthreads - * for offline CPUs, which is OK because force_quiescent_state() - * takes care of this case. - */ -static int rcu_node_kthread(void *arg) -{ - int cpu; - unsigned long flags; - unsigned long mask; - struct rcu_node *rnp = (struct rcu_node *)arg; - struct sched_param sp; - struct task_struct *t; - - for (;;) { - rnp->node_kthread_status = RCU_KTHREAD_WAITING; - rcu_wait(atomic_read(&rnp->wakemask) != 0); - rnp->node_kthread_status = RCU_KTHREAD_RUNNING; - raw_spin_lock_irqsave(&rnp->lock, flags); - mask = atomic_xchg(&rnp->wakemask, 0); - rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) { - if ((mask & 0x1) == 0) - continue; - preempt_disable(); - t = per_cpu(rcu_cpu_kthread_task, cpu); - if (!cpu_online(cpu) || t == NULL) { - preempt_enable(); - continue; - } - per_cpu(rcu_cpu_has_work, cpu) = 1; - sp.sched_priority = RCU_KTHREAD_PRIO; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - preempt_enable(); + if (*workp == 0) { + trace_rcu_utilization("End CPU kthread@rcu_wait"); + *statusp = RCU_KTHREAD_WAITING; + return; } } - /* NOTREACHED */ - rnp->node_kthread_status = RCU_KTHREAD_STOPPED; - return 0; + *statusp = RCU_KTHREAD_YIELDING; + trace_rcu_utilization("Start CPU kthread@rcu_yield"); + schedule_timeout_interruptible(2); + trace_rcu_utilization("End CPU kthread@rcu_yield"); + *statusp = RCU_KTHREAD_WAITING; } /* @@ -1583,17 +1363,17 @@ static int rcu_node_kthread(void *arg) * no outgoing CPU. If there are no CPUs left in the affinity set, * this function allows the kthread to execute on any CPU. */ -static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { + struct task_struct *t = rnp->boost_kthread_task; + unsigned long mask = rnp->qsmaskinit; cpumask_var_t cm; int cpu; - unsigned long mask = rnp->qsmaskinit; - if (rnp->node_kthread_task == NULL) + if (!t) return; - if (!alloc_cpumask_var(&cm, GFP_KERNEL)) + if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) return; - cpumask_clear(cm); for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) if ((mask & 0x1) && cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); @@ -1603,62 +1383,36 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) cpumask_clear_cpu(cpu, cm); WARN_ON_ONCE(cpumask_weight(cm) == 0); } - set_cpus_allowed_ptr(rnp->node_kthread_task, cm); - rcu_boost_kthread_setaffinity(rnp, cm); + set_cpus_allowed_ptr(t, cm); free_cpumask_var(cm); } -/* - * Spawn a per-rcu_node kthread, setting priority and affinity. - * Called during boot before online/offline can happen, or, if - * during runtime, with the main CPU-hotplug locks held. So only - * one of these can be executing at a time. - */ -static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, - struct rcu_node *rnp) -{ - unsigned long flags; - int rnp_index = rnp - &rsp->node[0]; - struct sched_param sp; - struct task_struct *t; - - if (!rcu_scheduler_fully_active || - rnp->qsmaskinit == 0) - return 0; - if (rnp->node_kthread_task == NULL) { - t = kthread_create(rcu_node_kthread, (void *)rnp, - "rcun/%d", rnp_index); - if (IS_ERR(t)) - return PTR_ERR(t); - raw_spin_lock_irqsave(&rnp->lock, flags); - rnp->node_kthread_task = t; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - sp.sched_priority = 99; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ - } - return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index); -} +static struct smp_hotplug_thread rcu_cpu_thread_spec = { + .store = &rcu_cpu_kthread_task, + .thread_should_run = rcu_cpu_kthread_should_run, + .thread_fn = rcu_cpu_kthread, + .thread_comm = "rcuc/%u", + .setup = rcu_cpu_kthread_setup, + .park = rcu_cpu_kthread_park, +}; /* * Spawn all kthreads -- called as soon as the scheduler is running. */ static int __init rcu_spawn_kthreads(void) { - int cpu; struct rcu_node *rnp; + int cpu; rcu_scheduler_fully_active = 1; - for_each_possible_cpu(cpu) { + for_each_possible_cpu(cpu) per_cpu(rcu_cpu_has_work, cpu) = 0; - if (cpu_online(cpu)) - (void)rcu_spawn_one_cpu_kthread(cpu); - } + BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); rnp = rcu_get_root(rcu_state); - (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); if (NUM_RCU_NODES > 1) { rcu_for_each_leaf_node(rcu_state, rnp) - (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); } return 0; } @@ -1670,11 +1424,8 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) struct rcu_node *rnp = rdp->mynode; /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ - if (rcu_scheduler_fully_active) { - (void)rcu_spawn_one_cpu_kthread(cpu); - if (rnp->node_kthread_task == NULL) - (void)rcu_spawn_one_node_kthread(rcu_state, rnp); - } + if (rcu_scheduler_fully_active) + (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); } #else /* #ifdef CONFIG_RCU_BOOST */ @@ -1698,19 +1449,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) { } -#ifdef CONFIG_HOTPLUG_CPU - -static void rcu_stop_cpu_kthread(int cpu) -{ -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - -static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) -{ -} - -static void rcu_cpu_kthread_setrt(int cpu, int to_rt) +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { } diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index abffb486e94..ca9042ef368 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -51,8 +51,8 @@ static int show_rcubarrier(struct seq_file *m, void *unused) struct rcu_state *rsp; for_each_rcu_flavor(rsp) - seq_printf(m, "%s: %c bcc: %d nbd: %lu\n", - rsp->name, rsp->rcu_barrier_in_progress ? 'B' : '.', + seq_printf(m, "%s: bcc: %d nbd: %lu\n", + rsp->name, atomic_read(&rsp->barrier_cpu_count), rsp->n_barrier_done); return 0; @@ -108,11 +108,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->nxttail[RCU_WAIT_TAIL]], ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); #ifdef CONFIG_RCU_BOOST - seq_printf(m, " kt=%d/%c/%d ktl=%x", + seq_printf(m, " kt=%d/%c ktl=%x", per_cpu(rcu_cpu_has_work, rdp->cpu), convert_kthread_status(per_cpu(rcu_cpu_kthread_status, rdp->cpu)), - per_cpu(rcu_cpu_kthread_cpu, rdp->cpu), per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); #endif /* #ifdef CONFIG_RCU_BOOST */ seq_printf(m, " b=%ld", rdp->blimit); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d325c4b2dcb..d2b8939ccf7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1109,6 +1109,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) trace_sched_migrate_task(p, new_cpu); if (task_cpu(p) != new_cpu) { + if (p->sched_class->migrate_task_rq) + p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); } @@ -1713,6 +1715,19 @@ static void __sched_fork(struct task_struct *p) p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */ +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) + p->se.avg.runnable_avg_period = 0; + p->se.avg.runnable_avg_sum = 0; +#ifdef CONFIG_SCHED_HMP + p->se.avg.hmp_last_up_migration = 0; + p->se.avg.hmp_last_down_migration = 0; +#endif +#endif #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif @@ -3142,6 +3157,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) #endif +static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) +{ + u64 temp = (__force u64) rtime; + + temp *= (__force u64) utime; + + if (sizeof(cputime_t) == 4) + temp = div_u64(temp, (__force u32) total); + else + temp = div64_u64(temp, (__force u64) total); + + return (__force cputime_t) temp; +} + void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { cputime_t rtime, utime = p->utime, total = utime + p->stime; @@ -3151,13 +3180,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) */ rtime = nsecs_to_cputime(p->se.sum_exec_runtime); - if (total) { - u64 temp = (__force u64) rtime; - - temp *= (__force u64) utime; - do_div(temp, (__force u32) total); - utime = (__force cputime_t) temp; - } else + if (total) + utime = scale_utime(utime, rtime, total); + else utime = rtime; /* @@ -3184,13 +3209,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) total = cputime.utime + cputime.stime; rtime = nsecs_to_cputime(cputime.sum_exec_runtime); - if (total) { - u64 temp = (__force u64) rtime; - - temp *= (__force u64) cputime.utime; - do_div(temp, (__force u32) total); - utime = (__force cputime_t) temp; - } else + if (total) + utime = scale_utime(cputime.utime, rtime, total); + else utime = rtime; sig->prev_utime = max(sig->prev_utime, utime); @@ -4340,9 +4361,7 @@ recheck: */ if (unlikely(policy == p->policy && (!rt_policy(policy) || param->sched_priority == p->rt_priority))) { - - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); return 0; } @@ -5300,27 +5319,17 @@ void idle_task_exit(void) } /* - * While a dead CPU has no uninterruptible tasks queued at this point, - * it might still have a nonzero ->nr_uninterruptible counter, because - * for performance reasons the counter is not stricly tracking tasks to - * their home CPUs. So we just add the counter to another CPU's counter, - * to keep the global sum constant after CPU-down: - */ -static void migrate_nr_uninterruptible(struct rq *rq_src) -{ - struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); - - rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; - rq_src->nr_uninterruptible = 0; -} - -/* - * remove the tasks which were accounted by rq from calc_load_tasks. + * Since this CPU is going 'away' for a while, fold any nr_active delta + * we might have. Assumes we're called after migrate_tasks() so that the + * nr_active count is stable. + * + * Also see the comment "Global load-average calculations". */ -static void calc_global_load_remove(struct rq *rq) +static void calc_load_migrate(struct rq *rq) { - atomic_long_sub(rq->calc_load_active, &calc_load_tasks); - rq->calc_load_active = 0; + long delta = calc_load_fold_active(rq); + if (delta) + atomic_long_add(delta, &calc_load_tasks); } /* @@ -5348,9 +5357,6 @@ static void migrate_tasks(unsigned int dead_cpu) */ rq->stop = NULL; - /* Ensure any throttled groups are reachable by pick_next_task */ - unthrottle_offline_cfs_rqs(rq); - for ( ; ; ) { /* * There's this thread running, bail when that's the only @@ -5614,8 +5620,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) BUG_ON(rq->nr_running != 1); /* the migration thread */ raw_spin_unlock_irqrestore(&rq->lock, flags); - migrate_nr_uninterruptible(rq); - calc_global_load_remove(rq); + calc_load_migrate(rq); break; #endif } @@ -6024,11 +6029,6 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this * allows us to avoid some pointer chasing select_idle_sibling(). * - * Iterate domains and sched_groups downward, assigning CPUs to be - * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing - * due to random perturbation self canceling, ie sw buddies pull - * their counterpart to their CPU's hw counterpart. - * * Also keep a unique ID per domain (we use the first cpu number in * the cpumask of the domain), this allows us to quickly tell if * two cpus are in the same cache domain, see cpus_share_cache(). @@ -6042,40 +6042,8 @@ static void update_top_cache_domain(int cpu) int id = cpu; sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); - if (sd) { - struct sched_domain *tmp = sd; - struct sched_group *sg, *prev; - bool right; - - /* - * Traverse to first CPU in group, and count hops - * to cpu from there, switching direction on each - * hop, never ever pointing the last CPU rightward. - */ - do { - id = cpumask_first(sched_domain_span(tmp)); - prev = sg = tmp->groups; - right = 1; - - while (cpumask_first(sched_group_cpus(sg)) != id) - sg = sg->next; - - while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) { - prev = sg; - sg = sg->next; - right = !right; - } - - /* A CPU went down, never point back to domain start. */ - if (right && cpumask_first(sched_group_cpus(sg->next)) == id) - right = false; - - sg = right ? sg->next : prev; - tmp->idle_buddy = cpumask_first(sched_group_cpus(sg)); - } while ((tmp = tmp->child)); - + if (sd) id = cpumask_first(sched_domain_span(sd)); - } rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_id, cpu) = id; @@ -7248,6 +7216,7 @@ int in_sched_functions(unsigned long addr) #ifdef CONFIG_CGROUP_SCHED struct task_group root_task_group; +LIST_HEAD(task_groups); #endif DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index d72586fdf66..23aa789c53e 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -65,8 +65,8 @@ static int convert_prio(int prio) int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask) { - int idx = 0; - int task_pri = convert_prio(p->prio); + int idx = 0; + int task_pri = convert_prio(p->prio); if (task_pri >= MAX_RT_PRIO) return 0; @@ -137,9 +137,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, */ void cpupri_set(struct cpupri *cp, int cpu, int newpri) { - int *currpri = &cp->cpu_to_pri[cpu]; - int oldpri = *currpri; - int do_mb = 0; + int *currpri = &cp->cpu_to_pri[cpu]; + int oldpri = *currpri; + int do_mb = 0; newpri = convert_prio(newpri); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 6f79596e0ea..b9d54d0d7bb 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -61,14 +61,20 @@ static unsigned long nsec_low(unsigned long long nsec) static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { struct sched_entity *se = tg->se[cpu]; - if (!se) - return; #define P(F) \ SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) #define PN(F) \ SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) + if (!se) { + struct sched_avg *avg = &cpu_rq(cpu)->avg; + P(avg->runnable_avg_sum); + P(avg->runnable_avg_period); + return; + } + + PN(se->exec_start); PN(se->vruntime); PN(se->sum_exec_runtime); @@ -85,6 +91,13 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P(se->statistics.wait_count); #endif P(se->load.weight); +#ifdef CONFIG_SMP + P(se->avg.runnable_avg_sum); + P(se->avg.runnable_avg_period); + P(se->avg.usage_avg_sum); + P(se->avg.load_avg_contrib); + P(se->avg.decay_count); +#endif #undef PN #undef P } @@ -206,14 +219,20 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", - SPLIT_NS(cfs_rq->load_avg)); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", - SPLIT_NS(cfs_rq->load_period)); - SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", - cfs_rq->load_contribution); - SEQ_printf(m, " .%-30s: %d\n", "load_tg", - atomic_read(&cfs_rq->tg->load_weight)); + SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg", + cfs_rq->runnable_load_avg); + SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", + cfs_rq->blocked_load_avg); + SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", + atomic64_read(&cfs_rq->tg->load_avg)); + SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib", + cfs_rq->tg_load_contrib); + SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", + cfs_rq->tg_runnable_contrib); + SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", + atomic_read(&cfs_rq->tg->runnable_avg)); + SEQ_printf(m, " .%-30s: %d\n", "tg->usage_avg", + atomic_read(&cfs_rq->tg->usage_avg)); #endif print_cfs_group_stats(m, cpu, cfs_rq->tg); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 22321db6495..28b78081767 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -653,9 +653,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) return calc_delta_fair(sched_slice(cfs_rq, se), se); } -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); -static void update_cfs_shares(struct cfs_rq *cfs_rq); - /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -675,10 +672,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, curr->vruntime += delta_exec_weighted; update_min_vruntime(cfs_rq); - -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED - cfs_rq->load_unacc_exec_time += delta_exec; -#endif } static void update_curr(struct cfs_rq *cfs_rq) @@ -801,72 +794,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_FAIR_GROUP_SCHED -/* we need this in update_cfs_load and load-balance functions below */ -static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); # ifdef CONFIG_SMP -static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, - int global_update) -{ - struct task_group *tg = cfs_rq->tg; - long load_avg; - - load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); - load_avg -= cfs_rq->load_contribution; - - if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) { - atomic_add(load_avg, &tg->load_weight); - cfs_rq->load_contribution += load_avg; - } -} - -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) -{ - u64 period = sysctl_sched_shares_window; - u64 now, delta; - unsigned long load = cfs_rq->load.weight; - - if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq)) - return; - - now = rq_of(cfs_rq)->clock_task; - delta = now - cfs_rq->load_stamp; - - /* truncate load history at 4 idle periods */ - if (cfs_rq->load_stamp > cfs_rq->load_last && - now - cfs_rq->load_last > 4 * period) { - cfs_rq->load_period = 0; - cfs_rq->load_avg = 0; - delta = period - 1; - } - - cfs_rq->load_stamp = now; - cfs_rq->load_unacc_exec_time = 0; - cfs_rq->load_period += delta; - if (load) { - cfs_rq->load_last = now; - cfs_rq->load_avg += delta * load; - } - - /* consider updating load contribution on each fold or truncate */ - if (global_update || cfs_rq->load_period > period - || !cfs_rq->load_period) - update_cfs_rq_load_contribution(cfs_rq, global_update); - - while (cfs_rq->load_period > period) { - /* - * Inline assembly required to prevent the compiler - * optimising this loop into a divmod call. - * See __iter_div_u64_rem() for another example of this. - */ - asm("" : "+rm" (cfs_rq->load_period)); - cfs_rq->load_period /= 2; - cfs_rq->load_avg /= 2; - } - - if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg) - list_del_leaf_cfs_rq(cfs_rq); -} - static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) { long tg_weight; @@ -876,8 +804,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) * to gain a more accurate current total weight. See * update_cfs_rq_load_contribution(). */ - tg_weight = atomic_read(&tg->load_weight); - tg_weight -= cfs_rq->load_contribution; + tg_weight = atomic64_read(&tg->load_avg); + tg_weight -= cfs_rq->tg_load_contrib; tg_weight += cfs_rq->load.weight; return tg_weight; @@ -901,27 +829,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) return shares; } - -static void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ - if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { - update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq); - } -} # else /* CONFIG_SMP */ -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) -{ -} - static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { return tg->shares; } - -static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ -} # endif /* CONFIG_SMP */ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) @@ -939,6 +851,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, account_entity_enqueue(cfs_rq, se); } +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); + static void update_cfs_shares(struct cfs_rq *cfs_rq) { struct task_group *tg; @@ -958,18 +872,507 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) reweight_entity(cfs_rq_of(se), se, shares); } #else /* CONFIG_FAIR_GROUP_SCHED */ -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) +static inline void update_cfs_shares(struct cfs_rq *cfs_rq) { } +#endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */ +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) +/* + * We choose a half-life close to 1 scheduling period. + * Note: The tables below are dependent on this value. + */ +#define LOAD_AVG_PERIOD 32 +#define LOAD_AVG_MAX 47765 /* maximum possible load avg */ +#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ + +/* Precomputed fixed inverse multiplies for multiplication by y^n */ +static const u32 runnable_avg_yN_inv[] = { + 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, + 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, + 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, + 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9, + 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80, + 0x85aac367, 0x82cd8698, +}; + +/* + * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent + * over-estimates when re-combining. + */ +static const u32 runnable_avg_yN_sum[] = { + 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103, + 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082, + 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371, +}; + +/* + * Approximate: + * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) + */ +static __always_inline u64 decay_load(u64 val, u64 n) { + int local_n; + if (!n) + return val; + else if (unlikely(n > LOAD_AVG_PERIOD * 63)) + return 0; + + /* will be 32 bits if that's desirable */ + local_n = n; + + /* + * As y^PERIOD = 1/2, we can combine + * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) + * With a look-up table which covers k^n (n<PERIOD) + * + * To achieve constant time decay_load. + */ + if (unlikely(local_n >= LOAD_AVG_PERIOD)) { + val >>= local_n / LOAD_AVG_PERIOD; + n %= LOAD_AVG_PERIOD; + } + + val *= runnable_avg_yN_inv[local_n]; + return SRR(val, 32); } -static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) +/* + * For updates fully spanning n periods, the contribution to runnable + * average will be: \Sum 1024*y^n + * + * We can compute this reasonably efficiently by combining: + * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD} + */ +static u32 __compute_runnable_contrib(u64 n) { + u32 contrib = 0; + + if (likely(n <= LOAD_AVG_PERIOD)) + return runnable_avg_yN_sum[n]; + else if (unlikely(n >= LOAD_AVG_MAX_N)) + return LOAD_AVG_MAX; + + /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */ + do { + contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */ + contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD]; + + n -= LOAD_AVG_PERIOD; + } while (n > LOAD_AVG_PERIOD); + + contrib = decay_load(contrib, n); + return contrib + runnable_avg_yN_sum[n]; } -#endif /* CONFIG_FAIR_GROUP_SCHED */ + +/* We can represent the historical contribution to runnable average as the + * coefficients of a geometric series. To do this we sub-divide our runnable + * history into segments of approximately 1ms (1024us); label the segment that + * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. + * + * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ... + * p0 p1 p1 + * (now) (~1ms ago) (~2ms ago) + * + * Let u_i denote the fraction of p_i that the entity was runnable. + * + * We then designate the fractions u_i as our co-efficients, yielding the + * following representation of historical load: + * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ... + * + * We choose y based on the with of a reasonably scheduling period, fixing: + * y^32 = 0.5 + * + * This means that the contribution to load ~32ms ago (u_32) will be weighted + * approximately half as much as the contribution to load within the last ms + * (u_0). + * + * When a period "rolls over" and we have new u_0`, multiplying the previous + * sum again by y is sufficient to update: + * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) + * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1] + */ +static __always_inline int __update_entity_runnable_avg(u64 now, + struct sched_avg *sa, + int runnable, + int running) +{ + u64 delta, periods; + u32 runnable_contrib; + int delta_w, decayed = 0; + + delta = now - sa->last_runnable_update; + /* + * This should only happen when time goes backwards, which it + * unfortunately does during sched clock init when we swap over to TSC. + */ + if ((s64)delta < 0) { + sa->last_runnable_update = now; + return 0; + } + + /* + * Use 1024ns as the unit of measurement since it's a reasonable + * approximation of 1us and fast to compute. + */ + delta >>= 10; + if (!delta) + return 0; + sa->last_runnable_update = now; + + /* delta_w is the amount already accumulated against our next period */ + delta_w = sa->runnable_avg_period % 1024; + if (delta + delta_w >= 1024) { + /* period roll-over */ + decayed = 1; + + /* + * Now that we know we're crossing a period boundary, figure + * out how much from delta we need to complete the current + * period and accrue it. + */ + delta_w = 1024 - delta_w; + if (runnable) + sa->runnable_avg_sum += delta_w; + if (running) + sa->usage_avg_sum += delta_w; + sa->runnable_avg_period += delta_w; + + delta -= delta_w; + + /* Figure out how many additional periods this update spans */ + periods = delta / 1024; + delta %= 1024; + + sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, + periods + 1); + sa->runnable_avg_period = decay_load(sa->runnable_avg_period, + periods + 1); + sa->usage_avg_sum = decay_load(sa->usage_avg_sum, periods + 1); + + /* Efficiently calculate \sum (1..n_period) 1024*y^i */ + runnable_contrib = __compute_runnable_contrib(periods); + if (runnable) + sa->runnable_avg_sum += runnable_contrib; + if (running) + sa->usage_avg_sum += runnable_contrib; + sa->runnable_avg_period += runnable_contrib; + } + + /* Remainder of delta accrued against u_0` */ + if (runnable) + sa->runnable_avg_sum += delta; + if (running) + sa->usage_avg_sum += delta; + sa->runnable_avg_period += delta; + + return decayed; +} + +/* Synchronize an entity's decay with its parenting cfs_rq.*/ +static inline u64 __synchronize_entity_decay(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 decays = atomic64_read(&cfs_rq->decay_counter); + + decays -= se->avg.decay_count; + if (!decays) + return 0; + + se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); + se->avg.decay_count = 0; + + return decays; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, + int force_update) +{ + struct task_group *tg = cfs_rq->tg; + s64 tg_contrib; + + tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; + tg_contrib -= cfs_rq->tg_load_contrib; + + if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) { + atomic64_add(tg_contrib, &tg->load_avg); + cfs_rq->tg_load_contrib += tg_contrib; + } +} + +/* + * Aggregate cfs_rq runnable averages into an equivalent task_group + * representation for computing load contributions. + */ +static inline void __update_tg_runnable_avg(struct sched_avg *sa, + struct cfs_rq *cfs_rq) +{ + struct task_group *tg = cfs_rq->tg; + long contrib, usage_contrib; + + /* The fraction of a cpu used by this cfs_rq */ + contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT, + sa->runnable_avg_period + 1); + contrib -= cfs_rq->tg_runnable_contrib; + + usage_contrib = div_u64(sa->usage_avg_sum << NICE_0_SHIFT, + sa->runnable_avg_period + 1); + usage_contrib -= cfs_rq->tg_usage_contrib; + + /* + * contrib/usage at this point represent deltas, only update if they + * are substantive. + */ + if ((abs(contrib) > cfs_rq->tg_runnable_contrib / 64) || + (abs(usage_contrib) > cfs_rq->tg_usage_contrib / 64)) { + atomic_add(contrib, &tg->runnable_avg); + cfs_rq->tg_runnable_contrib += contrib; + + atomic_add(usage_contrib, &tg->usage_avg); + cfs_rq->tg_usage_contrib += usage_contrib; + } +} + +static inline void __update_group_entity_contrib(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = group_cfs_rq(se); + struct task_group *tg = cfs_rq->tg; + int runnable_avg; + + u64 contrib; + + contrib = cfs_rq->tg_load_contrib * tg->shares; + se->avg.load_avg_contrib = div64_u64(contrib, + atomic64_read(&tg->load_avg) + 1); + + /* + * For group entities we need to compute a correction term in the case + * that they are consuming <1 cpu so that we would contribute the same + * load as a task of equal weight. + * + * Explicitly co-ordinating this measurement would be expensive, but + * fortunately the sum of each cpus contribution forms a usable + * lower-bound on the true value. + * + * Consider the aggregate of 2 contributions. Either they are disjoint + * (and the sum represents true value) or they are disjoint and we are + * understating by the aggregate of their overlap. + * + * Extending this to N cpus, for a given overlap, the maximum amount we + * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of + * cpus that overlap for this interval and w_i is the interval width. + * + * On a small machine; the first term is well-bounded which bounds the + * total error since w_i is a subset of the period. Whereas on a + * larger machine, while this first term can be larger, if w_i is the + * of consequential size guaranteed to see n_i*w_i quickly converge to + * our upper bound of 1-cpu. + */ + runnable_avg = atomic_read(&tg->runnable_avg); + if (runnable_avg < NICE_0_LOAD) { + se->avg.load_avg_contrib *= runnable_avg; + se->avg.load_avg_contrib >>= NICE_0_SHIFT; + } +} +#else +static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, + int force_update) {} +static inline void __update_tg_runnable_avg(struct sched_avg *sa, + struct cfs_rq *cfs_rq) {} +static inline void __update_group_entity_contrib(struct sched_entity *se) {} +#endif + +static inline void __update_task_entity_contrib(struct sched_entity *se) +{ + u32 contrib; + + /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ + contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); + contrib /= (se->avg.runnable_avg_period + 1); + se->avg.load_avg_contrib = scale_load(contrib); + trace_sched_task_load_contrib(task_of(se), se->avg.load_avg_contrib); + contrib = se->avg.runnable_avg_sum * scale_load_down(NICE_0_LOAD); + contrib /= (se->avg.runnable_avg_period + 1); + se->avg.load_avg_ratio = scale_load(contrib); + trace_sched_task_runnable_ratio(task_of(se), se->avg.load_avg_ratio); +} + +/* Compute the current contribution to load_avg by se, return any delta */ +static long __update_entity_load_avg_contrib(struct sched_entity *se) +{ + long old_contrib = se->avg.load_avg_contrib; + + if (entity_is_task(se)) { + __update_task_entity_contrib(se); + } else { + __update_tg_runnable_avg(&se->avg, group_cfs_rq(se)); + __update_group_entity_contrib(se); + } + + return se->avg.load_avg_contrib - old_contrib; +} + +static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, + long load_contrib) +{ + if (likely(load_contrib < cfs_rq->blocked_load_avg)) + cfs_rq->blocked_load_avg -= load_contrib; + else + cfs_rq->blocked_load_avg = 0; +} + +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); + +/* Update a sched_entity's runnable average */ +static inline void update_entity_load_avg(struct sched_entity *se, + int update_cfs_rq) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + long contrib_delta; + u64 now; + + /* + * For a group entity we need to use their owned cfs_rq_clock_task() in + * case they are the parent of a throttled hierarchy. + */ + if (entity_is_task(se)) + now = cfs_rq_clock_task(cfs_rq); + else + now = cfs_rq_clock_task(group_cfs_rq(se)); + + if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq, + cfs_rq->curr == se)) + return; + + contrib_delta = __update_entity_load_avg_contrib(se); + + if (!update_cfs_rq) + return; + + if (se->on_rq) + cfs_rq->runnable_load_avg += contrib_delta; + else + subtract_blocked_load_contrib(cfs_rq, -contrib_delta); +} + +/* + * Decay the load contributed by all blocked children and account this so that + * their contribution may appropriately discounted when they wake up. + */ +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) +{ + u64 now = cfs_rq_clock_task(cfs_rq) >> 20; + u64 decays; + + decays = now - cfs_rq->last_decay; + if (!decays && !force_update) + return; + + if (atomic64_read(&cfs_rq->removed_load)) { + u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0); + subtract_blocked_load_contrib(cfs_rq, removed_load); + } + + if (decays) { + cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, + decays); + atomic64_add(decays, &cfs_rq->decay_counter); + cfs_rq->last_decay = now; + } + + __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); + update_cfs_shares(cfs_rq); +} + +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) +{ + u32 contrib; + __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable, + runnable); + __update_tg_runnable_avg(&rq->avg, &rq->cfs); + contrib = rq->avg.runnable_avg_sum * scale_load_down(1024); + contrib /= (rq->avg.runnable_avg_period + 1); + trace_sched_rq_runnable_ratio(cpu_of(rq), scale_load(contrib)); + trace_sched_rq_runnable_load(cpu_of(rq), rq->cfs.runnable_load_avg); +} + +/* Add the load generated by se into cfs_rq's child load-average */ +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se, + int wakeup) +{ + /* + * We track migrations using entity decay_count <= 0, on a wake-up + * migration we use a negative decay count to track the remote decays + * accumulated while sleeping. + */ + if (unlikely(se->avg.decay_count <= 0)) { + se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; + if (se->avg.decay_count) { + /* + * In a wake-up migration we have to approximate the + * time sleeping. This is because we can't synchronize + * clock_task between the two cpus, and it is not + * guaranteed to be read-safe. Instead, we can + * approximate this using our carried decays, which are + * explicitly atomically readable. + */ + se->avg.last_runnable_update -= (-se->avg.decay_count) + << 20; + update_entity_load_avg(se, 0); + /* Indicate that we're now synchronized and on-rq */ + se->avg.decay_count = 0; + } + wakeup = 0; + } else { + __synchronize_entity_decay(se); + } + + /* migrated tasks did not contribute to our blocked load */ + if (wakeup) { + subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); + update_entity_load_avg(se, 0); + } + + cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; + /* we force update consideration on load-balancer moves */ + update_cfs_rq_blocked_load(cfs_rq, !wakeup); +} + +/* + * Remove se's load from this cfs_rq child load-average, if the entity is + * transitioning to a blocked state we track its projected decay using + * blocked_load_avg. + */ +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se, + int sleep) +{ + update_entity_load_avg(se, 1); + /* we force update consideration on load-balancer moves */ + update_cfs_rq_blocked_load(cfs_rq, !sleep); + + cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; + if (sleep) { + cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; + se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); + } /* migrations, e.g. sleep=0 leave decay_count == 0 */ +} +#else +static inline void update_entity_load_avg(struct sched_entity *se, + int update_cfs_rq) {} +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se, + int wakeup) {} +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se, + int sleep) {} +static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, + int force_update) {} +#endif static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -1096,9 +1499,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - update_cfs_load(cfs_rq, 0); account_entity_enqueue(cfs_rq, se); - update_cfs_shares(cfs_rq); + enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); @@ -1190,9 +1592,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); - se->on_rq = 0; - update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); + dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); /* * Normalize the entity after updating the min_vruntime because the @@ -1206,7 +1607,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) return_cfs_rq_runtime(cfs_rq); update_min_vruntime(cfs_rq); - update_cfs_shares(cfs_rq); + se->on_rq = 0; } /* @@ -1261,6 +1662,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); + update_entity_load_avg(se, 1); } update_stats_curr_start(cfs_rq, se); @@ -1340,6 +1742,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); + /* in !on_rq case, update occurred at dequeue */ + update_entity_load_avg(prev, 1); } cfs_rq->curr = NULL; } @@ -1353,9 +1757,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) update_curr(cfs_rq); /* - * Update share accounting for long-running entities. + * Ensure that runnable average is periodically updated. */ - update_entity_shares_tick(cfs_rq); + update_entity_load_avg(curr, 1); + update_cfs_rq_blocked_load(cfs_rq, 1); #ifdef CONFIG_SCHED_HRTICK /* @@ -1448,6 +1853,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) return &tg->cfs_bandwidth; } +/* rq->task_clock normalized against any time this cfs_rq has spent throttled */ +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) +{ + if (unlikely(cfs_rq->throttle_count)) + return cfs_rq->throttled_clock_task; + + return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time; +} + /* returns 0 on failure to allocate runtime */ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { @@ -1592,14 +2006,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttle_count--; #ifdef CONFIG_SMP if (!cfs_rq->throttle_count) { - u64 delta = rq->clock_task - cfs_rq->load_stamp; - - /* leaving throttled state, advance shares averaging windows */ - cfs_rq->load_stamp += delta; - cfs_rq->load_last += delta; - - /* update entity weight now that we are on_rq again */ - update_cfs_shares(cfs_rq); + /* adjust cfs_rq_clock_task() */ + cfs_rq->throttled_clock_task_time += rq->clock_task - + cfs_rq->throttled_clock_task; } #endif @@ -1611,9 +2020,9 @@ static int tg_throttle_down(struct task_group *tg, void *data) struct rq *rq = data; struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - /* group is entering throttled state, record last load */ + /* group is entering throttled state, stop time */ if (!cfs_rq->throttle_count) - update_cfs_load(cfs_rq, 0); + cfs_rq->throttled_clock_task = rq->clock_task; cfs_rq->throttle_count++; return 0; @@ -1628,7 +2037,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; - /* account load preceding throttle */ + /* freeze hierarchy runnable averages while throttled */ rcu_read_lock(); walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); @@ -1652,7 +2061,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) rq->nr_running -= task_delta; cfs_rq->throttled = 1; - cfs_rq->throttled_timestamp = rq->clock; + cfs_rq->throttled_clock = rq->clock; raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); raw_spin_unlock(&cfs_b->lock); @@ -1670,10 +2079,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 0; raw_spin_lock(&cfs_b->lock); - cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; + cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock; list_del_rcu(&cfs_rq->throttled_list); raw_spin_unlock(&cfs_b->lock); - cfs_rq->throttled_timestamp = 0; update_rq_clock(rq); /* update hierarchical throttle state */ @@ -2052,7 +2460,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) hrtimer_cancel(&cfs_b->slack_timer); } -void unthrottle_offline_cfs_rqs(struct rq *rq) +static void unthrottle_offline_cfs_rqs(struct rq *rq) { struct cfs_rq *cfs_rq; @@ -2073,8 +2481,13 @@ void unthrottle_offline_cfs_rqs(struct rq *rq) } #else /* CONFIG_CFS_BANDWIDTH */ -static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) +{ + return rq_of(cfs_rq)->clock_task; +} + +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, + unsigned long delta_exec) {} static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -2106,7 +2519,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) return NULL; } static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} -void unthrottle_offline_cfs_rqs(struct rq *rq) {} +static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} #endif /* CONFIG_CFS_BANDWIDTH */ @@ -2207,12 +2620,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq); + update_entity_load_avg(se, 1); + update_cfs_rq_blocked_load(cfs_rq, 0); } - if (!se) + if (!se) { + update_rq_runnable_avg(rq, rq->nr_running); inc_nr_running(rq); + } hrtick_update(rq); } @@ -2266,12 +2681,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq); + update_entity_load_avg(se, 1); + update_cfs_rq_blocked_load(cfs_rq, 0); } - if (!se) + if (!se) { dec_nr_running(rq); + update_rq_runnable_avg(rq, 1); + } hrtick_update(rq); } @@ -2637,6 +3054,8 @@ static int select_idle_sibling(struct task_struct *p, int target) int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); struct sched_domain *sd; + struct sched_group *sg; + int i; /* * If the task is going to be woken-up on this cpu and if it is @@ -2653,20 +3072,175 @@ static int select_idle_sibling(struct task_struct *p, int target) return prev_cpu; /* - * Otherwise, check assigned siblings to find an elegible idle cpu. + * Otherwise, iterate the domains and find an elegible idle cpu. */ sd = rcu_dereference(per_cpu(sd_llc, target)); - for_each_lower_domain(sd) { - if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p))) - continue; - if (idle_cpu(sd->idle_buddy)) - return sd->idle_buddy; - } + sg = sd->groups; + do { + if (!cpumask_intersects(sched_group_cpus(sg), + tsk_cpus_allowed(p))) + goto next; + for_each_cpu(i, sched_group_cpus(sg)) { + if (!idle_cpu(i)) + goto next; + } + + target = cpumask_first_and(sched_group_cpus(sg), + tsk_cpus_allowed(p)); + goto done; +next: + sg = sg->next; + } while (sg != sd->groups); + } +done: return target; } +#ifdef CONFIG_SCHED_HMP +/* + * Heterogenous multiprocessor (HMP) optimizations + * + * The cpu types are distinguished using a list of hmp_domains + * which each represent one cpu type using a cpumask. + * The list is assumed ordered by compute capacity with the + * fastest domain first. + */ +DEFINE_PER_CPU(struct hmp_domain *, hmp_cpu_domain); + +extern void __init arch_get_hmp_domains(struct list_head *hmp_domains_list); + +/* Setup hmp_domains */ +static int __init hmp_cpu_mask_setup(void) +{ + char buf[64]; + struct hmp_domain *domain; + struct list_head *pos; + int dc, cpu; + + pr_debug("Initializing HMP scheduler:\n"); + + /* Initialize hmp_domains using platform code */ + arch_get_hmp_domains(&hmp_domains); + if (list_empty(&hmp_domains)) { + pr_debug("HMP domain list is empty!\n"); + return 0; + } + + /* Print hmp_domains */ + dc = 0; + list_for_each(pos, &hmp_domains) { + domain = list_entry(pos, struct hmp_domain, hmp_domains); + cpulist_scnprintf(buf, 64, &domain->cpus); + pr_debug(" HMP domain %d: %s\n", dc, buf); + + for_each_cpu_mask(cpu, domain->cpus) { + per_cpu(hmp_cpu_domain, cpu) = domain; + } + dc++; + } + + return 1; +} + +/* + * Migration thresholds should be in the range [0..1023] + * hmp_up_threshold: min. load required for migrating tasks to a faster cpu + * hmp_down_threshold: max. load allowed for tasks migrating to a slower cpu + * The default values (512, 256) offer good responsiveness, but may need + * tweaking suit particular needs. + * + * hmp_up_prio: Only up migrate task with high priority (<hmp_up_prio) + * hmp_next_up_threshold: Delay before next up migration (1024 ~= 1 ms) + * hmp_next_down_threshold: Delay before next down migration (1024 ~= 1 ms) + */ +unsigned int hmp_up_threshold = 512; +unsigned int hmp_down_threshold = 256; +#ifdef CONFIG_SCHED_HMP_PRIO_FILTER +unsigned int hmp_up_prio = NICE_TO_PRIO(CONFIG_SCHED_HMP_PRIO_FILTER_VAL); +#endif +unsigned int hmp_next_up_threshold = 4096; +unsigned int hmp_next_down_threshold = 4096; + +static unsigned int hmp_up_migration(int cpu, struct sched_entity *se); +static unsigned int hmp_down_migration(int cpu, struct sched_entity *se); + +/* Check if cpu is in fastest hmp_domain */ +static inline unsigned int hmp_cpu_is_fastest(int cpu) +{ + struct list_head *pos; + + pos = &hmp_cpu_domain(cpu)->hmp_domains; + return pos == hmp_domains.next; +} + +/* Check if cpu is in slowest hmp_domain */ +static inline unsigned int hmp_cpu_is_slowest(int cpu) +{ + struct list_head *pos; + + pos = &hmp_cpu_domain(cpu)->hmp_domains; + return list_is_last(pos, &hmp_domains); +} + +/* Next (slower) hmp_domain relative to cpu */ +static inline struct hmp_domain *hmp_slower_domain(int cpu) +{ + struct list_head *pos; + + pos = &hmp_cpu_domain(cpu)->hmp_domains; + return list_entry(pos->next, struct hmp_domain, hmp_domains); +} + +/* Previous (faster) hmp_domain relative to cpu */ +static inline struct hmp_domain *hmp_faster_domain(int cpu) +{ + struct list_head *pos; + + pos = &hmp_cpu_domain(cpu)->hmp_domains; + return list_entry(pos->prev, struct hmp_domain, hmp_domains); +} + +/* + * Selects a cpu in previous (faster) hmp_domain + * Note that cpumask_any_and() returns the first cpu in the cpumask + */ +static inline unsigned int hmp_select_faster_cpu(struct task_struct *tsk, + int cpu) +{ + return cpumask_any_and(&hmp_faster_domain(cpu)->cpus, + tsk_cpus_allowed(tsk)); +} + +/* + * Selects a cpu in next (slower) hmp_domain + * Note that cpumask_any_and() returns the first cpu in the cpumask + */ +static inline unsigned int hmp_select_slower_cpu(struct task_struct *tsk, + int cpu) +{ + return cpumask_any_and(&hmp_slower_domain(cpu)->cpus, + tsk_cpus_allowed(tsk)); +} + +static inline void hmp_next_up_delay(struct sched_entity *se, int cpu) +{ + struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; + + se->avg.hmp_last_up_migration = cfs_rq_clock_task(cfs_rq); + se->avg.hmp_last_down_migration = 0; +} + +static inline void hmp_next_down_delay(struct sched_entity *se, int cpu) +{ + struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; + + se->avg.hmp_last_down_migration = cfs_rq_clock_task(cfs_rq); + se->avg.hmp_last_up_migration = 0; +} +#endif /* CONFIG_SCHED_HMP */ + /* * sched_balance_self: balance the current task (running on cpu) in domains * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and @@ -2793,8 +3367,57 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) unlock: rcu_read_unlock(); +#ifdef CONFIG_SCHED_HMP + if (hmp_up_migration(prev_cpu, &p->se)) { + new_cpu = hmp_select_faster_cpu(p, prev_cpu); + hmp_next_up_delay(&p->se, new_cpu); + trace_sched_hmp_migrate(p, new_cpu, 0); + return new_cpu; + } + if (hmp_down_migration(prev_cpu, &p->se)) { + new_cpu = hmp_select_slower_cpu(p, prev_cpu); + hmp_next_down_delay(&p->se, new_cpu); + trace_sched_hmp_migrate(p, new_cpu, 0); + return new_cpu; + } + /* Make sure that the task stays in its previous hmp domain */ + if (!cpumask_test_cpu(new_cpu, &hmp_cpu_domain(prev_cpu)->cpus)) + return prev_cpu; +#endif + return new_cpu; } + +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */ +#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * Called immediately before a task is migrated to a new cpu; task_cpu(p) and + * cfs_rq_of(p) references at time of call are still valid and identify the + * previous cpu. However, the caller only guarantees p->pi_lock is held; no + * other assumptions, including the state of rq->lock, should be made. + */ +static void +migrate_task_rq_fair(struct task_struct *p, int next_cpu) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* + * Load tracking: accumulate removed load so that it can be processed + * when we next update owning cfs_rq under rq->lock. Tasks contribute + * to blocked load iff they have a positive decay-count. It can never + * be negative here since on-rq tasks have decay-count == 0. + */ + if (se->avg.decay_count) { + se->avg.decay_count = -__synchronize_entity_decay(se); + atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); + } +} +#endif #endif /* CONFIG_SMP */ static unsigned long @@ -3069,6 +3692,9 @@ struct lb_env { int new_dst_cpu; enum cpu_idle_type idle; long imbalance; + /* The set of CPUs under consideration for load-balancing */ + struct cpumask *cpus; + unsigned int flags; unsigned int loop; @@ -3171,7 +3797,6 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 1) task is cache cold, or * 2) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); if (!tsk_cache_hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { @@ -3311,52 +3936,56 @@ next: /* * update tg->load_weight by folding this cpu's load_avg */ -static int update_shares_cpu(struct task_group *tg, int cpu) +static void __update_blocked_averages_cpu(struct task_group *tg, int cpu) { - struct cfs_rq *cfs_rq; - unsigned long flags; - struct rq *rq; - - if (!tg->se[cpu]) - return 0; - - rq = cpu_rq(cpu); - cfs_rq = tg->cfs_rq[cpu]; + struct sched_entity *se = tg->se[cpu]; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; - raw_spin_lock_irqsave(&rq->lock, flags); + /* throttled entities do not contribute to load */ + if (throttled_hierarchy(cfs_rq)) + return; - update_rq_clock(rq); - update_cfs_load(cfs_rq, 1); + update_cfs_rq_blocked_load(cfs_rq, 1); - /* - * We need to update shares after updating tg->load_weight in - * order to adjust the weight of groups with long running tasks. - */ - update_cfs_shares(cfs_rq); - - raw_spin_unlock_irqrestore(&rq->lock, flags); - - return 0; + if (se) { + update_entity_load_avg(se, 1); + /* + * We can pivot on the runnable average decaying to zero for + * list removal since the parent average will always be >= + * child. + */ + if (se->avg.runnable_avg_sum) + update_cfs_shares(cfs_rq); + else + list_del_leaf_cfs_rq(cfs_rq); + } else { + struct rq *rq = rq_of(cfs_rq); + update_rq_runnable_avg(rq, rq->nr_running); + } } -static void update_shares(int cpu) +static void update_blocked_averages(int cpu) { - struct cfs_rq *cfs_rq; struct rq *rq = cpu_rq(cpu); + struct cfs_rq *cfs_rq; + unsigned long flags; - rcu_read_lock(); + raw_spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); /* * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ for_each_leaf_cfs_rq(rq, cfs_rq) { - /* throttled entities do not contribute to load */ - if (throttled_hierarchy(cfs_rq)) - continue; - - update_shares_cpu(cfs_rq->tg, cpu); + /* + * Note: We may want to consider periodically releasing + * rq->lock about these updates so that creating many task + * groups does not result in continually extending hold time. + */ + __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu); } - rcu_read_unlock(); + + raw_spin_unlock_irqrestore(&rq->lock, flags); } /* @@ -3384,6 +4013,14 @@ static int tg_load_down(struct task_group *tg, void *data) static void update_h_load(long cpu) { + struct rq *rq = cpu_rq(cpu); + unsigned long now = jiffies; + + if (rq->h_load_throttle == now) + return; + + rq->h_load_throttle = now; + rcu_read_lock(); walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); rcu_read_unlock(); @@ -3400,7 +4037,7 @@ static unsigned long task_h_load(struct task_struct *p) return load; } #else -static inline void update_shares(int cpu) +static inline void update_blocked_averages(int cpu) { } @@ -3647,14 +4284,12 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * @group: sched_group whose statistics are to be updated. * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. - * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sgs: variable to hold the statistics for this group. */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, - int local_group, const struct cpumask *cpus, - int *balance, struct sg_lb_stats *sgs) + int local_group, int *balance, struct sg_lb_stats *sgs) { unsigned long nr_running, max_nr_running, min_nr_running; unsigned long load, max_cpu_load, min_cpu_load; @@ -3671,7 +4306,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, max_nr_running = 0; min_nr_running = ~0UL; - for_each_cpu_and(i, sched_group_cpus(group), cpus) { + for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { struct rq *rq = cpu_rq(i); nr_running = rq->nr_running; @@ -3795,13 +4430,11 @@ static bool update_sd_pick_busiest(struct lb_env *env, /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. - * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. */ static inline void update_sd_lb_stats(struct lb_env *env, - const struct cpumask *cpus, - int *balance, struct sd_lb_stats *sds) + int *balance, struct sd_lb_stats *sds) { struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; @@ -3818,8 +4451,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(env, sg, load_idx, local_group, - cpus, balance, &sgs); + update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs); if (local_group && !(*balance)) return; @@ -4055,7 +4687,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * to restore balance. * * @env: The load balancing environment. - * @cpus: The set of CPUs under consideration for load-balancing. * @balance: Pointer to a variable indicating if this_cpu * is the appropriate cpu to perform load balancing at this_level. * @@ -4065,7 +4696,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * put to idle by rebalancing its tasks onto our group. */ static struct sched_group * -find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance) +find_busiest_group(struct lb_env *env, int *balance) { struct sd_lb_stats sds; @@ -4075,7 +4706,7 @@ find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance) * Compute the various statistics relavent for load balancing at * this level. */ - update_sd_lb_stats(env, cpus, balance, &sds); + update_sd_lb_stats(env, balance, &sds); /* * this_cpu is not the appropriate cpu to perform load balancing at @@ -4155,8 +4786,7 @@ ret: * find_busiest_queue - find the busiest runqueue among the cpus in group. */ static struct rq *find_busiest_queue(struct lb_env *env, - struct sched_group *group, - const struct cpumask *cpus) + struct sched_group *group) { struct rq *busiest = NULL, *rq; unsigned long max_load = 0; @@ -4171,7 +4801,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (!capacity) capacity = fix_small_capacity(env->sd, group); - if (!cpumask_test_cpu(i, cpus)) + if (!cpumask_test_cpu(i, env->cpus)) continue; rq = cpu_rq(i); @@ -4252,6 +4882,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .dst_grpmask = sched_group_cpus(sd->groups), .idle = idle, .loop_break = sched_nr_migrate_break, + .cpus = cpus, }; cpumask_copy(cpus, cpu_active_mask); @@ -4260,7 +4891,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, schedstat_inc(sd, lb_count[idle]); redo: - group = find_busiest_group(&env, cpus, balance); + group = find_busiest_group(&env, balance); if (*balance == 0) goto out_balanced; @@ -4270,7 +4901,7 @@ redo: goto out_balanced; } - busiest = find_busiest_queue(&env, group, cpus); + busiest = find_busiest_queue(&env, group); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; @@ -4294,11 +4925,10 @@ redo: env.src_rq = busiest; env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); + update_h_load(env.src_cpu); more_balance: local_irq_save(flags); double_rq_lock(this_rq, busiest); - if (!env.loop) - update_h_load(env.src_cpu); /* * cur_ld_moved - load moved in current iteration @@ -4468,12 +5098,14 @@ void idle_balance(int this_cpu, struct rq *this_rq) if (this_rq->avg_idle < sysctl_sched_migration_cost) return; + update_rq_runnable_avg(this_rq, 1); + /* * Drop the rq->lock, but keep IRQ/preempt disabled. */ raw_spin_unlock(&this_rq->lock); - update_shares(this_cpu); + update_blocked_averages(this_cpu); rcu_read_lock(); for_each_domain(this_cpu, sd) { unsigned long interval; @@ -4733,7 +5365,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) int update_next_balance = 0; int need_serialize; - update_shares(cpu); + update_blocked_averages(cpu); rcu_read_lock(); for_each_domain(cpu, sd) { @@ -4819,14 +5451,15 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) if (need_resched()) break; - raw_spin_lock_irq(&this_rq->lock); - update_rq_clock(this_rq); - update_idle_cpu_load(this_rq); - raw_spin_unlock_irq(&this_rq->lock); + rq = cpu_rq(balance_cpu); + + raw_spin_lock_irq(&rq->lock); + update_rq_clock(rq); + update_idle_cpu_load(rq); + raw_spin_unlock_irq(&rq->lock); rebalance_domains(balance_cpu, CPU_IDLE); - rq = cpu_rq(balance_cpu); if (time_after(this_rq->next_balance, rq->next_balance)) this_rq->next_balance = rq->next_balance; } @@ -4901,6 +5534,264 @@ need_kick: static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } #endif +#ifdef CONFIG_SCHED_HMP +/* Check if task should migrate to a faster cpu */ +static unsigned int hmp_up_migration(int cpu, struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; + u64 now; + + if (hmp_cpu_is_fastest(cpu)) + return 0; + +#ifdef CONFIG_SCHED_HMP_PRIO_FILTER + /* Filter by task priority */ + if (p->prio >= hmp_up_prio) + return 0; +#endif + + /* Let the task load settle before doing another up migration */ + now = cfs_rq_clock_task(cfs_rq); + if (((now - se->avg.hmp_last_up_migration) >> 10) + < hmp_next_up_threshold) + return 0; + + if (cpumask_intersects(&hmp_faster_domain(cpu)->cpus, + tsk_cpus_allowed(p)) + && se->avg.load_avg_ratio > hmp_up_threshold) { + return 1; + } + return 0; +} + +/* Check if task should migrate to a slower cpu */ +static unsigned int hmp_down_migration(int cpu, struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; + u64 now; + + if (hmp_cpu_is_slowest(cpu)) + return 0; + +#ifdef CONFIG_SCHED_HMP_PRIO_FILTER + /* Filter by task priority */ + if (p->prio >= hmp_up_prio) + return 1; +#endif + + /* Let the task load settle before doing another down migration */ + now = cfs_rq_clock_task(cfs_rq); + if (((now - se->avg.hmp_last_down_migration) >> 10) + < hmp_next_down_threshold) + return 0; + + if (cpumask_intersects(&hmp_slower_domain(cpu)->cpus, + tsk_cpus_allowed(p)) + && se->avg.load_avg_ratio < hmp_down_threshold) { + return 1; + } + return 0; +} + +/* + * hmp_can_migrate_task - may task p from runqueue rq be migrated to this_cpu? + * Ideally this function should be merged with can_migrate_task() to avoid + * redundant code. + */ +static int hmp_can_migrate_task(struct task_struct *p, struct lb_env *env) +{ + int tsk_cache_hot = 0; + + /* + * We do not migrate tasks that are: + * 1) running (obviously), or + * 2) cannot be migrated to this CPU due to cpus_allowed + */ + if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { + schedstat_inc(p, se.statistics.nr_failed_migrations_affine); + return 0; + } + env->flags &= ~LBF_ALL_PINNED; + + if (task_running(env->src_rq, p)) { + schedstat_inc(p, se.statistics.nr_failed_migrations_running); + return 0; + } + + /* + * Aggressive migration if: + * 1) task is cache cold, or + * 2) too many balance attempts have failed. + */ + + tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); + if (!tsk_cache_hot || + env->sd->nr_balance_failed > env->sd->cache_nice_tries) { +#ifdef CONFIG_SCHEDSTATS + if (tsk_cache_hot) { + schedstat_inc(env->sd, lb_hot_gained[env->idle]); + schedstat_inc(p, se.statistics.nr_forced_migrations); + } +#endif + return 1; + } + + return 1; +} + +/* + * move_specific_task tries to move a specific task. + * Returns 1 if successful and 0 otherwise. + * Called with both runqueues locked. + */ +static int move_specific_task(struct lb_env *env, struct task_struct *pm) +{ + struct task_struct *p, *n; + + list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { + if (throttled_lb_pair(task_group(p), env->src_rq->cpu, + env->dst_cpu)) + continue; + + if (!hmp_can_migrate_task(p, env)) + continue; + /* Check if we found the right task */ + if (p != pm) + continue; + + move_task(p, env); + /* + * Right now, this is only the third place move_task() + * is called, so we can safely collect move_task() + * stats here rather than inside move_task(). + */ + schedstat_inc(env->sd, lb_gained[env->idle]); + return 1; + } + return 0; +} + +/* + * hmp_active_task_migration_cpu_stop is run by cpu stopper and used to + * migrate a specific task from one runqueue to another. + * hmp_force_up_migration uses this to push a currently running task + * off a runqueue. + * Based on active_load_balance_stop_cpu and can potentially be merged. + */ +static int hmp_active_task_migration_cpu_stop(void *data) +{ + struct rq *busiest_rq = data; + struct task_struct *p = busiest_rq->migrate_task; + int busiest_cpu = cpu_of(busiest_rq); + int target_cpu = busiest_rq->push_cpu; + struct rq *target_rq = cpu_rq(target_cpu); + struct sched_domain *sd; + + raw_spin_lock_irq(&busiest_rq->lock); + /* make sure the requested cpu hasn't gone down in the meantime */ + if (unlikely(busiest_cpu != smp_processor_id() || + !busiest_rq->active_balance)) { + goto out_unlock; + } + /* Is there any task to move? */ + if (busiest_rq->nr_running <= 1) + goto out_unlock; + /* Task has migrated meanwhile, abort forced migration */ + if (task_rq(p) != busiest_rq) + goto out_unlock; + /* + * This condition is "impossible", if it occurs + * we need to fix it. Originally reported by + * Bjorn Helgaas on a 128-cpu setup. + */ + BUG_ON(busiest_rq == target_rq); + + /* move a task from busiest_rq to target_rq */ + double_lock_balance(busiest_rq, target_rq); + + /* Search for an sd spanning us and the target CPU. */ + rcu_read_lock(); + for_each_domain(target_cpu, sd) { + if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) + break; + } + + if (likely(sd)) { + struct lb_env env = { + .sd = sd, + .dst_cpu = target_cpu, + .dst_rq = target_rq, + .src_cpu = busiest_rq->cpu, + .src_rq = busiest_rq, + .idle = CPU_IDLE, + }; + + schedstat_inc(sd, alb_count); + + if (move_specific_task(&env, p)) + schedstat_inc(sd, alb_pushed); + else + schedstat_inc(sd, alb_failed); + } + rcu_read_unlock(); + double_unlock_balance(busiest_rq, target_rq); +out_unlock: + busiest_rq->active_balance = 0; + raw_spin_unlock_irq(&busiest_rq->lock); + return 0; +} + +static DEFINE_SPINLOCK(hmp_force_migration); + +/* + * hmp_force_up_migration checks runqueues for tasks that need to + * be actively migrated to a faster cpu. + */ +static void hmp_force_up_migration(int this_cpu) +{ + int cpu; + struct sched_entity *curr; + struct rq *target; + unsigned long flags; + unsigned int force; + struct task_struct *p; + + if (!spin_trylock(&hmp_force_migration)) + return; + for_each_online_cpu(cpu) { + force = 0; + target = cpu_rq(cpu); + raw_spin_lock_irqsave(&target->lock, flags); + curr = target->cfs.curr; + if (!curr || !entity_is_task(curr)) { + raw_spin_unlock_irqrestore(&target->lock, flags); + continue; + } + p = task_of(curr); + if (hmp_up_migration(cpu, curr)) { + if (!target->active_balance) { + target->active_balance = 1; + target->push_cpu = hmp_select_faster_cpu(p, cpu); + target->migrate_task = p; + force = 1; + trace_sched_hmp_migrate(p, target->push_cpu, 1); + hmp_next_up_delay(&p->se, target->push_cpu); + } + } + raw_spin_unlock_irqrestore(&target->lock, flags); + if (force) + stop_one_cpu_nowait(cpu_of(target), + hmp_active_task_migration_cpu_stop, + target, &target->active_balance_work); + } + spin_unlock(&hmp_force_migration); +} +#else +static void hmp_force_up_migration(int this_cpu) { } +#endif /* CONFIG_SCHED_HMP */ + /* * run_rebalance_domains is triggered when needed from the scheduler tick. * Also triggered for nohz idle balancing (with nohz_balancing_kick set). @@ -4912,6 +5803,8 @@ static void run_rebalance_domains(struct softirq_action *h) enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE; + hmp_force_up_migration(this_cpu); + rebalance_domains(this_cpu, idle); /* @@ -4950,6 +5843,9 @@ static void rq_online_fair(struct rq *rq) static void rq_offline_fair(struct rq *rq) { update_sysctl(); + + /* Ensure any throttled groups are reachable by pick_next_task */ + unthrottle_offline_cfs_rqs(rq); } #endif /* CONFIG_SMP */ @@ -4966,6 +5862,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) cfs_rq = cfs_rq_of(se); entity_tick(cfs_rq, se, queued); } + + update_rq_runnable_avg(rq, 1); } /* @@ -5058,6 +5956,20 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) place_entity(cfs_rq, se, 0); se->vruntime -= cfs_rq->min_vruntime; } + +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) + /* + * Remove our load from contribution when we leave sched_fair + * and ensure we don't carry in an old decay_count if we + * switch back. + */ + if (p->se.avg.decay_count) { + struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); + __synchronize_entity_decay(&p->se); + subtract_blocked_load_contrib(cfs_rq, + p->se.avg.load_avg_contrib); + } +#endif } /* @@ -5104,11 +6016,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) + atomic64_set(&cfs_rq->decay_counter, 1); + atomic64_set(&cfs_rq->removed_load, 0); +#endif } #ifdef CONFIG_FAIR_GROUP_SCHED static void task_move_group_fair(struct task_struct *p, int on_rq) { + struct cfs_rq *cfs_rq; /* * If the task was not on the rq at the time of this cgroup movement * it must have been asleep, sleeping tasks keep their ->vruntime @@ -5140,8 +6057,19 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) if (!on_rq) p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; set_task_rq(p, task_cpu(p)); - if (!on_rq) - p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; + if (!on_rq) { + cfs_rq = cfs_rq_of(&p->se); + p->se.vruntime += cfs_rq->min_vruntime; +#ifdef CONFIG_SMP + /* + * migrate_task_rq_fair() will have removed our previous + * contribution, but we must synchronize for ongoing future + * decay. + */ + p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); + cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; +#endif + } } void free_fair_sched_group(struct task_group *tg) @@ -5226,10 +6154,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, cfs_rq->tg = tg; cfs_rq->rq = rq; -#ifdef CONFIG_SMP - /* allow initial update_cfs_load() to truncate */ - cfs_rq->load_stamp = 1; -#endif init_cfs_rq_runtime(cfs_rq); tg->cfs_rq[cpu] = cfs_rq; @@ -5276,8 +6200,11 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) se = tg->se[i]; /* Propagate contribution to hierarchy */ raw_spin_lock_irqsave(&rq->lock, flags); - for_each_sched_entity(se) + for_each_sched_entity(se) { update_cfs_shares(group_cfs_rq(se)); + /* update contribution to parent */ + update_entity_load_avg(se, 1); + } raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -5331,7 +6258,9 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_fair, - +#ifdef CONFIG_FAIR_GROUP_SCHED + .migrate_task_rq = migrate_task_rq_fair, +#endif .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, @@ -5375,6 +6304,10 @@ __init void init_sched_fair_class(void) zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); cpu_notifier(sched_ilb_notifier, 0); #endif + +#ifdef CONFIG_SCHED_HMP + hmp_cpu_mask_setup(); +#endif #endif /* SMP */ } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index de00a486c5c..d98ae909e32 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -42,7 +42,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) /* * Use arch dependent cpu power functions */ -SCHED_FEAT(ARCH_POWER, false) +SCHED_FEAT(ARCH_POWER, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 573e1ca0110..e0b7ba9c040 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -691,6 +691,7 @@ balanced: * runtime - in which case borrowing doesn't make sense. */ rt_rq->rt_runtime = RUNTIME_INF; + rt_rq->rt_throttled = 0; raw_spin_unlock(&rt_rq->rt_runtime_lock); raw_spin_unlock(&rt_b->rt_runtime_lock); } @@ -788,6 +789,19 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) const struct cpumask *span; span = sched_rt_period_mask(); +#ifdef CONFIG_RT_GROUP_SCHED + /* + * FIXME: isolated CPUs should really leave the root task group, + * whether they are isolcpus or were isolated via cpusets, lest + * the timer run on a CPU which does not service all runqueues, + * potentially leaving other CPUs indefinitely throttled. If + * isolation is really required, the user will turn the throttle + * off to kill the perturbations it causes anyway. Meanwhile, + * this maintains functionality for boot and/or troubleshooting. + */ + if (rt_b == &root_task_group.rt_bandwidth) + span = cpu_online_mask; +#endif for_each_cpu(i, span) { int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c35a1a7dd4d..1f7f16cd205 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -80,7 +80,7 @@ extern struct mutex sched_domains_mutex; struct cfs_rq; struct rt_rq; -static LIST_HEAD(task_groups); +extern struct list_head task_groups; struct cfs_bandwidth { #ifdef CONFIG_CFS_BANDWIDTH @@ -112,6 +112,8 @@ struct task_group { unsigned long shares; atomic_t load_weight; + atomic64_t load_avg; + atomic_t runnable_avg, usage_avg; #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -222,22 +224,29 @@ struct cfs_rq { unsigned int nr_spread_over; #endif +#ifdef CONFIG_SMP +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */ #ifdef CONFIG_FAIR_GROUP_SCHED - struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ - /* - * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in - * a hierarchy). Non-leaf lrqs hold other higher schedulable entities - * (like users, containers etc.) - * - * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This - * list is used during load balance. + * CFS Load tracking + * Under CFS, load is tracked on a per-entity basis and aggregated up. + * This allows for the description of both thread and group usage (in + * the FAIR_GROUP_SCHED case). */ - int on_list; - struct list_head leaf_cfs_rq_list; - struct task_group *tg; /* group that "owns" this runqueue */ + u64 runnable_load_avg, blocked_load_avg; + atomic64_t decay_counter, removed_load; + u64 last_decay; +#endif /* CONFIG_FAIR_GROUP_SCHED */ +/* These always depend on CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_FAIR_GROUP_SCHED + u32 tg_runnable_contrib, tg_usage_contrib; + u64 tg_load_contrib; +#endif /* CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_SMP /* * h_load = weight * f(tg) * @@ -245,26 +254,30 @@ struct cfs_rq { * this group. */ unsigned long h_load; +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_FAIR_GROUP_SCHED + struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ /* - * Maintaining per-cpu shares distribution for group scheduling + * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in + * a hierarchy). Non-leaf lrqs hold other higher schedulable entities + * (like users, containers etc.) * - * load_stamp is the last time we updated the load average - * load_last is the last time we updated the load average and saw load - * load_unacc_exec_time is currently unaccounted execution time + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This + * list is used during load balance. */ - u64 load_avg; - u64 load_period; - u64 load_stamp, load_last, load_unacc_exec_time; + int on_list; + struct list_head leaf_cfs_rq_list; + struct task_group *tg; /* group that "owns" this runqueue */ - unsigned long load_contribution; -#endif /* CONFIG_SMP */ #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; u64 runtime_expires; s64 runtime_remaining; - u64 throttled_timestamp; + u64 throttled_clock, throttled_clock_task; + u64 throttled_clock_task_time; int throttled, throttle_count; struct list_head throttled_list; #endif /* CONFIG_CFS_BANDWIDTH */ @@ -374,7 +387,11 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; -#endif +#ifdef CONFIG_SMP + unsigned long h_load_throttle; +#endif /* CONFIG_SMP */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ + #ifdef CONFIG_RT_GROUP_SCHED struct list_head leaf_rt_rq_list; #endif @@ -408,6 +425,9 @@ struct rq { int active_balance; int push_cpu; struct cpu_stop_work active_balance_work; +#ifdef CONFIG_SCHED_HMP + struct task_struct *migrate_task; +#endif /* cpu of this runqueue: */ int cpu; int online; @@ -463,6 +483,8 @@ struct rq { #ifdef CONFIG_SMP struct llist_head wake_list; #endif + + struct sched_avg avg; }; static inline int cpu_of(struct rq *rq) @@ -528,6 +550,12 @@ DECLARE_PER_CPU(int, sd_llc_id); extern int group_balance_cpu(struct sched_group *sg); +#ifdef CONFIG_SCHED_HMP +static LIST_HEAD(hmp_domains); +DECLARE_PER_CPU(struct hmp_domain *, hmp_cpu_domain); +#define hmp_cpu_domain(cpu) (per_cpu(hmp_cpu_domain, (cpu))) +#endif /* CONFIG_SCHED_HMP */ + #endif /* CONFIG_SMP */ #include "stats.h" @@ -1140,7 +1168,6 @@ extern void print_rt_stats(struct seq_file *m, int cpu); extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); -extern void unthrottle_offline_cfs_rqs(struct rq *rq); extern void account_cfs_bandwidth_used(int enabled, int was_enabled); diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 7b386e86fd2..da5eb5bed84 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -27,8 +27,10 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) { struct task_struct *stop = rq->stop; - if (stop && stop->on_rq) + if (stop && stop->on_rq) { + stop->se.exec_start = rq->clock_task; return stop; + } return NULL; } @@ -52,6 +54,21 @@ static void yield_task_stop(struct rq *rq) static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) { + struct task_struct *curr = rq->curr; + u64 delta_exec; + + delta_exec = rq->clock_task - curr->se.exec_start; + if (unlikely((s64)delta_exec < 0)) + delta_exec = 0; + + schedstat_set(curr->se.statistics.exec_max, + max(curr->se.statistics.exec_max, delta_exec)); + + curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + + curr->se.exec_start = rq->clock_task; + cpuacct_charge(curr, delta_exec); } static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) @@ -60,6 +77,9 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) static void set_curr_task_stop(struct rq *rq) { + struct task_struct *stop = rq->stop; + + stop->se.exec_start = rq->clock_task; } static void switched_to_stop(struct rq *rq, struct task_struct *p) diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 98f60c5caa1..0a49ee70054 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -1,14 +1,22 @@ /* * Common SMP CPU bringup/teardown functions */ +#include <linux/cpu.h> #include <linux/err.h> #include <linux/smp.h> #include <linux/init.h> +#include <linux/list.h> +#include <linux/slab.h> #include <linux/sched.h> +#include <linux/export.h> #include <linux/percpu.h> +#include <linux/kthread.h> +#include <linux/smpboot.h> #include "smpboot.h" +#ifdef CONFIG_SMP + #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD /* * For the hotplug case we keep the task structs around and reuse @@ -65,3 +73,228 @@ void __init idle_threads_init(void) } } #endif + +#endif /* #ifdef CONFIG_SMP */ + +static LIST_HEAD(hotplug_threads); +static DEFINE_MUTEX(smpboot_threads_lock); + +struct smpboot_thread_data { + unsigned int cpu; + unsigned int status; + struct smp_hotplug_thread *ht; +}; + +enum { + HP_THREAD_NONE = 0, + HP_THREAD_ACTIVE, + HP_THREAD_PARKED, +}; + +/** + * smpboot_thread_fn - percpu hotplug thread loop function + * @void: thread data pointer + * + * Checks for thread stop and park conditions. Calls the necessary + * setup, cleanup, park and unpark functions for the registered + * thread. + * + * Returns 1 when the thread should exit, 0 otherwise. + */ +static int smpboot_thread_fn(void *data) +{ + struct smpboot_thread_data *td = data; + struct smp_hotplug_thread *ht = td->ht; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + preempt_disable(); + if (kthread_should_stop()) { + set_current_state(TASK_RUNNING); + preempt_enable(); + if (ht->cleanup) + ht->cleanup(td->cpu, cpu_online(td->cpu)); + kfree(td); + return 0; + } + + if (kthread_should_park()) { + __set_current_state(TASK_RUNNING); + preempt_enable(); + if (ht->park && td->status == HP_THREAD_ACTIVE) { + BUG_ON(td->cpu != smp_processor_id()); + ht->park(td->cpu); + td->status = HP_THREAD_PARKED; + } + kthread_parkme(); + /* We might have been woken for stop */ + continue; + } + + BUG_ON(td->cpu != smp_processor_id()); + + /* Check for state change setup */ + switch (td->status) { + case HP_THREAD_NONE: + preempt_enable(); + if (ht->setup) + ht->setup(td->cpu); + td->status = HP_THREAD_ACTIVE; + preempt_disable(); + break; + case HP_THREAD_PARKED: + preempt_enable(); + if (ht->unpark) + ht->unpark(td->cpu); + td->status = HP_THREAD_ACTIVE; + preempt_disable(); + break; + } + + if (!ht->thread_should_run(td->cpu)) { + preempt_enable(); + schedule(); + } else { + set_current_state(TASK_RUNNING); + preempt_enable(); + ht->thread_fn(td->cpu); + } + } +} + +static int +__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) +{ + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); + struct smpboot_thread_data *td; + + if (tsk) + return 0; + + td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu)); + if (!td) + return -ENOMEM; + td->cpu = cpu; + td->ht = ht; + + tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu, + ht->thread_comm); + if (IS_ERR(tsk)) { + kfree(td); + return PTR_ERR(tsk); + } + + get_task_struct(tsk); + *per_cpu_ptr(ht->store, cpu) = tsk; + return 0; +} + +int smpboot_create_threads(unsigned int cpu) +{ + struct smp_hotplug_thread *cur; + int ret = 0; + + mutex_lock(&smpboot_threads_lock); + list_for_each_entry(cur, &hotplug_threads, list) { + ret = __smpboot_create_thread(cur, cpu); + if (ret) + break; + } + mutex_unlock(&smpboot_threads_lock); + return ret; +} + +static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cpu) +{ + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); + + kthread_unpark(tsk); +} + +void smpboot_unpark_threads(unsigned int cpu) +{ + struct smp_hotplug_thread *cur; + + mutex_lock(&smpboot_threads_lock); + list_for_each_entry(cur, &hotplug_threads, list) + smpboot_unpark_thread(cur, cpu); + mutex_unlock(&smpboot_threads_lock); +} + +static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu) +{ + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); + + if (tsk) + kthread_park(tsk); +} + +void smpboot_park_threads(unsigned int cpu) +{ + struct smp_hotplug_thread *cur; + + mutex_lock(&smpboot_threads_lock); + list_for_each_entry_reverse(cur, &hotplug_threads, list) + smpboot_park_thread(cur, cpu); + mutex_unlock(&smpboot_threads_lock); +} + +static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) +{ + unsigned int cpu; + + /* We need to destroy also the parked threads of offline cpus */ + for_each_possible_cpu(cpu) { + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); + + if (tsk) { + kthread_stop(tsk); + put_task_struct(tsk); + *per_cpu_ptr(ht->store, cpu) = NULL; + } + } +} + +/** + * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug + * @plug_thread: Hotplug thread descriptor + * + * Creates and starts the threads on all online cpus. + */ +int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) +{ + unsigned int cpu; + int ret = 0; + + mutex_lock(&smpboot_threads_lock); + for_each_online_cpu(cpu) { + ret = __smpboot_create_thread(plug_thread, cpu); + if (ret) { + smpboot_destroy_threads(plug_thread); + goto out; + } + smpboot_unpark_thread(plug_thread, cpu); + } + list_add(&plug_thread->list, &hotplug_threads); +out: + mutex_unlock(&smpboot_threads_lock); + return ret; +} +EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); + +/** + * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug + * @plug_thread: Hotplug thread descriptor + * + * Stops all threads on all possible cpus. + */ +void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) +{ + get_online_cpus(); + mutex_lock(&smpboot_threads_lock); + list_del(&plug_thread->list); + smpboot_destroy_threads(plug_thread); + mutex_unlock(&smpboot_threads_lock); + put_online_cpus(); +} +EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); diff --git a/kernel/smpboot.h b/kernel/smpboot.h index 6ef9433e1c7..72415a0eb95 100644 --- a/kernel/smpboot.h +++ b/kernel/smpboot.h @@ -13,4 +13,8 @@ static inline void idle_thread_set_boot_cpu(void) { } static inline void idle_threads_init(void) { } #endif +int smpboot_create_threads(unsigned int cpu); +void smpboot_park_threads(unsigned int cpu); +void smpboot_unpark_threads(unsigned int cpu); + #endif diff --git a/kernel/softirq.c b/kernel/softirq.c index b73e681df09..e04e5b28366 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -23,6 +23,7 @@ #include <linux/rcupdate.h> #include <linux/ftrace.h> #include <linux/smp.h> +#include <linux/smpboot.h> #include <linux/tick.h> #define CREATE_TRACE_POINTS @@ -742,49 +743,22 @@ void __init softirq_init(void) open_softirq(HI_SOFTIRQ, tasklet_hi_action); } -static int run_ksoftirqd(void * __bind_cpu) +static int ksoftirqd_should_run(unsigned int cpu) { - set_current_state(TASK_INTERRUPTIBLE); - - while (!kthread_should_stop()) { - preempt_disable(); - if (!local_softirq_pending()) { - schedule_preempt_disabled(); - } - - __set_current_state(TASK_RUNNING); - - while (local_softirq_pending()) { - /* Preempt disable stops cpu going offline. - If already offline, we'll be on wrong CPU: - don't process */ - if (cpu_is_offline((long)__bind_cpu)) - goto wait_to_die; - local_irq_disable(); - if (local_softirq_pending()) - __do_softirq(); - local_irq_enable(); - sched_preempt_enable_no_resched(); - cond_resched(); - preempt_disable(); - rcu_note_context_switch((long)__bind_cpu); - } - preempt_enable(); - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); - return 0; + return local_softirq_pending(); +} -wait_to_die: - preempt_enable(); - /* Wait for kthread_stop */ - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - schedule(); - set_current_state(TASK_INTERRUPTIBLE); +static void run_ksoftirqd(unsigned int cpu) +{ + local_irq_disable(); + if (local_softirq_pending()) { + __do_softirq(); + rcu_note_context_switch(cpu); + local_irq_enable(); + cond_resched(); + return; } - __set_current_state(TASK_RUNNING); - return 0; + local_irq_enable(); } #ifdef CONFIG_HOTPLUG_CPU @@ -850,50 +824,17 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - int hotcpu = (unsigned long)hcpu; - struct task_struct *p; - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - p = kthread_create_on_node(run_ksoftirqd, - hcpu, - cpu_to_node(hotcpu), - "ksoftirqd/%d", hotcpu); - if (IS_ERR(p)) { - printk("ksoftirqd for %i failed\n", hotcpu); - return notifier_from_errno(PTR_ERR(p)); - } - kthread_bind(p, hotcpu); - per_cpu(ksoftirqd, hotcpu) = p; - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - wake_up_process(per_cpu(ksoftirqd, hotcpu)); - break; #ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - if (!per_cpu(ksoftirqd, hotcpu)) - break; - /* Unbind so it can run. Fall thru. */ - kthread_bind(per_cpu(ksoftirqd, hotcpu), - cpumask_any(cpu_online_mask)); case CPU_DEAD: case CPU_DEAD_FROZEN: { - static const struct sched_param param = { - .sched_priority = MAX_RT_PRIO-1 - }; - - p = per_cpu(ksoftirqd, hotcpu); - per_cpu(ksoftirqd, hotcpu) = NULL; - sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); - kthread_stop(p); + int hotcpu = (unsigned long)hcpu; + takeover_tasklets(hotcpu); break; } #endif /* CONFIG_HOTPLUG_CPU */ - } + } return NOTIFY_OK; } @@ -901,14 +842,19 @@ static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; +static struct smp_hotplug_thread softirq_threads = { + .store = &ksoftirqd, + .thread_should_run = ksoftirqd_should_run, + .thread_fn = run_ksoftirqd, + .thread_comm = "ksoftirqd/%u", +}; + static __init int spawn_ksoftirqd(void) { - void *cpu = (void *)(long)smp_processor_id(); - int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - - BUG_ON(err != NOTIFY_OK); - cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); + + BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); + return 0; } early_initcall(spawn_ksoftirqd); diff --git a/kernel/task_work.c b/kernel/task_work.c index 91d4e1742a0..d320d44903b 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -75,6 +75,7 @@ void task_work_run(void) p = q->next; q->func(q); q = p; + cond_resched(); } } } diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a470154e040..46da0537c10 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -37,7 +37,7 @@ * requested HZ value. It is also not recommended * for "tick-less" systems. */ -#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) +#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/SHIFTED_HZ)) /* Since jiffies uses a simple NSEC_PER_JIFFY multiplier * conversion, the .shift value could be zero. However diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index b7fbadc5c97..24174b4d669 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -28,7 +28,7 @@ DEFINE_SPINLOCK(ntp_lock); /* USER_HZ period (usecs): */ unsigned long tick_usec = TICK_USEC; -/* ACTHZ period (nsecs): */ +/* SHIFTED_HZ period (nsecs): */ unsigned long tick_nsec; static u64 tick_length; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 024540f97f7..3a9e5d5c109 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -573,6 +573,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) tick_do_update_jiffies64(now); update_cpu_load_nohz(); + calc_load_exit_idle(); touch_softlockup_watchdog(); /* * Cancel the scheduled timer and restore the tick diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f045cc50832..d3b91e75cec 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -65,14 +65,14 @@ struct timekeeper { * used instead. */ struct timespec wall_to_monotonic; - /* time spent in suspend */ - struct timespec total_sleep_time; - /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ - struct timespec raw_time; /* Offset clock monotonic -> clock realtime */ ktime_t offs_real; + /* time spent in suspend */ + struct timespec total_sleep_time; /* Offset clock monotonic -> clock boottime */ ktime_t offs_boot; + /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ + struct timespec raw_time; /* Seqlock for all timekeeper values */ seqlock_t lock; }; @@ -108,13 +108,39 @@ static struct timespec tk_xtime(struct timekeeper *tk) static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) { tk->xtime_sec = ts->tv_sec; - tk->xtime_nsec = ts->tv_nsec << tk->shift; + tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift; } static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) { tk->xtime_sec += ts->tv_sec; - tk->xtime_nsec += ts->tv_nsec << tk->shift; + tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift; + tk_normalize_xtime(tk); +} + +static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) +{ + struct timespec tmp; + + /* + * Verify consistency of: offset_real = -wall_to_monotonic + * before modifying anything + */ + set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec, + -tk->wall_to_monotonic.tv_nsec); + WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64); + tk->wall_to_monotonic = wtm; + set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); + tk->offs_real = timespec_to_ktime(tmp); +} + +static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) +{ + /* Verify consistency before modifying */ + WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64); + + tk->total_sleep_time = t; + tk->offs_boot = timespec_to_ktime(t); } /** @@ -217,14 +243,6 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) return nsec + arch_gettimeoffset(); } -static void update_rt_offset(struct timekeeper *tk) -{ - struct timespec tmp, *wtm = &tk->wall_to_monotonic; - - set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec); - tk->offs_real = timespec_to_ktime(tmp); -} - /* must hold write on timekeeper.lock */ static void timekeeping_update(struct timekeeper *tk, bool clearntp) { @@ -234,12 +252,10 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp) tk->ntp_error = 0; ntp_clear(); } - update_rt_offset(tk); xt = tk_xtime(tk); update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult); } - /** * timekeeping_forward_now - update clock to the current time * @@ -261,7 +277,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) tk->xtime_nsec += cycle_delta * tk->mult; /* If arch requires, add in gettimeoffset() */ - tk->xtime_nsec += arch_gettimeoffset() << tk->shift; + tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift; tk_normalize_xtime(tk); @@ -277,38 +293,39 @@ static void timekeeping_forward_now(struct timekeeper *tk) */ void getnstimeofday(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; unsigned long seq; s64 nsecs = 0; WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper.lock); + seq = read_seqbegin(&tk->lock); - ts->tv_sec = timekeeper.xtime_sec; - ts->tv_nsec = timekeeping_get_ns(&timekeeper); + ts->tv_sec = tk->xtime_sec; + nsecs = timekeeping_get_ns(tk); - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); + ts->tv_nsec = 0; timespec_add_ns(ts, nsecs); } EXPORT_SYMBOL(getnstimeofday); ktime_t ktime_get(void) { + struct timekeeper *tk = &timekeeper; unsigned int seq; s64 secs, nsecs; WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper.lock); - secs = timekeeper.xtime_sec + - timekeeper.wall_to_monotonic.tv_sec; - nsecs = timekeeping_get_ns(&timekeeper) + - timekeeper.wall_to_monotonic.tv_nsec; + seq = read_seqbegin(&tk->lock); + secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; + nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); /* * Use ktime_set/ktime_add_ns to create a proper ktime on * 32-bit architectures without CONFIG_KTIME_SCALAR. @@ -327,21 +344,24 @@ EXPORT_SYMBOL_GPL(ktime_get); */ void ktime_get_ts(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; struct timespec tomono; + s64 nsec; unsigned int seq; WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper.lock); - ts->tv_sec = timekeeper.xtime_sec; - ts->tv_nsec = timekeeping_get_ns(&timekeeper); - tomono = timekeeper.wall_to_monotonic; + seq = read_seqbegin(&tk->lock); + ts->tv_sec = tk->xtime_sec; + nsec = timekeeping_get_ns(tk); + tomono = tk->wall_to_monotonic; - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); - set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, - ts->tv_nsec + tomono.tv_nsec); + ts->tv_sec += tomono.tv_sec; + ts->tv_nsec = 0; + timespec_add_ns(ts, nsec + tomono.tv_nsec); } EXPORT_SYMBOL_GPL(ktime_get_ts); @@ -358,22 +378,23 @@ EXPORT_SYMBOL_GPL(ktime_get_ts); */ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) { + struct timekeeper *tk = &timekeeper; unsigned long seq; s64 nsecs_raw, nsecs_real; WARN_ON_ONCE(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper.lock); + seq = read_seqbegin(&tk->lock); - *ts_raw = timekeeper.raw_time; - ts_real->tv_sec = timekeeper.xtime_sec; + *ts_raw = tk->raw_time; + ts_real->tv_sec = tk->xtime_sec; ts_real->tv_nsec = 0; - nsecs_raw = timekeeping_get_ns_raw(&timekeeper); - nsecs_real = timekeeping_get_ns(&timekeeper); + nsecs_raw = timekeeping_get_ns_raw(tk); + nsecs_real = timekeeping_get_ns(tk); - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); timespec_add_ns(ts_raw, nsecs_raw); timespec_add_ns(ts_real, nsecs_real); @@ -406,28 +427,28 @@ EXPORT_SYMBOL(do_gettimeofday); */ int do_settimeofday(const struct timespec *tv) { + struct timekeeper *tk = &timekeeper; struct timespec ts_delta, xt; unsigned long flags; - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) + if (!timespec_valid_strict(tv)) return -EINVAL; - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); - timekeeping_forward_now(&timekeeper); + timekeeping_forward_now(tk); - xt = tk_xtime(&timekeeper); + xt = tk_xtime(tk); ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; - timekeeper.wall_to_monotonic = - timespec_sub(timekeeper.wall_to_monotonic, ts_delta); + tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta)); - tk_set_xtime(&timekeeper, tv); + tk_set_xtime(tk, tv); - timekeeping_update(&timekeeper, true); + timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper.lock, flags); + write_sequnlock_irqrestore(&tk->lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -436,7 +457,6 @@ int do_settimeofday(const struct timespec *tv) } EXPORT_SYMBOL(do_settimeofday); - /** * timekeeping_inject_offset - Adds or subtracts from the current time. * @tv: pointer to the timespec variable containing the offset @@ -445,28 +465,37 @@ EXPORT_SYMBOL(do_settimeofday); */ int timekeeping_inject_offset(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; unsigned long flags; + struct timespec tmp; + int ret = 0; if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); - timekeeping_forward_now(&timekeeper); + timekeeping_forward_now(tk); + /* Make sure the proposed value is valid */ + tmp = timespec_add(tk_xtime(tk), *ts); + if (!timespec_valid_strict(&tmp)) { + ret = -EINVAL; + goto error; + } - tk_xtime_add(&timekeeper, ts); - timekeeper.wall_to_monotonic = - timespec_sub(timekeeper.wall_to_monotonic, *ts); + tk_xtime_add(tk, ts); + tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); - timekeeping_update(&timekeeper, true); +error: /* even if we error out, we forwarded the time, so call update */ + timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper.lock, flags); + write_sequnlock_irqrestore(&tk->lock, flags); /* signal hrtimers about time change */ clock_was_set(); - return 0; + return ret; } EXPORT_SYMBOL(timekeeping_inject_offset); @@ -477,23 +506,24 @@ EXPORT_SYMBOL(timekeeping_inject_offset); */ static int change_clocksource(void *data) { + struct timekeeper *tk = &timekeeper; struct clocksource *new, *old; unsigned long flags; new = (struct clocksource *) data; - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); - timekeeping_forward_now(&timekeeper); + timekeeping_forward_now(tk); if (!new->enable || new->enable(new) == 0) { - old = timekeeper.clock; - tk_setup_internals(&timekeeper, new); + old = tk->clock; + tk_setup_internals(tk, new); if (old->disable) old->disable(old); } - timekeeping_update(&timekeeper, true); + timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper.lock, flags); + write_sequnlock_irqrestore(&tk->lock, flags); return 0; } @@ -507,7 +537,9 @@ static int change_clocksource(void *data) */ void timekeeping_notify(struct clocksource *clock) { - if (timekeeper.clock == clock) + struct timekeeper *tk = &timekeeper; + + if (tk->clock == clock) return; stop_machine(change_clocksource, clock, NULL); tick_clock_notify(); @@ -536,35 +568,36 @@ EXPORT_SYMBOL_GPL(ktime_get_real); */ void getrawmonotonic(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; unsigned long seq; s64 nsecs; do { - seq = read_seqbegin(&timekeeper.lock); - nsecs = timekeeping_get_ns_raw(&timekeeper); - *ts = timekeeper.raw_time; + seq = read_seqbegin(&tk->lock); + nsecs = timekeeping_get_ns_raw(tk); + *ts = tk->raw_time; - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); timespec_add_ns(ts, nsecs); } EXPORT_SYMBOL(getrawmonotonic); - /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres */ int timekeeping_valid_for_hres(void) { + struct timekeeper *tk = &timekeeper; unsigned long seq; int ret; do { - seq = read_seqbegin(&timekeeper.lock); + seq = read_seqbegin(&tk->lock); - ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; + ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); return ret; } @@ -574,15 +607,16 @@ int timekeeping_valid_for_hres(void) */ u64 timekeeping_max_deferment(void) { + struct timekeeper *tk = &timekeeper; unsigned long seq; u64 ret; do { - seq = read_seqbegin(&timekeeper.lock); + seq = read_seqbegin(&tk->lock); - ret = timekeeper.clock->max_idle_ns; + ret = tk->clock->max_idle_ns; - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); return ret; } @@ -622,46 +656,56 @@ void __attribute__((weak)) read_boot_clock(struct timespec *ts) */ void __init timekeeping_init(void) { + struct timekeeper *tk = &timekeeper; struct clocksource *clock; unsigned long flags; - struct timespec now, boot; + struct timespec now, boot, tmp; read_persistent_clock(&now); + if (!timespec_valid_strict(&now)) { + pr_warn("WARNING: Persistent clock returned invalid value!\n" + " Check your CMOS/BIOS settings.\n"); + now.tv_sec = 0; + now.tv_nsec = 0; + } + read_boot_clock(&boot); + if (!timespec_valid_strict(&boot)) { + pr_warn("WARNING: Boot clock returned invalid value!\n" + " Check your CMOS/BIOS settings.\n"); + boot.tv_sec = 0; + boot.tv_nsec = 0; + } - seqlock_init(&timekeeper.lock); + seqlock_init(&tk->lock); ntp_init(); - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); - tk_setup_internals(&timekeeper, clock); + tk_setup_internals(tk, clock); - tk_set_xtime(&timekeeper, &now); - timekeeper.raw_time.tv_sec = 0; - timekeeper.raw_time.tv_nsec = 0; + tk_set_xtime(tk, &now); + tk->raw_time.tv_sec = 0; + tk->raw_time.tv_nsec = 0; if (boot.tv_sec == 0 && boot.tv_nsec == 0) - boot = tk_xtime(&timekeeper); - - set_normalized_timespec(&timekeeper.wall_to_monotonic, - -boot.tv_sec, -boot.tv_nsec); - update_rt_offset(&timekeeper); - timekeeper.total_sleep_time.tv_sec = 0; - timekeeper.total_sleep_time.tv_nsec = 0; - write_sequnlock_irqrestore(&timekeeper.lock, flags); + boot = tk_xtime(tk); + + set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec); + tk_set_wall_to_mono(tk, tmp); + + tmp.tv_sec = 0; + tmp.tv_nsec = 0; + tk_set_sleep_time(tk, tmp); + + write_sequnlock_irqrestore(&tk->lock, flags); } /* time in seconds when suspend began */ static struct timespec timekeeping_suspend_time; -static void update_sleep_time(struct timespec t) -{ - timekeeper.total_sleep_time = t; - timekeeper.offs_boot = timespec_to_ktime(t); -} - /** * __timekeeping_inject_sleeptime - Internal function to add sleep interval * @delta: pointer to a timespec delta value @@ -672,18 +716,16 @@ static void update_sleep_time(struct timespec t) static void __timekeeping_inject_sleeptime(struct timekeeper *tk, struct timespec *delta) { - if (!timespec_valid(delta)) { + if (!timespec_valid_strict(delta)) { printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " "sleep delta value!\n"); return; } - tk_xtime_add(tk, delta); - tk->wall_to_monotonic = timespec_sub(tk->wall_to_monotonic, *delta); - update_sleep_time(timespec_add(tk->total_sleep_time, *delta)); + tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); + tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); } - /** * timekeeping_inject_sleeptime - Adds suspend interval to timeekeeping values * @delta: pointer to a timespec delta value @@ -696,6 +738,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, */ void timekeeping_inject_sleeptime(struct timespec *delta) { + struct timekeeper *tk = &timekeeper; unsigned long flags; struct timespec ts; @@ -704,21 +747,20 @@ void timekeeping_inject_sleeptime(struct timespec *delta) if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) return; - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); - timekeeping_forward_now(&timekeeper); + timekeeping_forward_now(tk); - __timekeeping_inject_sleeptime(&timekeeper, delta); + __timekeeping_inject_sleeptime(tk, delta); - timekeeping_update(&timekeeper, true); + timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper.lock, flags); + write_sequnlock_irqrestore(&tk->lock, flags); /* signal hrtimers about time change */ clock_was_set(); } - /** * timekeeping_resume - Resumes the generic timekeeping subsystem. * @@ -728,6 +770,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) */ static void timekeeping_resume(void) { + struct timekeeper *tk = &timekeeper; unsigned long flags; struct timespec ts; @@ -735,18 +778,18 @@ static void timekeeping_resume(void) clocksource_resume(); - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { ts = timespec_sub(ts, timekeeping_suspend_time); - __timekeeping_inject_sleeptime(&timekeeper, &ts); + __timekeeping_inject_sleeptime(tk, &ts); } /* re-base the last cycle value */ - timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); - timekeeper.ntp_error = 0; + tk->clock->cycle_last = tk->clock->read(tk->clock); + tk->ntp_error = 0; timekeeping_suspended = 0; - timekeeping_update(&timekeeper, false); - write_sequnlock_irqrestore(&timekeeper.lock, flags); + timekeeping_update(tk, false); + write_sequnlock_irqrestore(&tk->lock, flags); touch_softlockup_watchdog(); @@ -758,14 +801,15 @@ static void timekeeping_resume(void) static int timekeeping_suspend(void) { + struct timekeeper *tk = &timekeeper; unsigned long flags; struct timespec delta, delta_delta; static struct timespec old_delta; read_persistent_clock(&timekeeping_suspend_time); - write_seqlock_irqsave(&timekeeper.lock, flags); - timekeeping_forward_now(&timekeeper); + write_seqlock_irqsave(&tk->lock, flags); + timekeeping_forward_now(tk); timekeeping_suspended = 1; /* @@ -774,7 +818,7 @@ static int timekeeping_suspend(void) * try to compensate so the difference in system time * and persistent_clock time stays close to constant. */ - delta = timespec_sub(tk_xtime(&timekeeper), timekeeping_suspend_time); + delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time); delta_delta = timespec_sub(delta, old_delta); if (abs(delta_delta.tv_sec) >= 2) { /* @@ -787,7 +831,7 @@ static int timekeeping_suspend(void) timekeeping_suspend_time = timespec_add(timekeeping_suspend_time, delta_delta); } - write_sequnlock_irqrestore(&timekeeper.lock, flags); + write_sequnlock_irqrestore(&tk->lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); clocksource_suspend(); @@ -898,27 +942,29 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) * the error. This causes the likely below to be unlikely. * * The proper fix is to avoid rounding up by using - * the high precision timekeeper.xtime_nsec instead of + * the high precision tk->xtime_nsec instead of * xtime.tv_nsec everywhere. Fixing this will take some * time. */ if (likely(error <= interval)) adj = 1; else - adj = timekeeping_bigadjust(tk, error, &interval, - &offset); - } else if (error < -interval) { - /* See comment above, this is just switched for the negative */ - error >>= 2; - if (likely(error >= -interval)) { - adj = -1; - interval = -interval; - offset = -offset; - } else - adj = timekeeping_bigadjust(tk, error, &interval, - &offset); - } else - return; + adj = timekeeping_bigadjust(tk, error, &interval, &offset); + } else { + if (error < -interval) { + /* See comment above, this is just switched for the negative */ + error >>= 2; + if (likely(error >= -interval)) { + adj = -1; + interval = -interval; + offset = -offset; + } else { + adj = timekeeping_bigadjust(tk, error, &interval, &offset); + } + } else { + goto out_adjust; + } + } if (unlikely(tk->clock->maxadj && (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { @@ -981,6 +1027,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) tk->xtime_nsec -= offset; tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; +out_adjust: /* * It may be possible that when we entered this function, xtime_nsec * was very small. Further, if we're slightly speeding the clocksource @@ -1003,7 +1050,6 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) } - /** * accumulate_nsecs_to_secs - Accumulates nsecs into secs * @@ -1024,15 +1070,21 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) /* Figure out if its a leap sec and apply if needed */ leap = second_overflow(tk->xtime_sec); - tk->xtime_sec += leap; - tk->wall_to_monotonic.tv_sec -= leap; - if (leap) - clock_was_set_delayed(); + if (unlikely(leap)) { + struct timespec ts; + + tk->xtime_sec += leap; + + ts.tv_sec = leap; + ts.tv_nsec = 0; + tk_set_wall_to_mono(tk, + timespec_sub(tk->wall_to_monotonic, ts)); + clock_was_set_delayed(); + } } } - /** * logarithmic_accumulation - shifted accumulation of cycles * @@ -1076,7 +1128,6 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, return offset; } - /** * update_wall_time - Uses the current clocksource to increment the wall time * @@ -1084,25 +1135,30 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, static void update_wall_time(void) { struct clocksource *clock; + struct timekeeper *tk = &timekeeper; cycle_t offset; int shift = 0, maxshift; unsigned long flags; s64 remainder; - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) goto out; - clock = timekeeper.clock; + clock = tk->clock; #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET - offset = timekeeper.cycle_interval; + offset = tk->cycle_interval; #else offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #endif + /* Check if there's really nothing to do */ + if (offset < tk->cycle_interval) + goto out; + /* * With NO_HZ we may have to accumulate many cycle_intervals * (think "ticks") worth of time at once. To do this efficiently, @@ -1111,19 +1167,19 @@ static void update_wall_time(void) * chunk in one go, and then try to consume the next smaller * doubled multiple. */ - shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); + shift = ilog2(offset) - ilog2(tk->cycle_interval); shift = max(0, shift); /* Bound shift to one less than what overflows tick_length */ maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; shift = min(shift, maxshift); - while (offset >= timekeeper.cycle_interval) { - offset = logarithmic_accumulation(&timekeeper, offset, shift); - if(offset < timekeeper.cycle_interval<<shift) + while (offset >= tk->cycle_interval) { + offset = logarithmic_accumulation(tk, offset, shift); + if (offset < tk->cycle_interval<<shift) shift--; } /* correct the clock when NTP error is too big */ - timekeeping_adjust(&timekeeper, offset); + timekeeping_adjust(tk, offset); /* @@ -1135,21 +1191,21 @@ static void update_wall_time(void) * the vsyscall implementations are converted to use xtime_nsec * (shifted nanoseconds), this can be killed. */ - remainder = timekeeper.xtime_nsec & ((1 << timekeeper.shift) - 1); - timekeeper.xtime_nsec -= remainder; - timekeeper.xtime_nsec += 1 << timekeeper.shift; - timekeeper.ntp_error += remainder << timekeeper.ntp_error_shift; + remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); + tk->xtime_nsec -= remainder; + tk->xtime_nsec += 1ULL << tk->shift; + tk->ntp_error += remainder << tk->ntp_error_shift; /* * Finally, make sure that after the rounding * xtime_nsec isn't larger than NSEC_PER_SEC */ - accumulate_nsecs_to_secs(&timekeeper); + accumulate_nsecs_to_secs(tk); - timekeeping_update(&timekeeper, false); + timekeeping_update(tk, false); out: - write_sequnlock_irqrestore(&timekeeper.lock, flags); + write_sequnlock_irqrestore(&tk->lock, flags); } @@ -1166,18 +1222,18 @@ out: */ void getboottime(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; struct timespec boottime = { - .tv_sec = timekeeper.wall_to_monotonic.tv_sec + - timekeeper.total_sleep_time.tv_sec, - .tv_nsec = timekeeper.wall_to_monotonic.tv_nsec + - timekeeper.total_sleep_time.tv_nsec + .tv_sec = tk->wall_to_monotonic.tv_sec + + tk->total_sleep_time.tv_sec, + .tv_nsec = tk->wall_to_monotonic.tv_nsec + + tk->total_sleep_time.tv_nsec }; set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); } EXPORT_SYMBOL_GPL(getboottime); - /** * get_monotonic_boottime - Returns monotonic time since boot * @ts: pointer to the timespec to be set @@ -1189,22 +1245,25 @@ EXPORT_SYMBOL_GPL(getboottime); */ void get_monotonic_boottime(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; struct timespec tomono, sleep; + s64 nsec; unsigned int seq; WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper.lock); - ts->tv_sec = timekeeper.xtime_sec; - ts->tv_nsec = timekeeping_get_ns(&timekeeper); - tomono = timekeeper.wall_to_monotonic; - sleep = timekeeper.total_sleep_time; + seq = read_seqbegin(&tk->lock); + ts->tv_sec = tk->xtime_sec; + nsec = timekeeping_get_ns(tk); + tomono = tk->wall_to_monotonic; + sleep = tk->total_sleep_time; - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); - set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, - ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec); + ts->tv_sec += tomono.tv_sec + sleep.tv_sec; + ts->tv_nsec = 0; + timespec_add_ns(ts, nsec + tomono.tv_nsec + sleep.tv_nsec); } EXPORT_SYMBOL_GPL(get_monotonic_boottime); @@ -1231,31 +1290,38 @@ EXPORT_SYMBOL_GPL(ktime_get_boottime); */ void monotonic_to_bootbased(struct timespec *ts) { - *ts = timespec_add(*ts, timekeeper.total_sleep_time); + struct timekeeper *tk = &timekeeper; + + *ts = timespec_add(*ts, tk->total_sleep_time); } EXPORT_SYMBOL_GPL(monotonic_to_bootbased); unsigned long get_seconds(void) { - return timekeeper.xtime_sec; + struct timekeeper *tk = &timekeeper; + + return tk->xtime_sec; } EXPORT_SYMBOL(get_seconds); struct timespec __current_kernel_time(void) { - return tk_xtime(&timekeeper); + struct timekeeper *tk = &timekeeper; + + return tk_xtime(tk); } struct timespec current_kernel_time(void) { + struct timekeeper *tk = &timekeeper; struct timespec now; unsigned long seq; do { - seq = read_seqbegin(&timekeeper.lock); + seq = read_seqbegin(&tk->lock); - now = tk_xtime(&timekeeper); - } while (read_seqretry(&timekeeper.lock, seq)); + now = tk_xtime(tk); + } while (read_seqretry(&tk->lock, seq)); return now; } @@ -1263,15 +1329,16 @@ EXPORT_SYMBOL(current_kernel_time); struct timespec get_monotonic_coarse(void) { + struct timekeeper *tk = &timekeeper; struct timespec now, mono; unsigned long seq; do { - seq = read_seqbegin(&timekeeper.lock); + seq = read_seqbegin(&tk->lock); - now = tk_xtime(&timekeeper); - mono = timekeeper.wall_to_monotonic; - } while (read_seqretry(&timekeeper.lock, seq)); + now = tk_xtime(tk); + mono = tk->wall_to_monotonic; + } while (read_seqretry(&tk->lock, seq)); set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); @@ -1300,14 +1367,15 @@ void do_timer(unsigned long ticks) void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, struct timespec *wtom, struct timespec *sleep) { + struct timekeeper *tk = &timekeeper; unsigned long seq; do { - seq = read_seqbegin(&timekeeper.lock); - *xtim = tk_xtime(&timekeeper); - *wtom = timekeeper.wall_to_monotonic; - *sleep = timekeeper.total_sleep_time; - } while (read_seqretry(&timekeeper.lock, seq)); + seq = read_seqbegin(&tk->lock); + *xtim = tk_xtime(tk); + *wtom = tk->wall_to_monotonic; + *sleep = tk->total_sleep_time; + } while (read_seqretry(&tk->lock, seq)); } #ifdef CONFIG_HIGH_RES_TIMERS @@ -1321,19 +1389,20 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, */ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) { + struct timekeeper *tk = &timekeeper; ktime_t now; unsigned int seq; u64 secs, nsecs; do { - seq = read_seqbegin(&timekeeper.lock); + seq = read_seqbegin(&tk->lock); - secs = timekeeper.xtime_sec; - nsecs = timekeeping_get_ns(&timekeeper); + secs = tk->xtime_sec; + nsecs = timekeeping_get_ns(tk); - *offs_real = timekeeper.offs_real; - *offs_boot = timekeeper.offs_boot; - } while (read_seqretry(&timekeeper.lock, seq)); + *offs_real = tk->offs_real; + *offs_boot = tk->offs_boot; + } while (read_seqretry(&tk->lock, seq)); now = ktime_add_ns(ktime_set(secs, 0), nsecs); now = ktime_sub(now, *offs_real); @@ -1346,19 +1415,19 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) */ ktime_t ktime_get_monotonic_offset(void) { + struct timekeeper *tk = &timekeeper; unsigned long seq; struct timespec wtom; do { - seq = read_seqbegin(&timekeeper.lock); - wtom = timekeeper.wall_to_monotonic; - } while (read_seqretry(&timekeeper.lock, seq)); + seq = read_seqbegin(&tk->lock); + wtom = tk->wall_to_monotonic; + } while (read_seqretry(&tk->lock, seq)); return timespec_to_ktime(wtom); } EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); - /** * xtime_update() - advances the timekeeping infrastructure * @ticks: number of ticks, that have elapsed since the last call. diff --git a/kernel/timer.c b/kernel/timer.c index a61c09374eb..8c5e7b908c6 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1407,13 +1407,6 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds) #endif -#ifndef __alpha__ - -/* - * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this - * should be moved into arch/i386 instead? - */ - /** * sys_getpid - return the thread group id of the current process * @@ -1469,8 +1462,6 @@ SYSCALL_DEFINE0(getegid) return from_kgid_munged(current_user_ns(), current_egid()); } -#endif - static void process_timeout(unsigned long __data) { wake_up_process((struct task_struct *)__data); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index fee3752ae8f..8a6d2ee2086 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -281,7 +281,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip) head = this_cpu_ptr(event_function.perf_events); perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, - 1, ®s, head); + 1, ®s, head, NULL); #undef ENTRY_SIZE } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b31d3d5699f..1a2117043bb 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1002,7 +1002,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); + perf_trace_buf_submit(entry, size, rctx, + entry->ip, 1, regs, head, NULL); } /* Kretprobe profile handler */ @@ -1033,7 +1034,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); + perf_trace_buf_submit(entry, size, rctx, + entry->ret_ip, 1, regs, head, NULL); } #endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 96fc7336909..6b245f64c8d 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -506,6 +506,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) int size; syscall_nr = syscall_get_nr(current, regs); + if (syscall_nr < 0) + return; if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) return; @@ -532,7 +534,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) (unsigned long *)&rec->args); head = this_cpu_ptr(sys_data->enter_event->perf_events); - perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } int perf_sysenter_enable(struct ftrace_event_call *call) @@ -580,6 +582,8 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) int size; syscall_nr = syscall_get_nr(current, regs); + if (syscall_nr < 0) + return; if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) return; @@ -608,7 +612,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->ret = syscall_get_return_value(current, regs); head = this_cpu_ptr(sys_data->exit_event->perf_events); - perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } int perf_sysexit_enable(struct ftrace_event_call *call) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2b36ac68549..03003cd7dd9 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -670,7 +670,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); + perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL); out: preempt_enable(); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 69add8a9da6..9d4c8d5a1f5 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -22,6 +22,7 @@ #include <linux/notifier.h> #include <linux/module.h> #include <linux/sysctl.h> +#include <linux/smpboot.h> #include <asm/irq_regs.h> #include <linux/kvm_para.h> @@ -29,16 +30,18 @@ int watchdog_enabled = 1; int __read_mostly watchdog_thresh = 10; +static int __read_mostly watchdog_disabled; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(bool, soft_watchdog_warn); +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); +static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); #ifdef CONFIG_HARDLOCKUP_DETECTOR static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); -static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); #endif @@ -248,13 +251,15 @@ static void watchdog_overflow_callback(struct perf_event *event, __this_cpu_write(hard_watchdog_warn, false); return; } +#endif /* CONFIG_HARDLOCKUP_DETECTOR */ + static void watchdog_interrupt_count(void) { __this_cpu_inc(hrtimer_interrupts); } -#else -static inline void watchdog_interrupt_count(void) { return; } -#endif /* CONFIG_HARDLOCKUP_DETECTOR */ + +static int watchdog_nmi_enable(unsigned int cpu); +static void watchdog_nmi_disable(unsigned int cpu); /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) @@ -327,49 +332,68 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) return HRTIMER_RESTART; } +static void watchdog_set_prio(unsigned int policy, unsigned int prio) +{ + struct sched_param param = { .sched_priority = prio }; -/* - * The watchdog thread - touches the timestamp. - */ -static int watchdog(void *unused) + sched_setscheduler(current, policy, ¶m); +} + +static void watchdog_enable(unsigned int cpu) { - struct sched_param param = { .sched_priority = 0 }; struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); - /* initialize timestamp */ - __touch_watchdog(); + if (!watchdog_enabled) { + kthread_park(current); + return; + } + + /* Enable the perf event */ + watchdog_nmi_enable(cpu); /* kick off the timer for the hardlockup detector */ + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer->function = watchdog_timer_fn; + /* done here because hrtimer_start can only pin to smp_processor_id() */ hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), HRTIMER_MODE_REL_PINNED); - set_current_state(TASK_INTERRUPTIBLE); - /* - * Run briefly (kicked by the hrtimer callback function) once every - * get_sample_period() seconds (4 seconds by default) to reset the - * softlockup timestamp. If this gets delayed for more than - * 2*watchdog_thresh seconds then the debug-printout triggers in - * watchdog_timer_fn(). - */ - while (!kthread_should_stop()) { - __touch_watchdog(); - schedule(); + /* initialize timestamp */ + watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); + __touch_watchdog(); +} + +static void watchdog_disable(unsigned int cpu) +{ + struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); - if (kthread_should_stop()) - break; + watchdog_set_prio(SCHED_NORMAL, 0); + hrtimer_cancel(hrtimer); + /* disable the perf event */ + watchdog_nmi_disable(cpu); +} - set_current_state(TASK_INTERRUPTIBLE); - } - /* - * Drop the policy/priority elevation during thread exit to avoid a - * scheduling latency spike. - */ - __set_current_state(TASK_RUNNING); - sched_setscheduler(current, SCHED_NORMAL, ¶m); - return 0; +static int watchdog_should_run(unsigned int cpu) +{ + return __this_cpu_read(hrtimer_interrupts) != + __this_cpu_read(soft_lockup_hrtimer_cnt); } +/* + * The watchdog thread function - touches the timestamp. + * + * It only runs once every get_sample_period() seconds (4 seconds by + * default) to reset the softlockup timestamp. If this gets delayed + * for more than 2*watchdog_thresh seconds then the debug-printout + * triggers in watchdog_timer_fn(). + */ +static void watchdog(unsigned int cpu) +{ + __this_cpu_write(soft_lockup_hrtimer_cnt, + __this_cpu_read(hrtimer_interrupts)); + __touch_watchdog(); +} #ifdef CONFIG_HARDLOCKUP_DETECTOR /* @@ -379,7 +403,7 @@ static int watchdog(void *unused) */ static unsigned long cpu0_err; -static int watchdog_nmi_enable(int cpu) +static int watchdog_nmi_enable(unsigned int cpu) { struct perf_event_attr *wd_attr; struct perf_event *event = per_cpu(watchdog_ev, cpu); @@ -433,7 +457,7 @@ out: return 0; } -static void watchdog_nmi_disable(int cpu) +static void watchdog_nmi_disable(unsigned int cpu) { struct perf_event *event = per_cpu(watchdog_ev, cpu); @@ -447,107 +471,35 @@ static void watchdog_nmi_disable(int cpu) return; } #else -static int watchdog_nmi_enable(int cpu) { return 0; } -static void watchdog_nmi_disable(int cpu) { return; } +static int watchdog_nmi_enable(unsigned int cpu) { return 0; } +static void watchdog_nmi_disable(unsigned int cpu) { return; } #endif /* CONFIG_HARDLOCKUP_DETECTOR */ /* prepare/enable/disable routines */ -static void watchdog_prepare_cpu(int cpu) -{ - struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); - - WARN_ON(per_cpu(softlockup_watchdog, cpu)); - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtimer->function = watchdog_timer_fn; -} - -static int watchdog_enable(int cpu) -{ - struct task_struct *p = per_cpu(softlockup_watchdog, cpu); - int err = 0; - - /* enable the perf event */ - err = watchdog_nmi_enable(cpu); - - /* Regardless of err above, fall through and start softlockup */ - - /* create the watchdog thread */ - if (!p) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); - if (IS_ERR(p)) { - pr_err("softlockup watchdog for %i failed\n", cpu); - if (!err) { - /* if hardlockup hasn't already set this */ - err = PTR_ERR(p); - /* and disable the perf event */ - watchdog_nmi_disable(cpu); - } - goto out; - } - sched_setscheduler(p, SCHED_FIFO, ¶m); - kthread_bind(p, cpu); - per_cpu(watchdog_touch_ts, cpu) = 0; - per_cpu(softlockup_watchdog, cpu) = p; - wake_up_process(p); - } - -out: - return err; -} - -static void watchdog_disable(int cpu) -{ - struct task_struct *p = per_cpu(softlockup_watchdog, cpu); - struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); - - /* - * cancel the timer first to stop incrementing the stats - * and waking up the kthread - */ - hrtimer_cancel(hrtimer); - - /* disable the perf event */ - watchdog_nmi_disable(cpu); - - /* stop the watchdog thread */ - if (p) { - per_cpu(softlockup_watchdog, cpu) = NULL; - kthread_stop(p); - } -} - /* sysctl functions */ #ifdef CONFIG_SYSCTL static void watchdog_enable_all_cpus(void) { - int cpu; - - watchdog_enabled = 0; - - for_each_online_cpu(cpu) - if (!watchdog_enable(cpu)) - /* if any cpu succeeds, watchdog is considered - enabled for the system */ - watchdog_enabled = 1; - - if (!watchdog_enabled) - pr_err("failed to be enabled on some cpus\n"); + unsigned int cpu; + if (watchdog_disabled) { + watchdog_disabled = 0; + for_each_online_cpu(cpu) + kthread_unpark(per_cpu(softlockup_watchdog, cpu)); + } } static void watchdog_disable_all_cpus(void) { - int cpu; + unsigned int cpu; - for_each_online_cpu(cpu) - watchdog_disable(cpu); - - /* if all watchdogs are disabled, then they are disabled for the system */ - watchdog_enabled = 0; + if (!watchdog_disabled) { + watchdog_disabled = 1; + for_each_online_cpu(cpu) + kthread_park(per_cpu(softlockup_watchdog, cpu)); + } } - /* * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh */ @@ -557,90 +509,36 @@ int proc_dowatchdog(struct ctl_table *table, int write, { int ret; + if (watchdog_disabled < 0) + return -ENODEV; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) - goto out; + return ret; if (watchdog_enabled && watchdog_thresh) watchdog_enable_all_cpus(); else watchdog_disable_all_cpus(); -out: return ret; } #endif /* CONFIG_SYSCTL */ - -/* - * Create/destroy watchdog threads as CPUs come and go: - */ -static int -cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int hotcpu = (unsigned long)hcpu; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - watchdog_prepare_cpu(hotcpu); - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - if (watchdog_enabled) - watchdog_enable(hotcpu); - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - watchdog_disable(hotcpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - watchdog_disable(hotcpu); - break; -#endif /* CONFIG_HOTPLUG_CPU */ - } - - /* - * hardlockup and softlockup are not important enough - * to block cpu bring up. Just always succeed and - * rely on printk output to flag problems. - */ - return NOTIFY_OK; -} - -static struct notifier_block cpu_nfb = { - .notifier_call = cpu_callback +static struct smp_hotplug_thread watchdog_threads = { + .store = &softlockup_watchdog, + .thread_should_run = watchdog_should_run, + .thread_fn = watchdog, + .thread_comm = "watchdog/%u", + .setup = watchdog_enable, + .park = watchdog_disable, + .unpark = watchdog_enable, }; -#ifdef CONFIG_SUSPEND -/* - * On exit from suspend we force an offline->online transition on the boot CPU - * so that the PMU state that was lost while in suspended state gets set up - * properly for the boot CPU. This information is required for restarting the - * NMI watchdog. - */ -void lockup_detector_bootcpu_resume(void) -{ - void *cpu = (void *)(long)smp_processor_id(); - - cpu_callback(&cpu_nfb, CPU_DEAD_FROZEN, cpu); - cpu_callback(&cpu_nfb, CPU_UP_PREPARE_FROZEN, cpu); - cpu_callback(&cpu_nfb, CPU_ONLINE_FROZEN, cpu); -} -#endif - void __init lockup_detector_init(void) { - void *cpu = (void *)(long)smp_processor_id(); - int err; - - err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - WARN_ON(notifier_to_errno(err)); - - cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); - register_cpu_notifier(&cpu_nfb); - - return; + if (smpboot_register_percpu_thread(&watchdog_threads)) { + pr_err("Failed to create watchdog threads, disabled\n"); + watchdog_disabled = -ENODEV; + } } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 692d97628a1..3c5a79e2134 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -66,6 +66,7 @@ enum { /* pool flags */ POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ + POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ /* worker flags */ WORKER_STARTED = 1 << 0, /* started */ @@ -652,7 +653,7 @@ static bool need_to_manage_workers(struct worker_pool *pool) /* Do we have too many workers and should some go away? */ static bool too_many_workers(struct worker_pool *pool) { - bool managing = mutex_is_locked(&pool->manager_mutex); + bool managing = pool->flags & POOL_MANAGING_WORKERS; int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; @@ -1326,6 +1327,15 @@ static void idle_worker_rebind(struct worker *worker) /* we did our part, wait for rebind_workers() to finish up */ wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); + + /* + * rebind_workers() shouldn't finish until all workers passed the + * above WORKER_REBIND wait. Tell it when done. + */ + spin_lock_irq(&worker->pool->gcwq->lock); + if (!--worker->idle_rebind->cnt) + complete(&worker->idle_rebind->done); + spin_unlock_irq(&worker->pool->gcwq->lock); } /* @@ -1339,8 +1349,16 @@ static void busy_worker_rebind_fn(struct work_struct *work) struct worker *worker = container_of(work, struct worker, rebind_work); struct global_cwq *gcwq = worker->pool->gcwq; - if (worker_maybe_bind_and_lock(worker)) - worker_clr_flags(worker, WORKER_REBIND); + worker_maybe_bind_and_lock(worker); + + /* + * %WORKER_REBIND must be cleared even if the above binding failed; + * otherwise, we may confuse the next CPU_UP cycle or oops / get + * stuck by calling idle_worker_rebind() prematurely. If CPU went + * down again inbetween, %WORKER_UNBOUND would be set, so clearing + * %WORKER_REBIND is always safe. + */ + worker_clr_flags(worker, WORKER_REBIND); spin_unlock_irq(&gcwq->lock); } @@ -1396,12 +1414,15 @@ retry: /* set REBIND and kick idle ones, we'll wait for these later */ for_each_worker_pool(pool, gcwq) { list_for_each_entry(worker, &pool->idle_list, entry) { + unsigned long worker_flags = worker->flags; + if (worker->flags & WORKER_REBIND) continue; - /* morph UNBOUND to REBIND */ - worker->flags &= ~WORKER_UNBOUND; - worker->flags |= WORKER_REBIND; + /* morph UNBOUND to REBIND atomically */ + worker_flags &= ~WORKER_UNBOUND; + worker_flags |= WORKER_REBIND; + ACCESS_ONCE(worker->flags) = worker_flags; idle_rebind.cnt++; worker->idle_rebind = &idle_rebind; @@ -1419,25 +1440,15 @@ retry: goto retry; } - /* - * All idle workers are rebound and waiting for %WORKER_REBIND to - * be cleared inside idle_worker_rebind(). Clear and release. - * Clearing %WORKER_REBIND from this foreign context is safe - * because these workers are still guaranteed to be idle. - */ - for_each_worker_pool(pool, gcwq) - list_for_each_entry(worker, &pool->idle_list, entry) - worker->flags &= ~WORKER_REBIND; - - wake_up_all(&gcwq->rebind_hold); - - /* rebind busy workers */ + /* all idle workers are rebound, rebind busy workers */ for_each_busy_worker(worker, i, pos, gcwq) { struct work_struct *rebind_work = &worker->rebind_work; + unsigned long worker_flags = worker->flags; - /* morph UNBOUND to REBIND */ - worker->flags &= ~WORKER_UNBOUND; - worker->flags |= WORKER_REBIND; + /* morph UNBOUND to REBIND atomically */ + worker_flags &= ~WORKER_UNBOUND; + worker_flags |= WORKER_REBIND; + ACCESS_ONCE(worker->flags) = worker_flags; if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(rebind_work))) @@ -1449,6 +1460,34 @@ retry: worker->scheduled.next, work_color_to_flags(WORK_NO_COLOR)); } + + /* + * All idle workers are rebound and waiting for %WORKER_REBIND to + * be cleared inside idle_worker_rebind(). Clear and release. + * Clearing %WORKER_REBIND from this foreign context is safe + * because these workers are still guaranteed to be idle. + * + * We need to make sure all idle workers passed WORKER_REBIND wait + * in idle_worker_rebind() before returning; otherwise, workers can + * get stuck at the wait if hotplug cycle repeats. + */ + idle_rebind.cnt = 1; + INIT_COMPLETION(idle_rebind.done); + + for_each_worker_pool(pool, gcwq) { + list_for_each_entry(worker, &pool->idle_list, entry) { + worker->flags &= ~WORKER_REBIND; + idle_rebind.cnt++; + } + } + + wake_up_all(&gcwq->rebind_hold); + + if (--idle_rebind.cnt) { + spin_unlock_irq(&gcwq->lock); + wait_for_completion(&idle_rebind.done); + spin_lock_irq(&gcwq->lock); + } } static struct worker *alloc_worker(void) @@ -1794,9 +1833,45 @@ static bool manage_workers(struct worker *worker) struct worker_pool *pool = worker->pool; bool ret = false; - if (!mutex_trylock(&pool->manager_mutex)) + if (pool->flags & POOL_MANAGING_WORKERS) return ret; + pool->flags |= POOL_MANAGING_WORKERS; + + /* + * To simplify both worker management and CPU hotplug, hold off + * management while hotplug is in progress. CPU hotplug path can't + * grab %POOL_MANAGING_WORKERS to achieve this because that can + * lead to idle worker depletion (all become busy thinking someone + * else is managing) which in turn can result in deadlock under + * extreme circumstances. Use @pool->manager_mutex to synchronize + * manager against CPU hotplug. + * + * manager_mutex would always be free unless CPU hotplug is in + * progress. trylock first without dropping @gcwq->lock. + */ + if (unlikely(!mutex_trylock(&pool->manager_mutex))) { + spin_unlock_irq(&pool->gcwq->lock); + mutex_lock(&pool->manager_mutex); + /* + * CPU hotplug could have happened while we were waiting + * for manager_mutex. Hotplug itself can't handle us + * because manager isn't either on idle or busy list, and + * @gcwq's state and ours could have deviated. + * + * As hotplug is now excluded via manager_mutex, we can + * simply try to bind. It will succeed or fail depending + * on @gcwq's current state. Try it and adjust + * %WORKER_UNBOUND accordingly. + */ + if (worker_maybe_bind_and_lock(worker)) + worker->flags &= ~WORKER_UNBOUND; + else + worker->flags |= WORKER_UNBOUND; + + ret = true; + } + pool->flags &= ~POOL_MANAGE_WORKERS; /* @@ -1806,6 +1881,7 @@ static bool manage_workers(struct worker *worker) ret |= maybe_destroy_workers(pool); ret |= maybe_create_worker(pool); + pool->flags &= ~POOL_MANAGING_WORKERS; mutex_unlock(&pool->manager_mutex); return ret; } @@ -3500,18 +3576,17 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, #ifdef CONFIG_SMP struct work_for_cpu { - struct completion completion; + struct work_struct work; long (*fn)(void *); void *arg; long ret; }; -static int do_work_for_cpu(void *_wfc) +static void work_for_cpu_fn(struct work_struct *work) { - struct work_for_cpu *wfc = _wfc; + struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work); + wfc->ret = wfc->fn(wfc->arg); - complete(&wfc->completion); - return 0; } /** @@ -3526,19 +3601,11 @@ static int do_work_for_cpu(void *_wfc) */ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) { - struct task_struct *sub_thread; - struct work_for_cpu wfc = { - .completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion), - .fn = fn, - .arg = arg, - }; + struct work_for_cpu wfc = { .fn = fn, .arg = arg }; - sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu"); - if (IS_ERR(sub_thread)) - return PTR_ERR(sub_thread); - kthread_bind(sub_thread, cpu); - wake_up_process(sub_thread); - wait_for_completion(&wfc.completion); + INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); + schedule_work_on(cpu, &wfc.work); + flush_work(&wfc.work); return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu); |