From 1d1e97562e5e2ac60fb7b25437ba619f95f67fab Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Thu, 26 Feb 2009 18:27:38 -0600 Subject: keys: distinguish per-uid keys in different namespaces Per-uid keys were looked up by uid only. Use the user namespace to distinguish the same uid in different namespaces. This does not address key_permission. So a task can, for instance, try to join a keyring owned by the same uid in another namespace. That will be handled by a separate patch. Signed-off-by: Serge E. Hallyn Acked-by: David Howells Signed-off-by: James Morris --- kernel/user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 477b6660f44..d8b332c3ae3 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -20,7 +20,7 @@ struct user_namespace init_user_ns = { .kref = { - .refcount = ATOMIC_INIT(1), + .refcount = ATOMIC_INIT(2), }, .creator = &root_user, }; -- cgit v1.2.3 From 64ca5ab913f1594ef316556e65f5eae63ff50cee Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Mar 2009 12:11:56 -0800 Subject: rcu: increment quiescent state counter in ksoftirqd() If a machine is flooded by network frames, a cpu can loop 100% of its time inside ksoftirqd() without calling schedule(). This can delay RCU grace periods to insane values. Adding an rcu_qsctr_inc() call in ksoftirqd() solves this problem. Paul: "This regression was a result of the recent change from "schedule()" to "cond_resched()", which got rid of that quiescent state in the common case where a reschedule is not needed". Signed-off-by: Eric Dumazet Reviewed-by: Paul E. 
McKenney Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/softirq.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index bdbe9de9cd8..9041ea7948f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -626,6 +626,7 @@ static int ksoftirqd(void * __bind_cpu) preempt_enable_no_resched(); cond_resched(); preempt_disable(); + rcu_qsctr_inc((long)__bind_cpu); } preempt_enable(); set_current_state(TASK_INTERRUPTIBLE); -- cgit v1.2.3 From 6d5b5acca9e566515ef3f1ed617e7295c4f94345 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 9 Mar 2009 13:31:59 +0100 Subject: Fix fixpoint divide exception in acct_update_integrals Frans Pop reported the crash below when running an s390 kernel under Hercules: Kernel BUG at 000738b4 verbose debug info unavailable! fixpoint divide exception: 0009 #1! SMP Modules linked in: nfs lockd nfs_acl sunrpc ctcm fsm tape_34xx cu3088 tape ccwgroup tape_class ext3 jbd mbcache dm_mirror dm_log dm_snapshot dm_mod dasd_eckd_mod dasd_mod CPU: 0 Not tainted 2.6.27.19 #13 Process awk (pid: 2069, task: 0f9ed9b8, ksp: 0f4f7d18) Krnl PSW : 070c1000 800738b4 (acct_update_integrals+0x4c/0x118) R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:0 CC:1 PM:0 Krnl GPRS: 00000000 000007d0 7fffffff fffff830 00000000 ffffffff 00000002 0f9ed9b8 00000000 00008ca0 00000000 0f9ed9b8 0f9edda4 8007386e 0f4f7ec8 0f4f7e98 Krnl Code: 800738aa: a71807d0 lhi %r1,2000 800738ae: 8c200001 srdl %r2,1 800738b2: 1d21 dr %r2,%r1 >800738b4: 5810d10e l %r1,270(%r13) 800738b8: 1823 lr %r2,%r3 800738ba: 4130f060 la %r3,96(%r15) 800738be: 0de1 basr %r14,%r1 800738c0: 5800f060 l %r0,96(%r15) Call Trace: ( <000000000004fdea>! blocking_notifier_call_chain+0x1e/0x2c) <0000000000038502>! do_exit+0x106/0x7c0 <0000000000038c36>! do_group_exit+0x7a/0xb4 <0000000000038c8e>! SyS_exit_group+0x1e/0x30 <0000000000021c28>! sysc_do_restart+0x12/0x16 <0000000077e7e924>! 
0x77e7e924 The reason for this is that cpu time accounting usually only happens from interrupt context, but acct_update_integrals also gets called from process context with interrupts enabled. So in acct_update_integrals we may end up with the following scenario: Between reading tsk->stime/tsk->utime and tsk->acct_timexpd an interrupt happens which updates accounting values. This causes acct_timexpd to be greater than the former stime + utime. The subsequent calculation of dtime = cputime_sub(time, tsk->acct_timexpd); will be negative and the division performed by cputime_to_jiffies(dtime) will generate an exception since the result won't fit into a 32-bit register. In order to fix this just always disable interrupts while accessing any of the accounting values. Reported by: Frans Pop Tested by: Frans Pop Cc: stable@kernel.org Cc: Martin Schwidefsky Signed-off-by: Heiko Carstens Signed-off-by: Linus Torvalds --- kernel/tsacct.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 43f891b05a4..00d59d048ed 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -122,8 +122,10 @@ void acct_update_integrals(struct task_struct *tsk) if (likely(tsk->mm)) { cputime_t time, dtime; struct timeval value; + unsigned long flags; u64 delta; + local_irq_save(flags); time = tsk->stime + tsk->utime; dtime = cputime_sub(time, tsk->acct_timexpd); jiffies_to_timeval(cputime_to_jiffies(dtime), &value); @@ -131,10 +133,12 @@ void acct_update_integrals(struct task_struct *tsk) delta = delta * USEC_PER_SEC + value.tv_usec; if (delta == 0) - return; + goto out; tsk->acct_timexpd = time; tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; + out: + local_irq_restore(flags); } } -- cgit v1.2.3 From 2d5516cbb9daf7d0e342a2e3b0fc6f8c39a81205 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 2 Mar 2009 22:58:45 +0100 Subject: copy_process: fix CLONE_PARENT && 
parent_exec_id interaction CLONE_PARENT can fool the ->self_exec_id/parent_exec_id logic. If we re-use the old parent, we must also re-use ->parent_exec_id to make sure exit_notify() sees the right ->xxx_exec_id's when the CLONE_PARENT'ed task exits. Also, move down the "p->parent_exec_id = p->self_exec_id" thing, to place the two different cases together. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Andrew Morton Cc: David Howells Cc: Serge E. Hallyn Signed-off-by: Linus Torvalds --- kernel/fork.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a66fbde2071..4854c2c4a82 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1179,10 +1179,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif clear_all_latency_tracing(p); - /* Our parent execution domain becomes current domain - These must match for thread signalling to apply */ - p->parent_exec_id = p->self_exec_id; - /* ok, now we should be set up.. */ p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); p->pdeath_signal = 0; @@ -1220,10 +1216,13 @@ static struct task_struct *copy_process(unsigned long clone_flags, set_task_cpu(p, smp_processor_id()); /* CLONE_PARENT re-uses the old parent */ - if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) + if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; - else + p->parent_exec_id = current->parent_exec_id; + } else { p->real_parent = current; + p->parent_exec_id = current->self_exec_id; + } spin_lock(&current->sighand->siglock); -- cgit v1.2.3 From be50b8342dead8cacf57d4839240106b225d31f5 Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Tue, 10 Mar 2009 12:55:56 -0700 Subject: kernel/user.c: fix a memory leak when freeing up non-init usernamespaces users We were returning early in the sysfs directory cleanup function if the user belonged to a non-init usernamespace. 
Due to this a lot of the cleanup was not done and we were left with a leak. Fix the leak. Reported-by: Serge Hallyn Signed-off-by: Dhaval Giani Acked-by: Serge Hallyn Tested-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/user.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 6a9b696128c..fbb300e6191 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -286,14 +286,12 @@ int __init uids_sysfs_init(void) /* work function to remove sysfs directory for a user and free up * corresponding structures. */ -static void remove_user_sysfs_dir(struct work_struct *w) +static void cleanup_user_struct(struct work_struct *w) { struct user_struct *up = container_of(w, struct user_struct, work); unsigned long flags; int remove_user = 0; - if (up->user_ns != &init_user_ns) - return; /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del() * atomic. */ @@ -312,9 +310,11 @@ static void remove_user_sysfs_dir(struct work_struct *w) if (!remove_user) goto done; - kobject_uevent(&up->kobj, KOBJ_REMOVE); - kobject_del(&up->kobj); - kobject_put(&up->kobj); + if (up->user_ns == &init_user_ns) { + kobject_uevent(&up->kobj, KOBJ_REMOVE); + kobject_del(&up->kobj); + kobject_put(&up->kobj); + } sched_destroy_user(up); key_put(up->uid_keyring); @@ -335,7 +335,7 @@ static void free_user(struct user_struct *up, unsigned long flags) atomic_inc(&up->__count); spin_unlock_irqrestore(&uidhash_lock, flags); - INIT_WORK(&up->work, remove_user_sysfs_dir); + INIT_WORK(&up->work, cleanup_user_struct); schedule_work(&up->work); } -- cgit v1.2.3 From 6e2b75740bed35df98b8113300579e13ed2ce848 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 16 Mar 2009 18:13:36 -0400 Subject: module: fix refptr allocation and release order Impact: fix ref-after-free crash on failed module load Fix refptr bug: Change refptr allocation and release order not to access a module data 
structure pointed to by 'mod' after freeing mod->module_core. This bug will cause a kernel panic (e.g. when module loading fails to find undefined symbols). This bug was reported on the systemtap bugzilla: http://sources.redhat.com/bugzilla/show_bug.cgi?id=9927 Signed-off-by: Masami Hiramatsu Cc: Eric Dumazet Signed-off-by: Rusty Russell --- kernel/module.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index ba22484a987..1196f5d1170 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2015,14 +2015,6 @@ static noinline struct module *load_module(void __user *umod, if (err < 0) goto free_mod; -#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) - mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), - mod->name); - if (!mod->refptr) { - err = -ENOMEM; - goto free_mod; - } -#endif if (pcpuindex) { /* We have a special allocation for this section. */ percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, @@ -2030,7 +2022,7 @@ static noinline struct module *load_module(void __user *umod, mod->name); if (!percpu) { err = -ENOMEM; - goto free_percpu; + goto free_mod; } sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; mod->percpu = percpu; @@ -2082,6 +2074,14 @@ static noinline struct module *load_module(void __user *umod, /* Module has been moved. */ mod = (void *)sechdrs[modindex].sh_addr; +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) + mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), + mod->name); + if (!mod->refptr) { + err = -ENOMEM; + goto free_init; + } +#endif /* Now we've moved module, initialize linked lists, etc. 
*/ module_unload_init(mod); @@ -2288,15 +2288,17 @@ static noinline struct module *load_module(void __user *umod, ftrace_release(mod->module_core, mod->core_size); free_unload: module_unload_free(mod); + free_init: +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) + percpu_modfree(mod->refptr); +#endif module_free(mod, mod->module_init); free_core: module_free(mod, mod->module_core); + /* mod will be freed with core. Don't access it beyond this line! */ free_percpu: if (percpu) percpu_modfree(percpu); -#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) - percpu_modfree(mod->refptr); -#endif free_mod: kfree(args); free_hdr: -- cgit v1.2.3 From 53da1d9456fe7f87a920a78fdbdcf1225d197cb7 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 23 Mar 2009 16:07:24 +0100 Subject: fix ptrace slowness This patch fixes bug #12208: Bug-Entry : http://bugzilla.kernel.org/show_bug.cgi?id=12208 Subject : uml is very slow on 2.6.28 host This turned out to be not a scheduler regression, but an already existing problem in ptrace being triggered by subtle scheduler changes. The problem is this: - task A is ptracing task B - task B stops on a trace event - task A is woken up and preempts task B - task A calls ptrace on task B, which does ptrace_check_attach() - this calls wait_task_inactive(), which sees that task B is still on the runq - task A goes to sleep for a jiffy - ... Since UML does lots of the above sequences, those jiffies quickly add up to make it slow as hell. This patch solves this by not rescheduling in read_unlock() after ptrace_stop() has woken up the tracer. Thanks to Oleg Nesterov and Ingo Molnar for the feedback. 
Signed-off-by: Miklos Szeredi CC: stable@kernel.org Signed-off-by: Linus Torvalds --- kernel/signal.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 2a74fe87c0d..1c8814481a1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1575,7 +1575,15 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) read_lock(&tasklist_lock); if (may_ptrace_stop()) { do_notify_parent_cldstop(current, CLD_TRAPPED); + /* + * Don't want to allow preemption here, because + * sys_ptrace() needs this task to be inactive. + * + * XXX: implement read_unlock_no_resched(). + */ + preempt_disable(); read_unlock(&tasklist_lock); + preempt_enable_no_resched(); schedule(); } else { /* -- cgit v1.2.3 From e9d376f0fa66bd630fe27403669c6ae6c22a868f Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 5 Feb 2009 11:51:38 -0500 Subject: dynamic debug: combine dprintk and dynamic printk This patch combines Greg Banks's dprintk() work with the existing dynamic printk patchset; we are now calling it 'dynamic debug'. The new feature of this patchset is a richer /debugfs control file interface (an example output from my system is at the bottom), which allows fine-grained control over the debug output. The output can be controlled by function, file, module, format string, and line number. For example, to enable all debug messages in module 'nf_conntrack': echo -n 'module nf_conntrack +p' > /mnt/debugfs/dynamic_debug/control To disable them: echo -n 'module nf_conntrack -p' > /mnt/debugfs/dynamic_debug/control A further explanation can be found in the documentation patch. 
Signed-off-by: Greg Banks Signed-off-by: Jason Baron Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 1196f5d1170..77672233387 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -822,7 +822,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, mutex_lock(&module_mutex); /* Store the name of the last unloaded module for diagnostic purposes */ strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); - unregister_dynamic_debug_module(mod->name); + ddebug_remove_module(mod->name); free_module(mod); out: @@ -1827,19 +1827,13 @@ static inline void add_kallsyms(struct module *mod, } #endif /* CONFIG_KALLSYMS */ -static void dynamic_printk_setup(struct mod_debug *debug, unsigned int num) +static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) { -#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG - unsigned int i; - - for (i = 0; i < num; i++) { - register_dynamic_debug_module(debug[i].modname, - debug[i].type, - debug[i].logical_modname, - debug[i].flag_names, - debug[i].hash, debug[i].hash2); - } -#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */ +#ifdef CONFIG_DYNAMIC_DEBUG + if (ddebug_add_module(debug, num, debug->modname)) + printk(KERN_ERR "dynamic debug error adding module: %s\n", + debug->modname); +#endif } static void *module_alloc_update_bounds(unsigned long size) @@ -2213,12 +2207,13 @@ static noinline struct module *load_module(void __user *umod, add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); if (!mod->taints) { - struct mod_debug *debug; + struct _ddebug *debug; unsigned int num_debug; debug = section_objs(hdr, sechdrs, secstrings, "__verbose", sizeof(*debug), &num_debug); - dynamic_printk_setup(debug, num_debug); + if (debug) + dynamic_debug_setup(debug, num_debug); } /* sechdrs[0].sh_size is always zero */ -- cgit v1.2.3