From 1d1e97562e5e2ac60fb7b25437ba619f95f67fab Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Thu, 26 Feb 2009 18:27:38 -0600
Subject: keys: distinguish per-uid keys in different namespaces

per-uid keys were looked by uid only.  Use the user namespace
to distinguish the same uid in different namespaces.

This does not address key_permission.  So a task can for instance
try to join a keyring owned by the same uid in another namespace.
That will be handled by a separate patch.

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/user.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/user.c b/kernel/user.c
index 477b6660f44..d8b332c3ae3 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -20,7 +20,7 @@
 
 struct user_namespace init_user_ns = {
 	.kref = {
-		.refcount	= ATOMIC_INIT(1),
+		.refcount	= ATOMIC_INIT(2),
 	},
 	.creator = &root_user,
 };
-- 
cgit v1.2.3


From 64ca5ab913f1594ef316556e65f5eae63ff50cee Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 4 Mar 2009 12:11:56 -0800
Subject: rcu: increment quiescent state counter in ksoftirqd()

If a machine is flooded by network frames, a cpu can loop
100% of its time inside ksoftirqd() without calling schedule().
This can delay RCU grace period to insane values.

Adding rcu_qsctr_inc() call in ksoftirqd() solves this problem.

Paul: "This regression was a result of the recent change from
"schedule()" to "cond_resched()", which got rid of that quiescent
state in the common case where a reschedule is not needed".

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/softirq.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index bdbe9de9cd8..9041ea7948f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -626,6 +626,7 @@ static int ksoftirqd(void * __bind_cpu)
 			preempt_enable_no_resched();
 			cond_resched();
 			preempt_disable();
+			rcu_qsctr_inc((long)__bind_cpu);
 		}
 		preempt_enable();
 		set_current_state(TASK_INTERRUPTIBLE);
-- 
cgit v1.2.3


From 6d5b5acca9e566515ef3f1ed617e7295c4f94345 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 9 Mar 2009 13:31:59 +0100
Subject: Fix fixpoint divide exception in acct_update_integrals

Frans Pop reported the crash below when running an s390 kernel under Hercules:

  Kernel BUG at 000738b4  verbose debug info unavailable!
  fixpoint divide exception: 0009  #1! SMP
  Modules linked in: nfs lockd nfs_acl sunrpc ctcm fsm tape_34xx
     cu3088 tape ccwgroup tape_class ext3 jbd mbcache dm_mirror dm_log dm_snapshot
     dm_mod dasd_eckd_mod dasd_mod
  CPU: 0 Not tainted 2.6.27.19 #13
  Process awk (pid: 2069, task: 0f9ed9b8, ksp: 0f4f7d18)
  Krnl PSW : 070c1000 800738b4 (acct_update_integrals+0x4c/0x118)
             R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:0 CC:1 PM:0
  Krnl GPRS: 00000000 000007d0 7fffffff fffff830
             00000000 ffffffff 00000002 0f9ed9b8
             00000000 00008ca0 00000000 0f9ed9b8
             0f9edda4 8007386e 0f4f7ec8 0f4f7e98
  Krnl Code: 800738aa: a71807d0         lhi     %r1,2000
             800738ae: 8c200001         srdl    %r2,1
             800738b2: 1d21             dr      %r2,%r1
            >800738b4: 5810d10e         l       %r1,270(%r13)
             800738b8: 1823             lr      %r2,%r3
             800738ba: 4130f060         la      %r3,96(%r15)
             800738be: 0de1             basr    %r14,%r1
             800738c0: 5800f060         l       %r0,96(%r15)
  Call Trace:
  ( <000000000004fdea>! blocking_notifier_call_chain+0x1e/0x2c)
    <0000000000038502>! do_exit+0x106/0x7c0
    <0000000000038c36>! do_group_exit+0x7a/0xb4
    <0000000000038c8e>! SyS_exit_group+0x1e/0x30
    <0000000000021c28>! sysc_do_restart+0x12/0x16
    <0000000077e7e924>! 0x77e7e924

Reason for this is that cpu time accounting usually only happens from
interrupt context, but acct_update_integrals gets also called from
process context with interrupts enabled.

So in acct_update_integrals we may end up with the following scenario:

Between reading tsk->stime/tsk->utime and tsk->acct_timexpd an interrupt
happens which updates accouting values.  This causes acct_timexpd to be
greater than the former stime + utime.  The subsequent calculation of

	dtime = cputime_sub(time, tsk->acct_timexpd);

will be negative and the division performed by

	cputime_to_jiffies(dtime)

will generate an exception since the result won't fit into a 32 bit
register.

In order to fix this just always disable interrupts while accessing any
of the accounting values.

Reported by: Frans Pop <elendil@planet.nl>
Tested by: Frans Pop <elendil@planet.nl>
Cc: stable@kernel.org
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/tsacct.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 43f891b05a4..00d59d048ed 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -122,8 +122,10 @@ void acct_update_integrals(struct task_struct *tsk)
 	if (likely(tsk->mm)) {
 		cputime_t time, dtime;
 		struct timeval value;
+		unsigned long flags;
 		u64 delta;
 
+		local_irq_save(flags);
 		time = tsk->stime + tsk->utime;
 		dtime = cputime_sub(time, tsk->acct_timexpd);
 		jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
@@ -131,10 +133,12 @@ void acct_update_integrals(struct task_struct *tsk)
 		delta = delta * USEC_PER_SEC + value.tv_usec;
 
 		if (delta == 0)
-			return;
+			goto out;
 		tsk->acct_timexpd = time;
 		tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
 		tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
+	out:
+		local_irq_restore(flags);
 	}
 }
 
-- 
cgit v1.2.3


From 2d5516cbb9daf7d0e342a2e3b0fc6f8c39a81205 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 2 Mar 2009 22:58:45 +0100
Subject: copy_process: fix CLONE_PARENT && parent_exec_id interaction

CLONE_PARENT can fool the ->self_exec_id/parent_exec_id logic. If we
re-use the old parent, we must also re-use ->parent_exec_id to make
sure exit_notify() sees the right ->xxx_exec_id's when the CLONE_PARENT'ed
task exits.

Also, move down the "p->parent_exec_id = p->self_exec_id" thing, to place
two different cases together.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Serge E. Hallyn <serge@hallyn.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index a66fbde2071..4854c2c4a82 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1179,10 +1179,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #endif
 	clear_all_latency_tracing(p);
 
-	/* Our parent execution domain becomes current domain
-	   These must match for thread signalling to apply */
-	p->parent_exec_id = p->self_exec_id;
-
 	/* ok, now we should be set up.. */
 	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
 	p->pdeath_signal = 0;
@@ -1220,10 +1216,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		set_task_cpu(p, smp_processor_id());
 
 	/* CLONE_PARENT re-uses the old parent */
-	if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
+	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
 		p->real_parent = current->real_parent;
-	else
+		p->parent_exec_id = current->parent_exec_id;
+	} else {
 		p->real_parent = current;
+		p->parent_exec_id = current->self_exec_id;
+	}
 
 	spin_lock(&current->sighand->siglock);
 
-- 
cgit v1.2.3


From be50b8342dead8cacf57d4839240106b225d31f5 Mon Sep 17 00:00:00 2001
From: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Date: Tue, 10 Mar 2009 12:55:56 -0700
Subject: kernel/user.c: fix a memory leak when freeing up non-init
 usernamespaces users

We were returning early in the sysfs directory cleanup function if the
user belonged to a non init usernamespace.  Due to this a lot of the
cleanup was not done and we were left with a leak.  Fix the leak.

Reported-by: Serge Hallyn <serue@linux.vnet.ibm.com>
Signed-off-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Tested-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/user.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user.c b/kernel/user.c
index 6a9b696128c..fbb300e6191 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -286,14 +286,12 @@ int __init uids_sysfs_init(void)
 /* work function to remove sysfs directory for a user and free up
  * corresponding structures.
  */
-static void remove_user_sysfs_dir(struct work_struct *w)
+static void cleanup_user_struct(struct work_struct *w)
 {
 	struct user_struct *up = container_of(w, struct user_struct, work);
 	unsigned long flags;
 	int remove_user = 0;
 
-	if (up->user_ns != &init_user_ns)
-		return;
 	/* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
 	 * atomic.
 	 */
@@ -312,9 +310,11 @@ static void remove_user_sysfs_dir(struct work_struct *w)
 	if (!remove_user)
 		goto done;
 
-	kobject_uevent(&up->kobj, KOBJ_REMOVE);
-	kobject_del(&up->kobj);
-	kobject_put(&up->kobj);
+	if (up->user_ns == &init_user_ns) {
+		kobject_uevent(&up->kobj, KOBJ_REMOVE);
+		kobject_del(&up->kobj);
+		kobject_put(&up->kobj);
+	}
 
 	sched_destroy_user(up);
 	key_put(up->uid_keyring);
@@ -335,7 +335,7 @@ static void free_user(struct user_struct *up, unsigned long flags)
 	atomic_inc(&up->__count);
 	spin_unlock_irqrestore(&uidhash_lock, flags);
 
-	INIT_WORK(&up->work, remove_user_sysfs_dir);
+	INIT_WORK(&up->work, cleanup_user_struct);
 	schedule_work(&up->work);
 }
 
-- 
cgit v1.2.3


From 6e2b75740bed35df98b8113300579e13ed2ce848 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 16 Mar 2009 18:13:36 -0400
Subject: module: fix refptr allocation and release order

Impact: fix ref-after-free crash on failed module load

Fix refptr bug: Change refptr allocation and release order not to access a module
data structure pointed by 'mod' after freeing mod->module_core.
This bug will cause kernel panic(e.g. failed to find undefined symbols).

This bug was reported on systemtap bugzilla.
http://sources.redhat.com/bugzilla/show_bug.cgi?id=9927

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index ba22484a987..1196f5d1170 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2015,14 +2015,6 @@ static noinline struct module *load_module(void __user *umod,
 	if (err < 0)
 		goto free_mod;
 
-#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
-	mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
-				      mod->name);
-	if (!mod->refptr) {
-		err = -ENOMEM;
-		goto free_mod;
-	}
-#endif
 	if (pcpuindex) {
 		/* We have a special allocation for this section. */
 		percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
@@ -2030,7 +2022,7 @@ static noinline struct module *load_module(void __user *umod,
 					 mod->name);
 		if (!percpu) {
 			err = -ENOMEM;
-			goto free_percpu;
+			goto free_mod;
 		}
 		sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
 		mod->percpu = percpu;
@@ -2082,6 +2074,14 @@ static noinline struct module *load_module(void __user *umod,
 	/* Module has been moved. */
 	mod = (void *)sechdrs[modindex].sh_addr;
 
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+	mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
+				      mod->name);
+	if (!mod->refptr) {
+		err = -ENOMEM;
+		goto free_init;
+	}
+#endif
 	/* Now we've moved module, initialize linked lists, etc. */
 	module_unload_init(mod);
 
@@ -2288,15 +2288,17 @@ static noinline struct module *load_module(void __user *umod,
 	ftrace_release(mod->module_core, mod->core_size);
  free_unload:
 	module_unload_free(mod);
+ free_init:
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+	percpu_modfree(mod->refptr);
+#endif
 	module_free(mod, mod->module_init);
  free_core:
 	module_free(mod, mod->module_core);
+	/* mod will be freed with core. Don't access it beyond this line! */
  free_percpu:
 	if (percpu)
 		percpu_modfree(percpu);
-#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
-	percpu_modfree(mod->refptr);
-#endif
  free_mod:
 	kfree(args);
  free_hdr:
-- 
cgit v1.2.3


From 53da1d9456fe7f87a920a78fdbdcf1225d197cb7 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Mon, 23 Mar 2009 16:07:24 +0100
Subject: fix ptrace slowness

This patch fixes bug #12208:

  Bug-Entry       : http://bugzilla.kernel.org/show_bug.cgi?id=12208
  Subject         : uml is very slow on 2.6.28 host

This turned out to be not a scheduler regression, but an already
existing problem in ptrace being triggered by subtle scheduler
changes.

The problem is this:

 - task A is ptracing task B
 - task B stops on a trace event
 - task A is woken up and preempts task B
 - task A calls ptrace on task B, which does ptrace_check_attach()
 - this calls wait_task_inactive(), which sees that task B is still on the runq
 - task A goes to sleep for a jiffy
 - ...

Since UML does lots of the above sequences, those jiffies quickly add
up to make it slow as hell.

This patch solves this by not rescheduling in read_unlock() after
ptrace_stop() has woken up the tracer.

Thanks to Oleg Nesterov and Ingo Molnar for the feedback.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
CC: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 2a74fe87c0d..1c8814481a1 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1575,7 +1575,15 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
 	read_lock(&tasklist_lock);
 	if (may_ptrace_stop()) {
 		do_notify_parent_cldstop(current, CLD_TRAPPED);
+		/*
+		 * Don't want to allow preemption here, because
+		 * sys_ptrace() needs this task to be inactive.
+		 *
+		 * XXX: implement read_unlock_no_resched().
+		 */
+		preempt_disable();
 		read_unlock(&tasklist_lock);
+		preempt_enable_no_resched();
 		schedule();
 	} else {
 		/*
-- 
cgit v1.2.3


From e9d376f0fa66bd630fe27403669c6ae6c22a868f Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Thu, 5 Feb 2009 11:51:38 -0500
Subject: dynamic debug: combine dprintk and dynamic printk

This patch combines Greg Bank's dprintk() work with the existing dynamic
printk patchset, we are now calling it 'dynamic debug'.

The new feature of this patchset is a richer /debugfs control file interface,
(an example output from my system is at the bottom), which allows fined grained
control over the the debug output. The output can be controlled by function,
file, module, format string, and line number.

for example, enabled all debug messages in module 'nf_conntrack':

echo -n 'module nf_conntrack +p' > /mnt/debugfs/dynamic_debug/control

to disable them:

echo -n 'module nf_conntrack -p' > /mnt/debugfs/dynamic_debug/control

A further explanation can be found in the documentation patch.

Signed-off-by: Greg Banks <gnb@sgi.com>
Signed-off-by: Jason Baron <jbaron@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/module.c | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 1196f5d1170..77672233387 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -822,7 +822,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 	mutex_lock(&module_mutex);
 	/* Store the name of the last unloaded module for diagnostic purposes */
 	strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
-	unregister_dynamic_debug_module(mod->name);
+	ddebug_remove_module(mod->name);
 	free_module(mod);
 
  out:
@@ -1827,19 +1827,13 @@ static inline void add_kallsyms(struct module *mod,
 }
 #endif /* CONFIG_KALLSYMS */
 
-static void dynamic_printk_setup(struct mod_debug *debug, unsigned int num)
+static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
 {
-#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG
-	unsigned int i;
-
-	for (i = 0; i < num; i++) {
-		register_dynamic_debug_module(debug[i].modname,
-					      debug[i].type,
-					      debug[i].logical_modname,
-					      debug[i].flag_names,
-					      debug[i].hash, debug[i].hash2);
-	}
-#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */
+#ifdef CONFIG_DYNAMIC_DEBUG
+	if (ddebug_add_module(debug, num, debug->modname))
+		printk(KERN_ERR "dynamic debug error adding module: %s\n",
+					debug->modname);
+#endif
 }
 
 static void *module_alloc_update_bounds(unsigned long size)
@@ -2213,12 +2207,13 @@ static noinline struct module *load_module(void __user *umod,
 	add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
 
 	if (!mod->taints) {
-		struct mod_debug *debug;
+		struct _ddebug *debug;
 		unsigned int num_debug;
 
 		debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
 				     sizeof(*debug), &num_debug);
-		dynamic_printk_setup(debug, num_debug);
+		if (debug)
+			dynamic_debug_setup(debug, num_debug);
 	}
 
 	/* sechdrs[0].sh_size is always zero */
-- 
cgit v1.2.3