10 files changed, 127 insertions, 148 deletions
diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index 32b394f3b854..6eb75b80488c 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -103,6 +103,11 @@ static inline void hard_irq_disable(void)
 /* include/linux/interrupt.h needs hard_irq_disable to be a macro */
 #define hard_irq_disable	hard_irq_disable
 
+static inline bool lazy_irq_pending(void)
+{
+	return !!(get_paca()->irq_happened & ~PACA_IRQ_HARD_DIS);
+}
+
 /*
  * This is called by asynchronous interrupts to conditionally
  * re-enable hard interrupts when soft-disabled after having
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index ed1718feb9d9..5971c85df136 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -558,27 +558,54 @@ _GLOBAL(ret_from_except_lite)
 	mtmsrd	r10,1		  /* Update machine state */
 #endif /* CONFIG_PPC_BOOK3E */
 
-#ifdef CONFIG_PREEMPT
 	clrrdi	r9,r1,THREAD_SHIFT	/* current_thread_info() */
-	li	r0,_TIF_NEED_RESCHED	/* bits to check */
 	ld	r3,_MSR(r1)
 	ld	r4,TI_FLAGS(r9)
-	/* Move MSR_PR bit in r3 to _TIF_SIGPENDING position in r0 */
-	rlwimi	r0,r3,32+TIF_SIGPENDING-MSR_PR_LG,_TIF_SIGPENDING
-	and.	r0,r4,r0	/* check NEED_RESCHED and maybe SIGPENDING */
-	bne	do_work
-
-#else /* !CONFIG_PREEMPT */
-	ld	r3,_MSR(r1)	/* Returning to user mode? */
 	andi.	r3,r3,MSR_PR
-	beq	restore		/* if not, just restore regs and return */
+	beq	resume_kernel
 
 	/* Check current_thread_info()->flags */
+	andi.	r0,r4,_TIF_USER_WORK_MASK
+	beq	restore
+
+	andi.	r0,r4,_TIF_NEED_RESCHED
+	beq	1f
+	bl	.restore_interrupts
+	bl	.schedule
+	b	.ret_from_except_lite
+
+1:	bl	.save_nvgprs
+	bl	.restore_interrupts
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.do_notify_resume
+	b	.ret_from_except
+
+resume_kernel:
+#ifdef CONFIG_PREEMPT
+	/* Check if we need to preempt */
+	andi.	r0,r4,_TIF_NEED_RESCHED
+	beq+	restore
+	/* Check that preempt_count() == 0 and interrupts are enabled */
+	lwz	r8,TI_PREEMPT(r9)
+	cmpwi	cr1,r8,0
+	ld	r0,SOFTE(r1)
+	cmpdi	r0,0
+	crandc	eq,cr1*4+eq,eq
+	bne	restore
+
+	/*
+	 * Here we are preempting the current task. We want to make
+	 * sure we are soft-disabled first
+	 */
+	SOFT_DISABLE_INTS(r3,r4)
+1:	bl	.preempt_schedule_irq
+
+	/* Re-test flags and eventually loop */
 	clrrdi	r9,r1,THREAD_SHIFT
 	ld	r4,TI_FLAGS(r9)
-	andi.	r0,r4,_TIF_USER_WORK_MASK
-	bne	do_work
-#endif /* !CONFIG_PREEMPT */
+	andi.	r0,r4,_TIF_NEED_RESCHED
+	bne	1b
+#endif /* CONFIG_PREEMPT */
 
 	.globl	fast_exc_return_irq
 fast_exc_return_irq:
@@ -759,50 +786,6 @@ restore_check_irq_replay:
 #endif /* CONFIG_PPC_BOOK3E */
 1:	b	.ret_from_except /* What else to do here ? */
  
-
-
-3:
-do_work:
-#ifdef CONFIG_PREEMPT
-	andi.	r0,r3,MSR_PR	/* Returning to user mode? */
-	bne	user_work
-	/* Check that preempt_count() == 0 and interrupts are enabled */
-	lwz	r8,TI_PREEMPT(r9)
-	cmpwi	cr1,r8,0
-	ld	r0,SOFTE(r1)
-	cmpdi	r0,0
-	crandc	eq,cr1*4+eq,eq
-	bne	restore
-
-	/*
-	 * Here we are preempting the current task. We want to make
-	 * sure we are soft-disabled first
-	 */
-	SOFT_DISABLE_INTS(r3,r4)
-1:	bl	.preempt_schedule_irq
-
-	/* Re-test flags and eventually loop */
-	clrrdi	r9,r1,THREAD_SHIFT
-	ld	r4,TI_FLAGS(r9)
-	andi.	r0,r4,_TIF_NEED_RESCHED
-	bne	1b
-	b	restore
-
-user_work:
-#endif /* CONFIG_PREEMPT */
-
-	andi.	r0,r4,_TIF_NEED_RESCHED
-	beq	1f
-	bl	.restore_interrupts
-	bl	.schedule
-	b	.ret_from_except_lite
-
-1:	bl	.save_nvgprs
-	bl	.restore_interrupts
-	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	.do_notify_resume
-	b	.ret_from_except
-
 unrecov_restore:
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	.unrecoverable_exception
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 7835a5e1ea5f..1b415027ec0e 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -277,7 +277,7 @@ EXPORT_SYMBOL(arch_local_irq_restore);
  * NOTE: This is called with interrupts hard disabled but not marked
  * as such in paca->irq_happened, so we need to resync this.
  */
-void restore_interrupts(void)
+void notrace restore_interrupts(void)
 {
 	if (irqs_disabled()) {
 		local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 1b488e5305c5..0794a3017b1b 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -1312,7 +1312,7 @@ static struct opal_secondary_data {
 
 extern char opal_secondary_entry;
 
-static void prom_query_opal(void)
+static void __init prom_query_opal(void)
 {
 	long rc;
 
@@ -1436,7 +1436,7 @@ static void __init prom_opal_hold_cpus(void)
 	prom_debug("prom_opal_hold_cpus: end...\n");
 }
 
-static void prom_opal_takeover(void)
+static void __init prom_opal_takeover(void)
 {
 	struct opal_secondary_data *data = &RELOC(opal_secondary_data);
 	struct opal_takeover_args *args = &data->args;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index c6af1d623839..3abe1b86e583 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -268,24 +268,45 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
 	return err;
 }
 
-static void kvmppc_update_vpa(struct kvm *kvm, struct kvmppc_vpa *vpap)
+static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap)
 {
+	struct kvm *kvm = vcpu->kvm;
 	void *va;
 	unsigned long nb;
+	unsigned long gpa;
 
-	vpap->update_pending = 0;
-	va = NULL;
-	if (vpap->next_gpa) {
-		va = kvmppc_pin_guest_page(kvm, vpap->next_gpa, &nb);
-		if (nb < vpap->len) {
-			/*
-			 * If it's now too short, it must be that userspace
-			 * has changed the mappings underlying guest memory,
-			 * so unregister the region.
-			 */
+	/*
+	 * We need to pin the page pointed to by vpap->next_gpa,
+	 * but we can't call kvmppc_pin_guest_page under the lock
+	 * as it does get_user_pages() and down_read().  So we
+	 * have to drop the lock, pin the page, then get the lock
+	 * again and check that a new area didn't get registered
+	 * in the meantime.
+	 */
+	for (;;) {
+		gpa = vpap->next_gpa;
+		spin_unlock(&vcpu->arch.vpa_update_lock);
+		va = NULL;
+		nb = 0;
+		if (gpa)
+			va = kvmppc_pin_guest_page(kvm, vpap->next_gpa, &nb);
+		spin_lock(&vcpu->arch.vpa_update_lock);
+		if (gpa == vpap->next_gpa)
+			break;
+		/* sigh... unpin that one and try again */
+		if (va)
 			kvmppc_unpin_guest_page(kvm, va);
-			va = NULL;
-		}
+	}
+
+	vpap->update_pending = 0;
+	if (va && nb < vpap->len) {
+		/*
+		 * If it's now too short, it must be that userspace
+		 * has changed the mappings underlying guest memory,
+		 * so unregister the region.
+		 */
+		kvmppc_unpin_guest_page(kvm, va);
+		va = NULL;
 	}
 	if (vpap->pinned_addr)
 		kvmppc_unpin_guest_page(kvm, vpap->pinned_addr);
@@ -296,20 +317,18 @@ static void kvmppc_update_vpa(struct kvm *kvm, struct kvmppc_vpa *vpap)
 
 static void kvmppc_update_vpas(struct kvm_vcpu *vcpu)
 {
-	struct kvm *kvm = vcpu->kvm;
-
 	spin_lock(&vcpu->arch.vpa_update_lock);
 	if (vcpu->arch.vpa.update_pending) {
-		kvmppc_update_vpa(kvm, &vcpu->arch.vpa);
+		kvmppc_update_vpa(vcpu, &vcpu->arch.vpa);
 		init_vpa(vcpu, vcpu->arch.vpa.pinned_addr);
 	}
 	if (vcpu->arch.dtl.update_pending) {
-		kvmppc_update_vpa(kvm, &vcpu->arch.dtl);
+		kvmppc_update_vpa(vcpu, &vcpu->arch.dtl);
 		vcpu->arch.dtl_ptr = vcpu->arch.dtl.pinned_addr;
 		vcpu->arch.dtl_index = 0;
 	}
 	if (vcpu->arch.slb_shadow.update_pending)
-		kvmppc_update_vpa(kvm, &vcpu->arch.slb_shadow);
+		kvmppc_update_vpa(vcpu, &vcpu->arch.slb_shadow);
 	spin_unlock(&vcpu->arch.vpa_update_lock);
 }
 
@@ -800,12 +819,39 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
 	struct kvm_vcpu *vcpu, *vcpu0, *vnext;
 	long ret;
 	u64 now;
-	int ptid, i;
+	int ptid, i, need_vpa_update;
 
 	/* don't start if any threads have a signal pending */
-	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+	need_vpa_update = 0;
+	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
 		if (signal_pending(vcpu->arch.run_task))
 			return 0;
+		need_vpa_update |= vcpu->arch.vpa.update_pending |
+			vcpu->arch.slb_shadow.update_pending |
+			vcpu->arch.dtl.update_pending;
+	}
+
+	/*
+	 * Initialize *vc, in particular vc->vcore_state, so we can
+	 * drop the vcore lock if necessary.
+	 */
+	vc->n_woken = 0;
+	vc->nap_count = 0;
+	vc->entry_exit_count = 0;
+	vc->vcore_state = VCORE_RUNNING;
+	vc->in_guest = 0;
+	vc->napping_threads = 0;
+
+	/*
+	 * Updating any of the vpas requires calling kvmppc_pin_guest_page,
+	 * which can't be called with any spinlocks held.
+	 */
+	if (need_vpa_update) {
+		spin_unlock(&vc->lock);
+		list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+			kvmppc_update_vpas(vcpu);
+		spin_lock(&vc->lock);
+	}
 
 	/*
 	 * Make sure we are running on thread 0, and that
@@ -838,20 +884,10 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
 		if (vcpu->arch.ceded)
 			vcpu->arch.ptid = ptid++;
 
-	vc->n_woken = 0;
-	vc->nap_count = 0;
-	vc->entry_exit_count = 0;
-	vc->vcore_state = VCORE_RUNNING;
 	vc->stolen_tb += mftb() - vc->preempt_tb;
-	vc->in_guest = 0;
 	vc->pcpu = smp_processor_id();
-	vc->napping_threads = 0;
 	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
 		kvmppc_start_thread(vcpu);
-		if (vcpu->arch.vpa.update_pending ||
-		    vcpu->arch.slb_shadow.update_pending ||
-		    vcpu->arch.dtl.update_pending)
-			kvmppc_update_vpas(vcpu);
 		kvmppc_create_dtl_entry(vcpu, vc);
 	}
 	/* Grab any remaining hw threads so they can't go into the kernel */
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index b6edbb3b4a54..6e8f677f5646 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -635,7 +635,7 @@ static inline int __init read_usm_ranges(const u32 **usm)
  */
 static void __init parse_drconf_memory(struct device_node *memory)
 {
-	const u32 *dm, *usm;
+	const u32 *uninitialized_var(dm), *usm;
 	unsigned int n, rc, ranges, is_kexec_kdump = 0;
 	unsigned long lmb_size, base, size, sz;
 	int nid;
diff --git a/arch/powerpc/net/bpf_jit_64.S b/arch/powerpc/net/bpf_jit_64.S
index 55ba3855a97f..7d3a3b5619a2 100644
--- a/arch/powerpc/net/bpf_jit_64.S
+++ b/arch/powerpc/net/bpf_jit_64.S
@@ -105,6 +105,7 @@ sk_load_byte_msh_positive_offset:
 	mr	r4, r_addr;					\
 	li	r6, SIZE;					\
 	bl	skb_copy_bits;					\
+	nop;							\
 	/* R3 = 0 on success */					\
 	addi	r1, r1, BPF_PPC_SLOWPATH_FRAME;			\
 	ld	r0, 16(r1);					\
@@ -156,6 +157,7 @@ bpf_slow_path_byte_msh:
 	mr	r4, r_addr;					\
 	li	r5, SIZE;					\
 	bl	bpf_internal_load_pointer_neg_helper;		\
+	nop;							\
 	/* R3 != 0 on success */				\
 	addi	r1, r1, BPF_PPC_SLOWPATH_FRAME;			\
 	ld	r0, 16(r1);					\
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 0915b1ad66ce..2d311c0caf8e 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -106,7 +106,7 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index,
 		tcep++;
 	}
 
-	if (tbl->it_type == TCE_PCI_SWINV_CREATE)
+	if (tbl->it_type & TCE_PCI_SWINV_CREATE)
 		tce_invalidate_pSeries_sw(tbl, tces, tcep - 1);
 	return 0;
 }
@@ -121,7 +121,7 @@ static void tce_free_pSeries(struct iommu_table *tbl, long index, long npages)
 	while (npages--)
 		*(tcep++) = 0;
 
-	if (tbl->it_type == TCE_PCI_SWINV_FREE)
+	if (tbl->it_type & TCE_PCI_SWINV_FREE)
 		tce_invalidate_pSeries_sw(tbl, tces, tcep - 1);
 }
 
diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
index 36f957f31842..8733a86ad52e 100644
--- a/arch/powerpc/platforms/pseries/nvram.c
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -68,9 +68,7 @@ static const char *pseries_nvram_os_partitions[] = {
 };
 
 static void oops_to_nvram(struct kmsg_dumper *dumper,
-		enum kmsg_dump_reason reason,
-		const char *old_msgs, unsigned long old_len,
-		const char *new_msgs, unsigned long new_len);
+			  enum kmsg_dump_reason reason);
 
 static struct kmsg_dumper nvram_kmsg_dumper = {
 	.dump = oops_to_nvram
@@ -504,28 +502,6 @@ int __init pSeries_nvram_init(void)
 }
 
 /*
- * Try to capture the last capture_len bytes of the printk buffer.  Return
- * the amount actually captured.
- */
-static size_t capture_last_msgs(const char *old_msgs, size_t old_len,
-				const char *new_msgs, size_t new_len,
-				char *captured, size_t capture_len)
-{
-	if (new_len >= capture_len) {
-		memcpy(captured, new_msgs + (new_len - capture_len),
-								capture_len);
-		return capture_len;
-	} else {
-		/* Grab the end of old_msgs. */
-		size_t old_tail_len = min(old_len, capture_len - new_len);
-		memcpy(captured, old_msgs + (old_len - old_tail_len),
-								old_tail_len);
-		memcpy(captured + old_tail_len, new_msgs, new_len);
-		return old_tail_len + new_len;
-	}
-}
-
-/*
  * Are we using the ibm,rtas-log for oops/panic reports?  And if so,
  * would logging this oops/panic overwrite an RTAS event that rtas_errd
  * hasn't had a chance to read and process?  Return 1 if so, else 0.
@@ -541,27 +517,6 @@ static int clobbering_unread_rtas_event(void)
 						NVRAM_RTAS_READ_TIMEOUT);
 }
 
-/* Squeeze out each line's <n> severity prefix. */
-static size_t elide_severities(char *buf, size_t len)
-{
-	char *in, *out, *buf_end = buf + len;
-	/* Assume a <n> at the very beginning marks the start of a line. */
-	int newline = 1;
-
-	in = out = buf;
-	while (in < buf_end) {
-		if (newline && in+3 <= buf_end &&
-				*in == '<' && isdigit(in[1]) && in[2] == '>') {
-			in += 3;
-			newline = 0;
-		} else {
-			newline = (*in == '\n');
-			*out++ = *in++;
-		}
-	}
-	return out - buf;
-}
-
 /* Derived from logfs_compress() */
 static int nvram_compress(const void *in, void *out, size_t inlen,
 							size_t outlen)
@@ -619,9 +574,7 @@ static int zip_oops(size_t text_len)
  * partition.  If that's too much, go back and capture uncompressed text.
  */
 static void oops_to_nvram(struct kmsg_dumper *dumper,
-		enum kmsg_dump_reason reason,
-		const char *old_msgs, unsigned long old_len,
-		const char *new_msgs, unsigned long new_len)
+			  enum kmsg_dump_reason reason)
 {
 	static unsigned int oops_count = 0;
 	static bool panicking = false;
@@ -660,14 +613,14 @@ static void oops_to_nvram(struct kmsg_dumper *dumper,
 		return;
 
 	if (big_oops_buf) {
-		text_len = capture_last_msgs(old_msgs, old_len,
-			new_msgs, new_len, big_oops_buf, big_oops_buf_sz);
-		text_len = elide_severities(big_oops_buf, text_len);
+		kmsg_dump_get_buffer(dumper, false,
+				     big_oops_buf, big_oops_buf_sz, &text_len);
 		rc = zip_oops(text_len);
 	}
 	if (rc != 0) {
-		text_len = capture_last_msgs(old_msgs, old_len,
-				new_msgs, new_len, oops_data, oops_data_sz);
+		kmsg_dump_rewind(dumper);
+		kmsg_dump_get_buffer(dumper, true,
+				     oops_data, oops_data_sz, &text_len);
 		err_type = ERR_TYPE_KERNEL_PANIC;
 		*oops_len = (u16) text_len;
 	}
diff --git a/arch/powerpc/platforms/pseries/processor_idle.c b/arch/powerpc/platforms/pseries/processor_idle.c
index 41a34bc4a9a2..e61483e8e960 100644
--- a/arch/powerpc/platforms/pseries/processor_idle.c
+++ b/arch/powerpc/platforms/pseries/processor_idle.c
@@ -106,7 +106,7 @@ static void check_and_cede_processor(void)
 	 * we first hard disable then check.
 	 */
 	hard_irq_disable();
-	if (get_paca()->irq_happened == 0)
+	if (!lazy_irq_pending())
 		cede_processor();
 }