diff options
Diffstat (limited to 'arch')
275 files changed, 6840 insertions, 5760 deletions
diff --git a/arch/alpha/include/asm/cmpxchg.h b/arch/alpha/include/asm/cmpxchg.h index 429e8cd0d78e..e5117766529e 100644 --- a/arch/alpha/include/asm/cmpxchg.h +++ b/arch/alpha/include/asm/cmpxchg.h @@ -66,6 +66,4 @@ #undef __ASM__MB #undef ____cmpxchg -#define __HAVE_ARCH_CMPXCHG 1 - #endif /* _ALPHA_CMPXCHG_H */ diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 9d0ac091a52a..4a905bd667e2 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -23,8 +23,7 @@ #include <linux/smp.h> #include <linux/interrupt.h> #include <linux/module.h> - -#include <asm/uaccess.h> +#include <linux/uaccess.h> extern void die_if_kernel(char *,struct pt_regs *,long, unsigned long *); @@ -107,7 +106,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, /* If we're in an interrupt context, or have no user context, we must not take the fault. */ - if (!mm || in_atomic()) + if (!mm || faulthandler_disabled()) goto no_context; #ifdef CONFIG_ALPHA_LARGE_VMALLOC diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h index 4dc64ddebece..05b5aaf5b0f9 100644 --- a/arch/arc/include/asm/futex.h +++ b/arch/arc/include/asm/futex.h @@ -53,7 +53,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) return -EFAULT; - pagefault_disable(); /* implies preempt_disable() */ + pagefault_disable(); switch (op) { case FUTEX_OP_SET: @@ -75,7 +75,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) ret = -ENOSYS; } - pagefault_enable(); /* subsumes preempt_enable() */ + pagefault_enable(); if (!ret) { switch (cmp) { @@ -104,7 +104,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) return ret; } -/* Compare-xchg with preemption disabled. +/* Compare-xchg with pagefaults disabled. * Notes: * -Best-Effort: Exchg happens only if compare succeeds. * If compare fails, returns; leaving retry/looping to upper layers @@ -121,7 +121,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) return -EFAULT; - pagefault_disable(); /* implies preempt_disable() */ + pagefault_disable(); /* TBD : can use llock/scond */ __asm__ __volatile__( @@ -142,7 +142,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, : "r"(oldval), "r"(newval), "r"(uaddr), "ir"(-EFAULT) : "cc", "memory"); - pagefault_enable(); /* subsumes preempt_enable() */ + pagefault_enable(); *uval = val; return val; diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index 6a2e006cbcce..d948e4e9d89c 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -86,7 +86,7 @@ void do_page_fault(unsigned long address, struct pt_regs *regs) * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (faulthandler_disabled() || !mm) goto no_context; if (user_mode(regs)) diff --git a/arch/arm/include/asm/barrier.h b/arch/arm/include/asm/barrier.h index d2f81e6b8c1c..6c2327e1c732 100644 --- a/arch/arm/include/asm/barrier.h +++ b/arch/arm/include/asm/barrier.h @@ -81,7 +81,7 @@ do { \ #define read_barrier_depends() do { } while(0) #define smp_read_barrier_depends() do { } while(0) -#define set_mb(var, value) do { var = value; smp_mb(); } while (0) +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0) #define smp_mb__before_atomic() smp_mb() #define smp_mb__after_atomic() smp_mb() diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h index 4e78065a16aa..5eed82809d82 100644 --- a/arch/arm/include/asm/futex.h +++ b/arch/arm/include/asm/futex.h @@ -93,6 +93,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; + preempt_disable(); __asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n" "1: " TUSER(ldr) " %1, [%4]\n" " teq %1, %2\n" @@ -104,6 +105,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, : "cc", "memory"); *uval = val; + preempt_enable(); + return ret; } @@ -124,7 +127,10 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; - pagefault_disable(); /* implies preempt_disable() */ +#ifndef CONFIG_SMP + preempt_disable(); +#endif + pagefault_disable(); switch (op) { case FUTEX_OP_SET: @@ -146,7 +152,10 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) ret = -ENOSYS; } - pagefault_enable(); /* subsumes preempt_enable() */ + pagefault_enable(); +#ifndef CONFIG_SMP + preempt_enable(); +#endif if (!ret) { switch (cmp) { diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 6333d9c17875..0d629b8f973f 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -276,7 +276,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (faulthandler_disabled() || !mm) goto no_context; if (user_mode(regs)) diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index b98895d9fe57..ee8dfa793989 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -59,6 +59,7 @@ void *kmap_atomic(struct page *page) void *kmap; int type; + preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) return page_address(page); @@ -121,6 +122,7 @@ void __kunmap_atomic(void *kvaddr) kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); } pagefault_enable(); + preempt_enable(); } EXPORT_SYMBOL(__kunmap_atomic); @@ -130,6 +132,7 @@ void *kmap_atomic_pfn(unsigned long pfn) int idx, type; struct page *page = pfn_to_page(pfn); + preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) return page_address(page); diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h index 71f19c4dc0de..0fa47c4275cb 100644 --- a/arch/arm64/include/asm/barrier.h +++ b/arch/arm64/include/asm/barrier.h @@ -114,7 +114,7 @@ do { \ #define read_barrier_depends() do { } while(0) #define smp_read_barrier_depends() do { } while(0) -#define set_mb(var, value) do { var = value; smp_mb(); } while (0) +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0) #define nop() asm volatile("nop"); #define smp_mb__before_atomic() smp_mb() diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index 5f750dc96e0f..74069b3bd919 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -58,7 +58,7 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; - pagefault_disable(); /* implies preempt_disable() */ + pagefault_disable(); switch (op) { case FUTEX_OP_SET: @@ -85,7 +85,7 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) ret = -ENOSYS; } - pagefault_enable(); /* subsumes preempt_enable() */ + pagefault_enable(); if (!ret) { switch (cmp) { diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 96da13167d4a..0948d327d013 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -211,7 +211,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, * If we're in an interrupt or have no user context, we must not take * the fault. */ - if (in_atomic() || !mm) + if (faulthandler_disabled() || !mm) goto no_context; if (user_mode(regs)) diff --git a/arch/avr32/include/asm/cmpxchg.h b/arch/avr32/include/asm/cmpxchg.h index 962a6aeab787..366bbeaeb405 100644 --- a/arch/avr32/include/asm/cmpxchg.h +++ b/arch/avr32/include/asm/cmpxchg.h @@ -70,8 +70,6 @@ extern unsigned long __cmpxchg_u64_unsupported_on_32bit_kernels( if something tries to do an invalid cmpxchg(). */ extern void __cmpxchg_called_with_bad_pointer(void); -#define __HAVE_ARCH_CMPXCHG 1 - static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size) { diff --git a/arch/avr32/include/asm/uaccess.h b/arch/avr32/include/asm/uaccess.h index a46f7cf3e1ea..68cf638faf48 100644 --- a/arch/avr32/include/asm/uaccess.h +++ b/arch/avr32/include/asm/uaccess.h @@ -97,7 +97,8 @@ static inline __kernel_size_t __copy_from_user(void *to, * @x: Value to copy to user space. * @ptr: Destination address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple value from kernel space to user * space. It supports simple types like char and int, but not larger @@ -116,7 +117,8 @@ static inline __kernel_size_t __copy_from_user(void *to, * @x: Variable to store result. * @ptr: Source address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple variable from user space to kernel * space. It supports simple types like char and int, but not larger @@ -136,7 +138,8 @@ static inline __kernel_size_t __copy_from_user(void *to, * @x: Value to copy to user space. * @ptr: Destination address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple value from kernel space to user * space. It supports simple types like char and int, but not larger @@ -158,7 +161,8 @@ static inline __kernel_size_t __copy_from_user(void *to, * @x: Variable to store result. * @ptr: Source address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple variable from user space to kernel * space. It supports simple types like char and int, but not larger diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c index d223a8b57c1e..c03533937a9f 100644 --- a/arch/avr32/mm/fault.c +++ b/arch/avr32/mm/fault.c @@ -14,11 +14,11 @@ #include <linux/pagemap.h> #include <linux/kdebug.h> #include <linux/kprobes.h> +#include <linux/uaccess.h> #include <asm/mmu_context.h> #include <asm/sysreg.h> #include <asm/tlb.h> -#include <asm/uaccess.h> #ifdef CONFIG_KPROBES static inline int notify_page_fault(struct pt_regs *regs, int trap) @@ -81,7 +81,7 @@ asmlinkage void do_page_fault(unsigned long ecr, struct pt_regs *regs) * If we're in an interrupt or have no user context, we must * not take the fault... */ - if (in_atomic() || !mm || regs->sr & SYSREG_BIT(GM)) + if (faulthandler_disabled() || !mm || regs->sr & SYSREG_BIT(GM)) goto no_context; local_irq_enable(); diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c index 83f12f2ed9e3..3066d40a6db1 100644 --- a/arch/cris/mm/fault.c +++ b/arch/cris/mm/fault.c @@ -8,7 +8,7 @@ #include <linux/interrupt.h> #include <linux/module.h> #include <linux/wait.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <arch/system.h> extern int find_fixup_code(struct pt_regs *); @@ -109,11 +109,11 @@ do_page_fault(unsigned long address, struct pt_regs *regs, info.si_code = SEGV_MAPERR; /* - * If we're in an interrupt or "atomic" operation or have no + * If we're in an interrupt, have pagefaults disabled or have no * user context, we must not take the fault. */ - if (in_atomic() || !mm) + if (faulthandler_disabled() || !mm) goto no_context; if (user_mode(regs)) diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c index ec4917ddf678..61d99767fe16 100644 --- a/arch/frv/mm/fault.c +++ b/arch/frv/mm/fault.c @@ -19,9 +19,9 @@ #include <linux/kernel.h> #include <linux/ptrace.h> #include <linux/hardirq.h> +#include <linux/uaccess.h> #include <asm/pgtable.h> -#include <asm/uaccess.h> #include <asm/gdb-stub.h> /*****************************************************************************/ @@ -78,7 +78,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (faulthandler_disabled() || !mm) goto no_context; if (user_mode(__frame)) diff --git a/arch/frv/mm/highmem.c b/arch/frv/mm/highmem.c index bed9a9bd3c10..785344bbdc07 100644 --- a/arch/frv/mm/highmem.c +++ b/arch/frv/mm/highmem.c @@ -42,6 +42,7 @@ void *kmap_atomic(struct page *page) unsigned long paddr; int type; + preempt_disable(); pagefault_disable(); type = kmap_atomic_idx_push(); paddr = page_to_phys(page); @@ -85,5 +86,6 @@ void __kunmap_atomic(void *kvaddr) } kmap_atomic_idx_pop(); pagefault_enable(); + preempt_enable(); } EXPORT_SYMBOL(__kunmap_atomic); diff --git a/arch/hexagon/include/asm/cmpxchg.h b/arch/hexagon/include/asm/cmpxchg.h index 9e7802911a57..a6e34e2acbba 100644 --- a/arch/hexagon/include/asm/cmpxchg.h +++ b/arch/hexagon/include/asm/cmpxchg.h @@ -64,7 +64,6 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, * looks just like atomic_cmpxchg on our arch currently with a bunch of * variable casting. */ -#define __HAVE_ARCH_CMPXCHG 1 #define cmpxchg(ptr, old, new) \ ({ \ diff --git a/arch/hexagon/include/asm/uaccess.h b/arch/hexagon/include/asm/uaccess.h index e4127e4d6a5b..f000a382bc7f 100644 --- a/arch/hexagon/include/asm/uaccess.h +++ b/arch/hexagon/include/asm/uaccess.h @@ -36,7 +36,8 @@ * @addr: User space pointer to start of block to check * @size: Size of block to check * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Checks if a pointer to a block of memory in user space is valid. * diff --git a/arch/ia64/include/asm/barrier.h b/arch/ia64/include/asm/barrier.h index f6769eb2bbf9..843ba435e43b 100644 --- a/arch/ia64/include/asm/barrier.h +++ b/arch/ia64/include/asm/barrier.h @@ -77,12 +77,7 @@ do { \ ___p1; \ }) -/* - * XXX check on this ---I suspect what Linus really wants here is - * acquire vs release semantics but we can't discuss this stuff with - * Linus just yet. Grrr... - */ -#define set_mb(var, value) do { (var) = (value); mb(); } while (0) +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } while (0) /* * The group barrier in front of the rsm & ssm are necessary to ensure diff --git a/arch/ia64/include/asm/irq_remapping.h b/arch/ia64/include/asm/irq_remapping.h index e3b3556e2e1b..a8687b1d8906 100644 --- a/arch/ia64/include/asm/irq_remapping.h +++ b/arch/ia64/include/asm/irq_remapping.h @@ -1,6 +1,4 @@ #ifndef __IA64_INTR_REMAPPING_H #define __IA64_INTR_REMAPPING_H #define irq_remapping_enabled 0 -#define dmar_alloc_hwirq create_irq -#define dmar_free_hwirq destroy_irq #endif diff --git a/arch/ia64/include/uapi/asm/cmpxchg.h b/arch/ia64/include/uapi/asm/cmpxchg.h index f35109b1d907..a0e3620f8f13 100644 --- a/arch/ia64/include/uapi/asm/cmpxchg.h +++ b/arch/ia64/include/uapi/asm/cmpxchg.h @@ -61,8 +61,6 @@ extern void ia64_xchg_called_with_bad_pointer(void); * indicated by comparing RETURN with OLD. */ -#define __HAVE_ARCH_CMPXCHG 1 - /* * This function doesn't exist, so you'll get a linker error * if something tries to do an invalid cmpxchg(). diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c index 9dd7464f8c17..d70bf15c690a 100644 --- a/arch/ia64/kernel/msi_ia64.c +++ b/arch/ia64/kernel/msi_ia64.c @@ -165,7 +165,7 @@ static struct irq_chip dmar_msi_type = { .irq_retrigger = ia64_msi_retrigger_irq, }; -static int +static void msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) { struct irq_cfg *cfg = irq_cfg + irq; @@ -186,21 +186,29 @@ msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) MSI_DATA_LEVEL_ASSERT | MSI_DATA_DELIVERY_FIXED | MSI_DATA_VECTOR(cfg->vector); - return 0; } -int arch_setup_dmar_msi(unsigned int irq) +int dmar_alloc_hwirq(int id, int node, void *arg) { - int ret; + int irq; struct msi_msg msg; - ret = msi_compose_msg(NULL, irq, &msg); - if (ret < 0) - return ret; - dmar_msi_write(irq, &msg); - irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, - "edge"); - return 0; + irq = create_irq(); + if (irq > 0) { + irq_set_handler_data(irq, arg); + irq_set_chip_and_handler_name(irq, &dmar_msi_type, + handle_edge_irq, "edge"); + msi_compose_msg(NULL, irq, &msg); + dmar_msi_write(irq, &msg); + } + + return irq; +} + +void dmar_free_hwirq(int irq) +{ + irq_set_handler_data(irq, NULL); + destroy_irq(irq); } #endif /* CONFIG_INTEL_IOMMU */ diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index ba5ba7accd0d..70b40d1205a6 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -11,10 +11,10 @@ #include <linux/kprobes.h> #include <linux/kdebug.h> #include <linux/prefetch.h> +#include <linux/uaccess.h> #include <asm/pgtable.h> #include <asm/processor.h> -#include <asm/uaccess.h> extern int die(char *, struct pt_regs *, long); @@ -96,7 +96,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re /* * If we're in an interrupt or have no user context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (faulthandler_disabled() || !mm) goto no_context; #ifdef CONFIG_VIRTUAL_MEM_MAP diff --git a/arch/m32r/include/asm/cmpxchg.h b/arch/m32r/include/asm/cmpxchg.h index de651db20b43..14bf9b739dd2 100644 --- a/arch/m32r/include/asm/cmpxchg.h +++ b/arch/m32r/include/asm/cmpxchg.h @@ -107,8 +107,6 @@ __xchg_local(unsigned long x, volatile void *ptr, int size) ((__typeof__(*(ptr)))__xchg_local((unsigned long)(x), (ptr), \ sizeof(*(ptr)))) -#define __HAVE_ARCH_CMPXCHG 1 - static inline unsigned long __cmpxchg_u32(volatile unsigned int *p, unsigned int old, unsigned int new) { diff --git a/arch/m32r/include/asm/uaccess.h b/arch/m32r/include/asm/uaccess.h index 71adff209405..cac7014daef3 100644 --- a/arch/m32r/include/asm/uaccess.h +++ b/arch/m32r/include/asm/uaccess.h @@ -91,7 +91,8 @@ static inline void set_fs(mm_segment_t s) * @addr: User space pointer to start of block to check * @size: Size of block to check * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Checks if a pointer to a block of memory in user space is valid. * @@ -155,7 +156,8 @@ extern int fixup_exception(struct pt_regs *regs); * @x: Variable to store result. * @ptr: Source address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple variable from user space to kernel * space. It supports simple types like char and int, but not larger @@ -175,7 +177,8 @@ extern int fixup_exception(struct pt_regs *regs); * @x: Value to copy to user space. * @ptr: Destination address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple value from kernel space to user * space. It supports simple types like char and int, but not larger @@ -194,7 +197,8 @@ extern int fixup_exception(struct pt_regs *regs); * @x: Variable to store result. * @ptr: Source address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple variable from user space to kernel * space. It supports simple types like char and int, but not larger @@ -274,7 +278,8 @@ do { \ * @x: Value to copy to user space. * @ptr: Destination address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple value from kernel space to user * space. It supports simple types like char and int, but not larger @@ -568,7 +573,8 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon * @from: Source address, in kernel space. * @n: Number of bytes to copy. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Copy data from kernel space to user space. Caller must check * the specified block with access_ok() before calling this function. @@ -588,7 +594,8 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon * @from: Source address, in kernel space. * @n: Number of bytes to copy. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Copy data from kernel space to user space. * @@ -606,7 +613,8 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon * @from: Source address, in user space. * @n: Number of bytes to copy. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Copy data from user space to kernel space. Caller must check * the specified block with access_ok() before calling this function. @@ -626,7 +634,8 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon * @from: Source address, in user space. * @n: Number of bytes to copy. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Copy data from user space to kernel space. * @@ -677,7 +686,8 @@ unsigned long clear_user(void __user *mem, unsigned long len); * strlen_user: - Get the size of a string in user space. * @str: The string to measure. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Get the size of a NUL-terminated string in user space. * diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c index e3d4d4890104..8f9875b7933d 100644 --- a/arch/m32r/mm/fault.c +++ b/arch/m32r/mm/fault.c @@ -24,9 +24,9 @@ #include <linux/vt_kern.h> /* For unblank_screen() */ #include <linux/highmem.h> #include <linux/module.h> +#include <linux/uaccess.h> #include <asm/m32r.h> -#include <asm/uaccess.h> #include <asm/hardirq.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> @@ -111,10 +111,10 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, mm = tsk->mm; /* - * If we're in an interrupt or have no user context or are running in an - * atomic region then we must not take the fault.. + * If we're in an interrupt or have no user context or have pagefaults + * disabled then we must not take the fault. */ - if (in_atomic() || !mm) + if (faulthandler_disabled() || !mm) goto bad_area_nosemaphore; if (error_code & ACE_USERMODE) diff --git a/arch/m68k/include/asm/cmpxchg.h b/arch/m68k/include/asm/cmpxchg.h index bc755bc620ad..83b1df80f0ac 100644 --- a/arch/m68k/include/asm/cmpxchg.h +++ b/arch/m68k/include/asm/cmpxchg.h @@ -90,7 +90,6 @@ extern unsigned long __invalid_cmpxchg_size(volatile void *, * indicated by comparing RETURN with OLD. */ #ifdef CONFIG_RMW_INSNS -#define __HAVE_ARCH_CMPXCHG 1 static inline unsigned long __cmpxchg(volatile void *p, unsigned long old, unsigned long new, int size) diff --git a/arch/m68k/include/asm/irqflags.h b/arch/m68k/include/asm/irqflags.h index a823cd73dc09..b5941818346f 100644 --- a/arch/m68k/include/asm/irqflags.h +++ b/arch/m68k/include/asm/irqflags.h @@ -2,9 +2,6 @@ #define _M68K_IRQFLAGS_H #include <linux/types.h> -#ifdef CONFIG_MMU -#include <linux/preempt_mask.h> -#endif #include <linux/preempt.h> #include <asm/thread_info.h> #include <asm/entry.h> diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index b2f04aee46ec..6a94cdd0c830 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -10,10 +10,10 @@ #include <linux/ptrace.h> #include <linux/interrupt.h> #include <linux/module.h> +#include <linux/uaccess.h> #include <asm/setup.h> #include <asm/traps.h> -#include <asm/uaccess.h> #include <asm/pgalloc.h> extern void die_if_kernel(char *, struct pt_regs *, long); @@ -81,7 +81,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (faulthandler_disabled() || !mm) goto no_context; if (user_mode(regs)) diff --git a/arch/metag/include/asm/barrier.h b/arch/metag/include/asm/barrier.h index d703d8e26a65..5a696e507930 100644 --- a/arch/metag/include/asm/barrier.h +++ b/arch/metag/include/asm/barrier.h @@ -84,7 +84,7 @@ static inline void fence(void) #define read_barrier_depends() do { } while (0) #define smp_read_barrier_depends() do { } while (0) -#define set_mb(var, value) do { var = value; smp_mb(); } while (0) +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0) #define smp_store_release(p, v) \ do { \ diff --git a/arch/metag/include/asm/cmpxchg.h b/arch/metag/include/asm/cmpxchg.h index b1bc1be8540f..be29e3e44321 100644 --- a/arch/metag/include/asm/cmpxchg.h +++ b/arch/metag/include/asm/cmpxchg.h @@ -51,8 +51,6 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, return old; } -#define __HAVE_ARCH_CMPXCHG 1 - #define cmpxchg(ptr, o, n) \ ({ \ __typeof__(*(ptr)) _o_ = (o); \ diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c index 2de5dc695a87..f57edca63609 100644 --- a/arch/metag/mm/fault.c +++ b/arch/metag/mm/fault.c @@ -105,7 +105,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, mm = tsk->mm; - if (in_atomic() || !mm) + if (faulthandler_disabled() || !mm) goto no_context; if (user_mode(regs)) diff --git a/arch/metag/mm/highmem.c b/arch/metag/mm/highmem.c index d71f621a2c0b..807f1b1c4e65 100644 --- a/arch/metag/mm/highmem.c +++ b/arch/metag/mm/highmem.c @@ -43,7 +43,7 @@ void *kmap_atomic(struct page *page) unsigned long vaddr; int type; - /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) return page_address(page); @@ -82,6 +82,7 @@ void __kunmap_atomic(void *kvaddr) } pagefault_enable(); + preempt_enable(); } EXPORT_SYMBOL(__kunmap_atomic); @@ -95,6 +96,7 @@ void *kmap_atomic_pfn(unsigned long pfn) unsigned long vaddr; int type; + preempt_disable(); pagefault_disable(); type = kmap_atomic_idx_push(); diff --git a/arch/microblaze/include/asm/uaccess.h b/arch/microblaze/include/asm/uaccess.h index 62942fd12672..331b0d35f89c 100644 --- a/arch/microblaze/include/asm/uaccess.h +++ b/arch/microblaze/include/asm/uaccess.h @@ -178,7 +178,8 @@ extern long __user_bad(void); * @x: Variable to store result. * @ptr: Source address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple variable from user space to kernel * space. It supports simple types like char and int, but not larger @@ -290,7 +291,8 @@ extern long __user_bad(void); * @x: Value to copy to user space. * @ptr: Destination address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple value from kernel space to user * space. It supports simple types like char and int, but not larger diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index d46a5ebb7570..177dfc003643 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -107,14 +107,14 @@ void do_page_fault(struct pt_regs *regs, unsigned long address, if ((error_code & 0x13) == 0x13 || (error_code & 0x11) == 0x11) is_write = 0; - if (unlikely(in_atomic() || !mm)) { + if (unlikely(faulthandler_disabled() || !mm)) { if (kernel_mode(regs)) goto bad_area_nosemaphore; - /* in_atomic() in user mode is really bad, + /* faulthandler_disabled() in user mode is really bad, as is current->mm == NULL. */ - pr_emerg("Page fault in user mode with in_atomic(), mm = %p\n", - mm); + pr_emerg("Page fault in user mode with faulthandler_disabled(), mm = %p\n", + mm); pr_emerg("r15 = %lx MSR = %lx\n", regs->r15, regs->msr); die("Weird page fault", regs, SIGSEGV); diff --git a/arch/microblaze/mm/highmem.c b/arch/microblaze/mm/highmem.c index 5a92576fad92..2fcc5a52d84d 100644 --- a/arch/microblaze/mm/highmem.c +++ b/arch/microblaze/mm/highmem.c @@ -37,7 +37,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) unsigned long vaddr; int idx, type; - /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) return page_address(page); @@ -63,6 +63,7 @@ void __kunmap_atomic(void *kvaddr) if (vaddr < __fix_to_virt(FIX_KMAP_END)) { pagefault_enable(); + preempt_enable(); return; } @@ -84,5 +85,6 @@ void __kunmap_atomic(void *kvaddr) #endif kmap_atomic_idx_pop(); pagefault_enable(); + preempt_enable(); } EXPORT_SYMBOL(__kunmap_atomic); diff --git a/arch/mips/include/asm/barrier.h b/arch/mips/include/asm/barrier.h index 2b8bbbcb9be0..7ecba84656d4 100644 --- a/arch/mips/include/asm/barrier.h +++ b/arch/mips/include/asm/barrier.h @@ -112,8 +112,8 @@ #define __WEAK_LLSC_MB " \n" #endif -#define set_mb(var, value) \ - do { var = value; smp_mb(); } while (0) +#define smp_store_mb(var, value) \ + do { WRITE_ONCE(var, value); smp_mb(); } while (0) #define smp_llsc_mb() __asm__ __volatile__(__WEAK_LLSC_MB : : :"memory") diff --git a/arch/mips/include/asm/cmpxchg.h b/arch/mips/include/asm/cmpxchg.h index 412f945f1f5e..b71ab4a5fd50 100644 --- a/arch/mips/include/asm/cmpxchg.h +++ b/arch/mips/include/asm/cmpxchg.h @@ -138,8 +138,6 @@ static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int siz __xchg((unsigned long)(x), (ptr), sizeof(*(ptr)))); \ }) -#define __HAVE_ARCH_CMPXCHG 1 - #define __cmpxchg_asm(ld, st, m, old, new) \ ({ \ __typeof(*(m)) __ret; \ diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h index bf8b32450ef6..9722357d2854 100644 --- a/arch/mips/include/asm/uaccess.h +++ b/arch/mips/include/asm/uaccess.h @@ -103,7 +103,8 @@ extern u64 __ua_limit; * @addr: User space pointer to start of block to check * @size: Size of block to check * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Checks if a pointer to a block of memory in user space is valid. * @@ -138,7 +139,8 @@ extern u64 __ua_limit; * @x: Value to copy to user space. * @ptr: Destination address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple value from kernel space to user * space. It supports simple types like char and int, but not larger @@ -157,7 +159,8 @@ extern u64 __ua_limit; * @x: Variable to store result. * @ptr: Source address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple variable from user space to kernel * space. It supports simple types like char and int, but not larger @@ -177,7 +180,8 @@ extern u64 __ua_limit; * @x: Value to copy to user space. * @ptr: Destination address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple value from kernel space to user * space. It supports simple types like char and int, but not larger @@ -199,7 +203,8 @@ extern u64 __ua_limit; * @x: Variable to store result. * @ptr: Source address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple variable from user space to kernel * space. It supports simple types like char and int, but not larger @@ -498,7 +503,8 @@ extern void __put_user_unknown(void); * @x: Value to copy to user space. * @ptr: Destination address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple value from kernel space to user * space. It supports simple types like char and int, but not larger @@ -517,7 +523,8 @@ extern void __put_user_unknown(void); * @x: Variable to store result. * @ptr: Source address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple variable from user space to kernel * space. It supports simple types like char and int, but not larger @@ -537,7 +544,8 @@ extern void __put_user_unknown(void); * @x: Value to copy to user space. * @ptr: Destination address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple value from kernel space to user * space. It supports simple types like char and int, but not larger @@ -559,7 +567,8 @@ extern void __put_user_unknown(void); * @x: Variable to store result. * @ptr: Source address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple variable from user space to kernel * space. It supports simple types like char and int, but not larger @@ -815,7 +824,8 @@ extern size_t __copy_user(void *__to, const void *__from, size_t __n); * @from: Source address, in kernel space. * @n: Number of bytes to copy. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Copy data from kernel space to user space. Caller must check * the specified block with access_ok() before calling this function. @@ -888,7 +898,8 @@ extern size_t __copy_user_inatomic(void *__to, const void *__from, size_t __n); * @from: Source address, in kernel space. * @n: Number of bytes to copy. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Copy data from kernel space to user space. * @@ -1075,7 +1086,8 @@ extern size_t __copy_in_user_eva(void *__to, const void *__from, size_t __n); * @from: Source address, in user space. * @n: Number of bytes to copy. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Copy data from user space to kernel space. Caller must check * the specified block with access_ok() before calling this function. @@ -1107,7 +1119,8 @@ extern size_t __copy_in_user_eva(void *__to, const void *__from, size_t __n); * @from: Source address, in user space. * @n: Number of bytes to copy. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Copy data from user space to kernel space. * @@ -1329,7 +1342,8 @@ strncpy_from_user(char *__to, const char __user *__from, long __len) * strlen_user: - Get the size of a string in user space. * @str: The string to measure. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Get the size of a NUL-terminated string in user space. * @@ -1398,7 +1412,8 @@ static inline long __strnlen_user(const char __user *s, long n) * strnlen_user: - Get the size of a string in user space. * @str: The string to measure. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Get the size of a NUL-terminated string in user space. * diff --git a/arch/mips/kernel/signal-common.h b/arch/mips/kernel/signal-common.h index 06805e09bcd3..0b85f827cd18 100644 --- a/arch/mips/kernel/signal-common.h +++ b/arch/mips/kernel/signal-common.h @@ -28,12 +28,7 @@ extern void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, extern int fpcsr_pending(unsigned int __user *fpcsr); /* Make sure we will not lose FPU ownership */ -#ifdef CONFIG_PREEMPT -#define lock_fpu_owner() preempt_disable() -#define unlock_fpu_owner() preempt_enable() -#else -#define lock_fpu_owner() pagefault_disable() -#define unlock_fpu_owner() pagefault_enable() -#endif +#define lock_fpu_owner() ({ preempt_disable(); pagefault_disable(); }) +#define unlock_fpu_owner() ({ pagefault_enable(); preempt_enable(); }) #endif /* __SIGNAL_COMMON_H */ diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 7ff8637e530d..36c0f26fac6b 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -21,10 +21,10 @@ #include <linux/module.h> #include <linux/kprobes.h> #include <linux/perf_event.h> +#include <linux/uaccess.h> #include <asm/branch.h> #include <asm/mmu_context.h> -#include <asm/uaccess.h> #include <asm/ptrace.h> #include <asm/highmem.h> /* For VMALLOC_END */ #include <linux/kdebug.h> @@ -94,7 +94,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write, * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (faulthandler_disabled() || !mm) goto bad_area_nosemaphore; if (user_mode(regs)) diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index da815d295239..11661cbc11a8 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -47,7 +47,7 @@ void *kmap_atomic(struct page *page) unsigned long vaddr; int idx, type; - /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) return page_address(page); @@ -72,6 +72,7 @@ void __kunmap_atomic(void *kvaddr) if (vaddr < FIXADDR_START) { // FIXME pagefault_enable(); + preempt_enable(); return; } @@ -92,6 +93,7 @@ void __kunmap_atomic(void *kvaddr) #endif kmap_atomic_idx_pop(); pagefault_enable(); + preempt_enable(); } EXPORT_SYMBOL(__kunmap_atomic); @@ -104,6 +106,7 @@ void *kmap_atomic_pfn(unsigned long pfn) unsigned long vaddr; int idx, type; + preempt_disable(); pagefault_disable(); type = kmap_atomic_idx_push(); diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index faa5c9822ecc..198a3147dd7d 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -90,6 +90,7 @@ static void *__kmap_pgprot(struct page *page, unsigned long addr, pgprot_t prot) BUG_ON(Page_dcache_dirty(page)); + preempt_disable(); pagefault_disable(); idx = (addr >> PAGE_SHIFT) & (FIX_N_COLOURS - 1); idx += in_interrupt() ? FIX_N_COLOURS : 0; @@ -152,6 +153,7 @@ void kunmap_coherent(void) write_c0_entryhi(old_ctx); local_irq_restore(flags); pagefault_enable(); + preempt_enable(); } void copy_user_highpage(struct page *to, struct page *from, diff --git a/arch/mn10300/include/asm/highmem.h b/arch/mn10300/include/asm/highmem.h index 2fbbe4d920aa..1ddea5afba09 100644 --- a/arch/mn10300/include/asm/highmem.h +++ b/arch/mn10300/include/asm/highmem.h @@ -75,6 +75,7 @@ static inline void *kmap_atomic(struct page *page) unsigned long vaddr; int idx, type; + preempt_disable(); pagefault_disable(); if (page < highmem_start_page) return page_address(page); @@ -98,6 +99,7 @@ static inline void __kunmap_atomic(unsigned long vaddr) if (vaddr < FIXADDR_START) { /* FIXME */ pagefault_enable(); + preempt_enable(); return; } @@ -122,6 +124,7 @@ static inline void __kunmap_atomic(unsigned long vaddr) kmap_atomic_idx_pop(); pagefault_enable(); + preempt_enable(); } #endif /* __KERNEL__ */ diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c index 0c2cc5d39c8e..4a1d181ed32f 100644 --- a/arch/mn10300/mm/fault.c +++ b/arch/mn10300/mm/fault.c @@ -23,8 +23,8 @@ #include <linux/interrupt.h> #include <linux/init.h> #include <linux/vt_kern.h> /* For unblank_screen() */ +#include <linux/uaccess.h> -#include <asm/uaccess.h> #include <asm/pgalloc.h> #include <asm/hardirq.h> #include <asm/cpu-regs.h> @@ -168,7 +168,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long fault_code, * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (faulthandler_disabled() || !mm) goto no_context; if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR) diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c index 0c9b6afe69e9..b51878b0c6b8 100644 --- a/arch/nios2/mm/fault.c +++ b/arch/nios2/mm/fault.c @@ -77,7 +77,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause, * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (faulthandler_disabled() || !mm) goto bad_area_nosemaphore; if (user_mode(regs)) diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index de65f66ea64e..ec2df4bab302 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -142,6 +142,7 @@ static inline void kunmap(struct page *page) static inline void *kmap_atomic(struct page *page) { + preempt_disable(); pagefault_disable(); return page_address(page); } @@ -150,6 +151,7 @@ static inline void __kunmap_atomic(void *addr) { flush_kernel_dcache_page_addr(addr); pagefault_enable(); + preempt_enable(); } #define kmap_atomic_prot(page, prot) kmap_atomic(page) diff --git a/arch/parisc/include/asm/cmpxchg.h b/arch/parisc/include/asm/cmpxchg.h index dbd13354ec41..0a90b965cccb 100644 --- a/arch/parisc/include/asm/cmpxchg.h +++ b/arch/parisc/include/asm/cmpxchg.h @@ -46,8 +46,6 @@ __xchg(unsigned long x, __volatile__ void *ptr, int size) #define xchg(ptr, x) \ ((__typeof__(*(ptr)))__xchg((unsigned long)(x), (ptr), sizeof(*(ptr)))) -#define __HAVE_ARCH_CMPXCHG 1 - /* bug catcher for when unsupported size is used - won't link */ extern void __cmpxchg_called_with_bad_pointer(void); diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index 47ee620d15d2..6548fd1d2e62 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -26,9 +26,9 @@ #include <linux/console.h> #include <linux/bug.h> #include <linux/ratelimit.h> +#include <linux/uaccess.h> #include <asm/assembly.h> -#include <asm/uaccess.h> #include <asm/io.h> #include <asm/irq.h> #include <asm/traps.h> @@ -800,7 +800,7 @@ void notrace handle_interruption(int code, struct pt_regs *regs) * unless pagefault_disable() was called before. */ - if (fault_space == 0 && !in_atomic()) + if (fault_space == 0 && !faulthandler_disabled()) { pdc_chassis_send_status(PDC_CHASSIS_DIRECT_PANIC); parisc_terminate("Kernel Fault", regs, code, fault_address); diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index e5120e653240..15503adddf4f 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -15,8 +15,8 @@ #include <linux/sched.h> #include <linux/interrupt.h> #include <linux/module.h> +#include <linux/uaccess.h> -#include <asm/uaccess.h> #include <asm/traps.h> /* Various important other fields */ @@ -207,7 +207,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long code, int fault; unsigned int flags; - if (in_atomic()) + if (pagefault_disabled()) goto no_context; tsk = current; diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h index a3bf5be111ff..39505d660a70 100644 --- a/arch/powerpc/include/asm/barrier.h +++ b/arch/powerpc/include/asm/barrier.h @@ -34,7 +34,7 @@ #define rmb() __asm__ __volatile__ ("sync" : : : "memory") #define wmb() __asm__ __volatile__ ("sync" : : : "memory") -#define set_mb(var, value) do { var = value; mb(); } while (0) +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } while (0) #ifdef __SUBARCH_HAS_LWSYNC # define SMPWMB LWSYNC diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h index d463c68fe7f0..ad6263cffb0f 100644 --- a/arch/powerpc/include/asm/cmpxchg.h +++ b/arch/powerpc/include/asm/cmpxchg.h @@ -144,7 +144,6 @@ __xchg_local(volatile void *ptr, unsigned long x, unsigned int size) * Compare and exchange - if *p == old, set it to new, * and return the old value of *p. */ -#define __HAVE_ARCH_CMPXCHG 1 static __always_inline unsigned long __cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new) diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c index 3cf529ceec5b..ac93a3bd2730 100644 --- a/arch/powerpc/lib/vmx-helper.c +++ b/arch/powerpc/lib/vmx-helper.c @@ -27,11 +27,11 @@ int enter_vmx_usercopy(void) if (in_interrupt()) return 0; - /* This acts as preempt_disable() as well and will make - * enable_kernel_altivec(). We need to disable page faults - * as they can call schedule and thus make us lose the VMX - * context. So on page faults, we just fail which will cause - * a fallback to the normal non-vmx copy. + preempt_disable(); + /* + * We need to disable page faults as they can call schedule and + * thus make us lose the VMX context. So on page faults, we just + * fail which will cause a fallback to the normal non-vmx copy. */ pagefault_disable(); @@ -47,6 +47,7 @@ int enter_vmx_usercopy(void) int exit_vmx_usercopy(void) { pagefault_enable(); + preempt_enable(); return 0; } diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index b396868d2aa7..6d535973b200 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -33,13 +33,13 @@ #include <linux/ratelimit.h> #include <linux/context_tracking.h> #include <linux/hugetlb.h> +#include <linux/uaccess.h> #include <asm/firmware.h> #include <asm/page.h> #include <asm/pgtable.h> #include <asm/mmu.h> #include <asm/mmu_context.h> -#include <asm/uaccess.h> #include <asm/tlbflush.h> #include <asm/siginfo.h> #include <asm/debug.h> @@ -272,15 +272,16 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, if (!arch_irq_disabled_regs(regs)) local_irq_enable(); - if (in_atomic() || mm == NULL) { + if (faulthandler_disabled() || mm == NULL) { if (!user_mode(regs)) { rc = SIGSEGV; goto bail; } - /* in_atomic() in user mode is really bad, + /* faulthandler_disabled() in user mode is really bad, as is current->mm == NULL. */ printk(KERN_EMERG "Page fault in user mode with " - "in_atomic() = %d mm = %p\n", in_atomic(), mm); + "faulthandler_disabled() = %d mm = %p\n", + faulthandler_disabled(), mm); printk(KERN_EMERG "NIP = %lx MSR = %lx\n", regs->nip, regs->msr); die("Weird page fault", regs, SIGSEGV); diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c index e7450bdbe83a..e292c8a60952 100644 --- a/arch/powerpc/mm/highmem.c +++ b/arch/powerpc/mm/highmem.c @@ -34,7 +34,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) unsigned long vaddr; int idx, type; - /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) return page_address(page); @@ -59,6 +59,7 @@ void __kunmap_atomic(void *kvaddr) if (vaddr < __fix_to_virt(FIX_KMAP_END)) { pagefault_enable(); + preempt_enable(); return; } @@ -82,5 +83,6 @@ void __kunmap_atomic(void *kvaddr) kmap_atomic_idx_pop(); pagefault_enable(); + preempt_enable(); } EXPORT_SYMBOL(__kunmap_atomic); diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h index 8d724718ec21..e6f8615a11eb 100644 --- a/arch/s390/include/asm/barrier.h +++ b/arch/s390/include/asm/barrier.h @@ -36,7 +36,7 @@ #define smp_mb__before_atomic() smp_mb() #define smp_mb__after_atomic() smp_mb() -#define set_mb(var, value) do { var = value; mb(); } while (0) +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } while (0) #define smp_store_release(p, v) \ do { \ diff --git a/arch/s390/include/asm/cmpxchg.h b/arch/s390/include/asm/cmpxchg.h index 4eadec466b8c..411464f4c97a 100644 --- a/arch/s390/include/asm/cmpxchg.h +++ b/arch/s390/include/asm/cmpxchg.h @@ -32,8 +32,6 @@ __old; \ }) -#define __HAVE_ARCH_CMPXCHG - #define __cmpxchg_double_op(p1, p2, o1, o2, n1, n2, insn) \ ({ \ register __typeof__(*(p1)) __old1 asm("2") = (o1); \ diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h index 98eb2a579223..dcb6312a0b91 100644 --- a/arch/s390/include/asm/timex.h +++ b/arch/s390/include/asm/timex.h @@ -10,6 +10,7 @@ #define _ASM_S390_TIMEX_H #include <asm/lowcore.h> +#include <linux/time64.h> /* The value of the TOD clock for 1.1.1970. */ #define TOD_UNIX_EPOCH 0x7d91048bca000000ULL @@ -108,10 +109,10 @@ int get_sync_clock(unsigned long long *clock); void init_cpu_timer(void); unsigned long long monotonic_clock(void); -void tod_to_timeval(__u64, struct timespec *); +void tod_to_timeval(__u64 todval, struct timespec64 *xt); static inline -void stck_to_timespec(unsigned long long stck, struct timespec *ts) +void stck_to_timespec64(unsigned long long stck, struct timespec64 *ts) { tod_to_timeval(stck - TOD_UNIX_EPOCH, ts); } diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index d64a7a62164f..9dd4cc47ddc7 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -98,7 +98,8 @@ static inline unsigned long extable_fixup(const struct exception_table_entry *x) * @from: Source address, in user space. * @n: Number of bytes to copy. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Copy data from user space to kernel space. Caller must check * the specified block with access_ok() before calling this function. @@ -118,7 +119,8 @@ unsigned long __must_check __copy_from_user(void *to, const void __user *from, * @from: Source address, in kernel space. * @n: Number of bytes to copy. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Copy data from kernel space to user space. Caller must check * the specified block with access_ok() before calling this function. @@ -264,7 +266,8 @@ int __get_user_bad(void) __attribute__((noreturn)); * @from: Source address, in kernel space. * @n: Number of bytes to copy. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Copy data from kernel space to user space. * @@ -290,7 +293,8 @@ __compiletime_warning("copy_from_user() buffer size is not provably correct") * @from: Source address, in user space. * @n: Number of bytes to copy. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Copy data from user space to kernel space. * @@ -348,7 +352,8 @@ static inline unsigned long strnlen_user(const char __user *src, unsigned long n * strlen_user: - Get the size of a string in user space. * @str: The string to measure. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Get the size of a NUL-terminated string in user space. * diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index c1f21aca76e7..6fca0e46464e 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -1457,23 +1457,24 @@ int debug_dflt_header_fn(debug_info_t * id, struct debug_view *view, int area, debug_entry_t * entry, char *out_buf) { - struct timespec time_spec; + struct timespec64 time_spec; char *except_str; unsigned long caller; int rc = 0; unsigned int level; level = entry->id.fields.level; - stck_to_timespec(entry->id.stck, &time_spec); + stck_to_timespec64(entry->id.stck, &time_spec); if (entry->id.fields.exception) except_str = "*"; else except_str = "-"; caller = ((unsigned long) entry->caller) & PSW_ADDR_INSN; - rc += sprintf(out_buf, "%02i %011lu:%06lu %1u %1s %02i %p ", - area, time_spec.tv_sec, time_spec.tv_nsec / 1000, level, - except_str, entry->id.fields.cpuid, (void *) caller); + rc += sprintf(out_buf, "%02i %011lld:%06lu %1u %1s %02i %p ", + area, (long long)time_spec.tv_sec, + time_spec.tv_nsec / 1000, level, except_str, + entry->id.fields.cpuid, (void *)caller); return rc; } EXPORT_SYMBOL(debug_dflt_header_fn); diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 170ddd2018b3..9e733d965e08 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -76,7 +76,7 @@ unsigned long long monotonic_clock(void) } EXPORT_SYMBOL(monotonic_clock); -void tod_to_timeval(__u64 todval, struct timespec *xt) +void tod_to_timeval(__u64 todval, struct timespec64 *xt) { unsigned long long sec; @@ -181,12 +181,12 @@ static void timing_alert_interrupt(struct ext_code ext_code, static void etr_reset(void); static void stp_reset(void); -void read_persistent_clock(struct timespec *ts) +void read_persistent_clock64(struct timespec64 *ts) { tod_to_timeval(get_tod_clock() - TOD_UNIX_EPOCH, ts); } -void read_boot_clock(struct timespec *ts) +void read_boot_clock64(struct timespec64 *ts) { tod_to_timeval(sched_clock_base_cc - TOD_UNIX_EPOCH, ts); } diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 76515bcea2f1..4c8f5d7f9c23 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -399,7 +399,7 @@ static inline int do_exception(struct pt_regs *regs, int access) * user context. */ fault = VM_FAULT_BADCONTEXT; - if (unlikely(!user_space_fault(regs) || in_atomic() || !mm)) + if (unlikely(!user_space_fault(regs) || faulthandler_disabled() || !mm)) goto out; address = trans_exc_code & __FAIL_ADDR_MASK; diff --git a/arch/score/include/asm/cmpxchg.h b/arch/score/include/asm/cmpxchg.h index f384839c3ee5..cc3f6420b71c 100644 --- a/arch/score/include/asm/cmpxchg.h +++ b/arch/score/include/asm/cmpxchg.h @@ -42,8 +42,6 @@ static inline unsigned long __cmpxchg(volatile unsigned long *m, (unsigned long)(o), \ (unsigned long)(n))) -#define __HAVE_ARCH_CMPXCHG 1 - #include <asm-generic/cmpxchg-local.h> #endif /* _ASM_SCORE_CMPXCHG_H */ diff --git a/arch/score/include/asm/uaccess.h b/arch/score/include/asm/uaccess.h index ab66ddde777b..20a3591225cc 100644 --- a/arch/score/include/asm/uaccess.h +++ b/arch/score/include/asm/uaccess.h @@ -36,7 +36,8 @@ * @addr: User space pointer to start of block to check * @size: Size of block to check * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Checks if a pointer to a block of memory in user space is valid. * @@ -61,7 +62,8 @@ * @x: Value to copy to user space. * @ptr: Destination address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple value from kernel space to user * space. It supports simple types like char and int, but not larger @@ -79,7 +81,8 @@ * @x: Variable to store result. * @ptr: Source address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple variable from user space to kernel * space. It supports simple types like char and int, but not larger @@ -98,7 +101,8 @@ * @x: Value to copy to user space. * @ptr: Destination address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple value from kernel space to user * space. It supports simple types like char and int, but not larger @@ -119,7 +123,8 @@ * @x: Variable to store result. * @ptr: Source address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple variable from user space to kernel * space. It supports simple types like char and int, but not larger diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c index 6860beb2a280..37a6c2e0e969 100644 --- a/arch/score/mm/fault.c +++ b/arch/score/mm/fault.c @@ -34,6 +34,7 @@ #include <linux/string.h> #include <linux/types.h> #include <linux/ptrace.h> +#include <linux/uaccess.h> /* * This routine handles page faults. It determines the address, @@ -73,7 +74,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write, * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (pagefault_disabled() || !mm) goto bad_area_nosemaphore; if (user_mode(regs)) diff --git a/arch/sh/include/asm/barrier.h b/arch/sh/include/asm/barrier.h index 43715308b068..bf91037db4e0 100644 --- a/arch/sh/include/asm/barrier.h +++ b/arch/sh/include/asm/barrier.h @@ -32,7 +32,7 @@ #define ctrl_barrier() __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop") #endif -#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) +#define smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0) #include <asm-generic/barrier.h> diff --git a/arch/sh/include/asm/cmpxchg.h b/arch/sh/include/asm/cmpxchg.h index f6bd1406b897..85c97b188d71 100644 --- a/arch/sh/include/asm/cmpxchg.h +++ b/arch/sh/include/asm/cmpxchg.h @@ -46,8 +46,6 @@ extern void __xchg_called_with_bad_pointer(void); * if something tries to do an invalid cmpxchg(). */ extern void __cmpxchg_called_with_bad_pointer(void); -#define __HAVE_ARCH_CMPXCHG 1 - static inline unsigned long __cmpxchg(volatile void * ptr, unsigned long old, unsigned long new, int size) { diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index a58fec9b55e0..79d8276377d1 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -17,6 +17,7 @@ #include <linux/kprobes.h> #include <linux/perf_event.h> #include <linux/kdebug.h> +#include <linux/uaccess.h> #include <asm/io_trapped.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> @@ -438,9 +439,9 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, /* * If we're in an interrupt, have no user context or are running - * in an atomic region then we must not take the fault: + * with pagefaults disabled then we must not take the fault: */ - if (unlikely(in_atomic() || !mm)) { + if (unlikely(faulthandler_disabled() || !mm)) { bad_area_nosemaphore(regs, error_code, address); return; } diff --git a/arch/sparc/include/asm/barrier_64.h b/arch/sparc/include/asm/barrier_64.h index 76648941fea7..809941e33e12 100644 --- a/arch/sparc/include/asm/barrier_64.h +++ b/arch/sparc/include/asm/barrier_64.h @@ -40,8 +40,8 @@ do { __asm__ __volatile__("ba,pt %%xcc, 1f\n\t" \ #define dma_rmb() rmb() #define dma_wmb() wmb() -#define set_mb(__var, __value) \ - do { __var = __value; membar_safe("#StoreLoad"); } while(0) +#define smp_store_mb(__var, __value) \ + do { WRITE_ONCE(__var, __value); membar_safe("#StoreLoad"); } while(0) #ifdef CONFIG_SMP #define smp_mb() mb() diff --git a/arch/sparc/include/asm/cmpxchg_32.h b/arch/sparc/include/asm/cmpxchg_32.h index d38b52dca216..83ffb83c5397 100644 --- a/arch/sparc/include/asm/cmpxchg_32.h +++ b/arch/sparc/include/asm/cmpxchg_32.h @@ -34,7 +34,6 @@ static inline unsigned long __xchg(unsigned long x, __volatile__ void * ptr, int * * Cribbed from <asm-parisc/atomic.h> */ -#define __HAVE_ARCH_CMPXCHG 1 /* bug catcher for when unsupported size is used - won't link */ void __cmpxchg_called_with_bad_pointer(void); diff --git a/arch/sparc/include/asm/cmpxchg_64.h b/arch/sparc/include/asm/cmpxchg_64.h index 0e1ed6cfbf68..faa2f61058c2 100644 --- a/arch/sparc/include/asm/cmpxchg_64.h +++ b/arch/sparc/include/asm/cmpxchg_64.h @@ -65,8 +65,6 @@ static inline unsigned long __xchg(unsigned long x, __volatile__ void * ptr, #include <asm-generic/cmpxchg-local.h> -#define __HAVE_ARCH_CMPXCHG 1 - static inline unsigned long __cmpxchg_u32(volatile int *m, int old, int new) { diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index 70d817154fe8..c399e7b3b035 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -21,6 +21,7 @@ #include <linux/perf_event.h> #include <linux/interrupt.h> #include <linux/kdebug.h> +#include <linux/uaccess.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -29,7 +30,6 @@ #include <asm/setup.h> #include <asm/smp.h> #include <asm/traps.h> -#include <asm/uaccess.h> #include "mm_32.h" @@ -196,7 +196,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (pagefault_disabled() || !mm) goto no_context; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 479823249429..e9268ea1a68d 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -22,12 +22,12 @@ #include <linux/kdebug.h> #include <linux/percpu.h> #include <linux/context_tracking.h> +#include <linux/uaccess.h> #include <asm/page.h> #include <asm/pgtable.h> #include <asm/openprom.h> #include <asm/oplib.h> -#include <asm/uaccess.h> #include <asm/asi.h> #include <asm/lsu.h> #include <asm/sections.h> @@ -330,7 +330,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (faulthandler_disabled() || !mm) goto intr_or_no_mm; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c index 449f864f0cef..a454ec5ff07a 100644 --- a/arch/sparc/mm/highmem.c +++ b/arch/sparc/mm/highmem.c @@ -53,7 +53,7 @@ void *kmap_atomic(struct page *page) unsigned long vaddr; long idx, type; - /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) return page_address(page); @@ -91,6 +91,7 @@ void __kunmap_atomic(void *kvaddr) if (vaddr < FIXADDR_START) { // FIXME pagefault_enable(); + preempt_enable(); return; } @@ -126,5 +127,6 @@ void __kunmap_atomic(void *kvaddr) kmap_atomic_idx_pop(); pagefault_enable(); + preempt_enable(); } EXPORT_SYMBOL(__kunmap_atomic); diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 4ca0d6ba5ec8..cee9b77ddd05 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2706,7 +2706,7 @@ void hugetlb_setup(struct pt_regs *regs) struct mm_struct *mm = current->mm; struct tsb_config *tp; - if (in_atomic() || !mm) { + if (faulthandler_disabled() || !mm) { const struct exception_table_entry *entry; entry = search_exception_tables(regs->tpc); diff --git a/arch/tile/include/asm/atomic_64.h b/arch/tile/include/asm/atomic_64.h index 7b11c5fadd42..0496970cef82 100644 --- a/arch/tile/include/asm/atomic_64.h +++ b/arch/tile/include/asm/atomic_64.h @@ -105,9 +105,6 @@ static inline long atomic64_add_unless(atomic64_t *v, long a, long u) #define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0) -/* Define this to indicate that cmpxchg is an efficient operation. */ -#define __HAVE_ARCH_CMPXCHG - #endif /* !__ASSEMBLY__ */ #endif /* _ASM_TILE_ATOMIC_64_H */ diff --git a/arch/tile/include/asm/uaccess.h b/arch/tile/include/asm/uaccess.h index d598c11a82d9..0a9c4265763b 100644 --- a/arch/tile/include/asm/uaccess.h +++ b/arch/tile/include/asm/uaccess.h @@ -85,7 +85,8 @@ int __range_ok(unsigned long addr, unsigned long size); * @addr: User space pointer to start of block to check * @size: Size of block to check * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Checks if a pointer to a block of memory in user space is valid. * @@ -199,7 +200,8 @@ extern int __get_user_bad(void) * @x: Variable to store result. * @ptr: Source address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple variable from user space to kernel * space. It supports simple types like char and int, but not larger @@ -281,7 +283,8 @@ extern int __put_user_bad(void) * @x: Value to copy to user space. * @ptr: Destination address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple value from kernel space to user * space. It supports simple types like char and int, but not larger @@ -337,7 +340,8 @@ extern int __put_user_bad(void) * @from: Source address, in kernel space. * @n: Number of bytes to copy. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Copy data from kernel space to user space. Caller must check * the specified block with access_ok() before calling this function. @@ -373,7 +377,8 @@ copy_to_user(void __user *to, const void *from, unsigned long n) * @from: Source address, in user space. * @n: Number of bytes to copy. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Copy data from user space to kernel space. Caller must check * the specified block with access_ok() before calling this function. @@ -444,7 +449,8 @@ static inline unsigned long __must_check copy_from_user(void *to, * @from: Source address, in user space. * @n: Number of bytes to copy. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Copy data from user space to user space. Caller must check * the specified blocks with access_ok() before calling this function. diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index e83cc999da02..3f4f58d34a92 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c @@ -354,9 +354,9 @@ static int handle_page_fault(struct pt_regs *regs, /* * If we're in an interrupt, have no user context or are running in an - * atomic region then we must not take the fault. + * region with pagefaults disabled then we must not take the fault. */ - if (in_atomic() || !mm) { + if (pagefault_disabled() || !mm) { vma = NULL; /* happy compiler */ goto bad_area_nosemaphore; } diff --git a/arch/tile/mm/highmem.c b/arch/tile/mm/highmem.c index 6aa2f2625447..fcd545014e79 100644 --- a/arch/tile/mm/highmem.c +++ b/arch/tile/mm/highmem.c @@ -201,7 +201,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) int idx, type; pte_t *pte; - /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + preempt_disable(); pagefault_disable(); /* Avoid icache flushes by disallowing atomic executable mappings. */ @@ -259,6 +259,7 @@ void __kunmap_atomic(void *kvaddr) } pagefault_enable(); + preempt_enable(); } EXPORT_SYMBOL(__kunmap_atomic); diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 8e4daf44e980..47ff9b7f3e5d 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -7,6 +7,7 @@ #include <linux/sched.h> #include <linux/hardirq.h> #include <linux/module.h> +#include <linux/uaccess.h> #include <asm/current.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -35,10 +36,10 @@ int handle_page_fault(unsigned long address, unsigned long ip, *code_out = SEGV_MAPERR; /* - * If the fault was during atomic operation, don't take the fault, just + * If the fault was with pagefaults disabled, don't take the fault, just * fail. */ - if (in_atomic()) + if (faulthandler_disabled()) goto out_nosemaphore; if (is_user) diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c index 0dc922dba915..afccef5529cc 100644 --- a/arch/unicore32/mm/fault.c +++ b/arch/unicore32/mm/fault.c @@ -218,7 +218,7 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs) * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (faulthandler_disabled() || !mm) goto no_context; if (user_mode(regs)) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 93b592cea001..fb3f245bb717 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -127,7 +127,8 @@ config X86 select MODULES_USE_ELF_RELA if X86_64 select CLONE_BACKWARDS if X86_32 select ARCH_USE_BUILTIN_BSWAP - select ARCH_USE_QUEUE_RWLOCK + select ARCH_USE_QUEUED_SPINLOCKS + select ARCH_USE_QUEUED_RWLOCKS select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION select OLD_SIGACTION if X86_32 select COMPAT_OLD_SIGACTION if IA32_EMULATION @@ -341,7 +342,7 @@ config X86_FEATURE_NAMES config X86_X2APIC bool "Support x2apic" - depends on X86_LOCAL_APIC && X86_64 && IRQ_REMAP + depends on X86_LOCAL_APIC && X86_64 && (IRQ_REMAP || HYPERVISOR_GUEST) ---help--- This enables x2apic support on CPUs that have this feature. @@ -441,6 +442,7 @@ config X86_UV depends on X86_EXTENDED_PLATFORM depends on NUMA depends on X86_X2APIC + depends on PCI ---help--- This option is needed in order to support SGI Ultraviolet systems. If you don't have one of these, you should say N here. @@ -466,7 +468,6 @@ config X86_INTEL_CE select X86_REBOOTFIXUPS select OF select OF_EARLY_FLATTREE - select IRQ_DOMAIN ---help--- Select for the Intel CE media processor (CE4100) SOC. This option compiles in support for the CE4100 SOC for settop @@ -666,7 +667,7 @@ config PARAVIRT_DEBUG config PARAVIRT_SPINLOCKS bool "Paravirtualization layer for spinlocks" depends on PARAVIRT && SMP - select UNINLINE_SPIN_UNLOCK + select UNINLINE_SPIN_UNLOCK if !QUEUED_SPINLOCKS ---help--- Paravirtualized spinlocks allow a pvops backend to replace the spinlock implementation with something virtualization-friendly @@ -851,11 +852,12 @@ config NR_CPUS default "1" if !SMP default "8192" if MAXSMP default "32" if SMP && X86_BIGSMP - default "8" if SMP + default "8" if SMP && X86_32 + default "64" if SMP ---help--- This allows you to specify the maximum number of CPUs which this kernel will support. If CPUMASK_OFFSTACK is enabled, the maximum - supported value is 4096, otherwise the maximum value is 512. The + supported value is 8192, otherwise the maximum value is 512. The minimum value which makes sense is 2. This is purely to save memory - each supported CPU adds @@ -914,12 +916,12 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI - select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ + select IRQ_DOMAIN_HIERARCHY + select PCI_MSI_IRQ_DOMAIN if PCI_MSI config X86_IO_APIC def_bool y depends on X86_LOCAL_APIC || X86_UP_IOAPIC - select IRQ_DOMAIN config X86_REROUTE_FOR_BROKEN_BOOT_IRQS bool "Reroute for broken boot IRQs" diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 72484a645f05..a15893d17c55 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -332,4 +332,27 @@ config X86_DEBUG_STATIC_CPU_HAS If unsure, say N. +config X86_DEBUG_FPU + bool "Debug the x86 FPU code" + depends on DEBUG_KERNEL + default y + ---help--- + If this option is enabled then there will be extra sanity + checks and (boot time) debug printouts added to the kernel. + This debugging adds some small amount of runtime overhead + to the kernel. + + If unsure, say N. + +config PUNIT_ATOM_DEBUG + tristate "ATOM Punit debug driver" + select DEBUG_FS + select IOSF_MBI + ---help--- + This is a debug driver, which gets the power states + of all Punit North Complex devices. The power states of + each device is exposed as part of the debugfs interface. + The current power state can be read from + /sys/kernel/debug/punit_atom/dev_power_state + endmenu diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 2fda005bb334..57996ee840dd 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -77,6 +77,12 @@ else KBUILD_AFLAGS += -m64 KBUILD_CFLAGS += -m64 + # Align jump targets to 1 byte, not the default 16 bytes: + KBUILD_CFLAGS += -falign-jumps=1 + + # Pack loops tightly as well: + KBUILD_CFLAGS += -falign-loops=1 + # Don't autogenerate traditional x87 instructions KBUILD_CFLAGS += $(call cc-option,-mno-80387) KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387) @@ -84,6 +90,9 @@ else # Use -mpreferred-stack-boundary=3 if supported. KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=3) + # Use -mskip-rax-setup if supported. + KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) + # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 2ade24000246..707c58614617 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -32,7 +32,7 @@ #include <crypto/lrw.h> #include <crypto/xts.h> #include <asm/cpu_device_id.h> -#include <asm/i387.h> +#include <asm/fpu/api.h> #include <asm/crypto/aes.h> #include <crypto/ablk_helper.h> #include <crypto/scatterwalk.h> diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index baf0ac21ace5..4c65c70e628b 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -19,8 +19,7 @@ #include <crypto/ctr.h> #include <crypto/lrw.h> #include <crypto/xts.h> -#include <asm/xcr.h> -#include <asm/xsave.h> +#include <asm/fpu/api.h> #include <asm/crypto/camellia.h> #include <asm/crypto/glue_helper.h> @@ -561,16 +560,15 @@ static struct crypto_alg cmll_algs[10] = { { static int __init camellia_aesni_init(void) { - u64 xcr0; + const char *feature_name; if (!cpu_has_avx2 || !cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) { pr_info("AVX2 or AES-NI instructions are not detected.\n"); return -ENODEV; } - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX2 detected but unusable.\n"); + if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 78818a1e73e3..80a0e4389c9a 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -19,8 +19,7 @@ #include <crypto/ctr.h> #include <crypto/lrw.h> #include <crypto/xts.h> -#include <asm/xcr.h> -#include <asm/xsave.h> +#include <asm/fpu/api.h> #include <asm/crypto/camellia.h> #include <asm/crypto/glue_helper.h> @@ -553,16 +552,10 @@ static struct crypto_alg cmll_algs[10] = { { static int __init camellia_aesni_init(void) { - u64 xcr0; + const char *feature_name; - if (!cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) { - pr_info("AVX or AES-NI instructions are not detected.\n"); - return -ENODEV; - } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); + if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index 236c80974457..be00aa48b2b5 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -31,8 +31,7 @@ #include <crypto/cast5.h> #include <crypto/cryptd.h> #include <crypto/ctr.h> -#include <asm/xcr.h> -#include <asm/xsave.h> +#include <asm/fpu/api.h> #include <asm/crypto/glue_helper.h> #define CAST5_PARALLEL_BLOCKS 16 @@ -468,16 +467,10 @@ static struct crypto_alg cast5_algs[6] = { { static int __init cast5_init(void) { - u64 xcr0; + const char *feature_name; - if (!cpu_has_avx || !cpu_has_osxsave) { - pr_info("AVX instructions are not detected.\n"); - return -ENODEV; - } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); + if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index f448810ca4ac..5dbba7224221 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -36,8 +36,7 @@ #include <crypto/ctr.h> #include <crypto/lrw.h> #include <crypto/xts.h> -#include <asm/xcr.h> -#include <asm/xsave.h> +#include <asm/fpu/api.h> #include <asm/crypto/glue_helper.h> #define CAST6_PARALLEL_BLOCKS 8 @@ -590,16 +589,10 @@ static struct crypto_alg cast6_algs[10] = { { static int __init cast6_init(void) { - u64 xcr0; + const char *feature_name; - if (!cpu_has_avx || !cpu_has_osxsave) { - pr_info("AVX instructions are not detected.\n"); - return -ENODEV; - } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); + if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c index 1937fc1d8763..07d2c6c86a54 100644 --- a/arch/x86/crypto/crc32-pclmul_glue.c +++ b/arch/x86/crypto/crc32-pclmul_glue.c @@ -35,7 +35,7 @@ #include <asm/cpufeature.h> #include <asm/cpu_device_id.h> -#include <asm/i387.h> +#include <asm/fpu/api.h> #define CHKSUM_BLOCK_SIZE 1 #define CHKSUM_DIGEST_SIZE 4 diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index 28640c3d6af7..81a595d75cf5 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -32,8 +32,7 @@ #include <asm/cpufeature.h> #include <asm/cpu_device_id.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> +#include <asm/fpu/internal.h> #define CHKSUM_BLOCK_SIZE 1 #define CHKSUM_DIGEST_SIZE 4 diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c index b6c67bf30fdf..a3fcfc97a311 100644 --- a/arch/x86/crypto/crct10dif-pclmul_glue.c +++ b/arch/x86/crypto/crct10dif-pclmul_glue.c @@ -29,7 +29,7 @@ #include <linux/init.h> #include <linux/string.h> #include <linux/kernel.h> -#include <asm/i387.h> +#include <asm/fpu/api.h> #include <asm/cpufeature.h> #include <asm/cpu_device_id.h> diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c index f368ba261739..5a2f30f9f52d 100644 --- a/arch/x86/crypto/fpu.c +++ b/arch/x86/crypto/fpu.c @@ -18,7 +18,7 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/crypto.h> -#include <asm/i387.h> +#include <asm/fpu/api.h> struct crypto_fpu_ctx { struct crypto_blkcipher *child; diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 2079baf06bdd..64d7cf1b50e1 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -19,7 +19,7 @@ #include <crypto/cryptd.h> #include <crypto/gf128mul.h> #include <crypto/internal/hash.h> -#include <asm/i387.h> +#include <asm/fpu/api.h> #include <asm/cpu_device_id.h> #define GHASH_BLOCK_SIZE 16 diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index 2f63dc89e7a9..7d838dc4d888 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -20,8 +20,7 @@ #include <crypto/lrw.h> #include <crypto/xts.h> #include <crypto/serpent.h> -#include <asm/xcr.h> -#include <asm/xsave.h> +#include <asm/fpu/api.h> #include <asm/crypto/serpent-avx.h> #include <asm/crypto/glue_helper.h> @@ -537,16 +536,14 @@ static struct crypto_alg srp_algs[10] = { { static int __init init(void) { - u64 xcr0; + const char *feature_name; if (!cpu_has_avx2 || !cpu_has_osxsave) { pr_info("AVX2 instructions are not detected.\n"); return -ENODEV; } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); + if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index c8d478af8456..da7dafc9b16d 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -36,8 +36,7 @@ #include <crypto/ctr.h> #include <crypto/lrw.h> #include <crypto/xts.h> -#include <asm/xcr.h> -#include <asm/xsave.h> +#include <asm/fpu/api.h> #include <asm/crypto/serpent-avx.h> #include <asm/crypto/glue_helper.h> @@ -596,16 +595,10 @@ static struct crypto_alg serpent_algs[10] = { { static int __init serpent_init(void) { - u64 xcr0; + const char *feature_name; - if (!cpu_has_avx || !cpu_has_osxsave) { - printk(KERN_INFO "AVX instructions are not detected.\n"); - return -ENODEV; - } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - printk(KERN_INFO "AVX detected but unusable.\n"); + if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c index 0cb5149518ce..a841e9765bd6 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb.c +++ b/arch/x86/crypto/sha-mb/sha1_mb.c @@ -65,11 +65,8 @@ #include <crypto/mcryptd.h> #include <crypto/crypto_wq.h> #include <asm/byteorder.h> -#include <asm/i387.h> -#include <asm/xcr.h> -#include <asm/xsave.h> #include <linux/hardirq.h> -#include <asm/fpu-internal.h> +#include <asm/fpu/api.h> #include "sha_mb_ctx.h" #define FLUSH_INTERVAL 1000 /* in usec */ diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 33d1b9dc14cc..7c48e8b20848 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -29,9 +29,7 @@ #include <linux/types.h> #include <crypto/sha.h> #include <crypto/sha1_base.h> -#include <asm/i387.h> -#include <asm/xcr.h> -#include <asm/xsave.h> +#include <asm/fpu/api.h> asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data, @@ -123,15 +121,9 @@ static struct shash_alg alg = { #ifdef CONFIG_AS_AVX static bool __init avx_usable(void) { - u64 xcr0; - - if (!cpu_has_avx || !cpu_has_osxsave) - return false; - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); - + if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) { + if (cpu_has_avx) + pr_info("AVX detected but unusable.\n"); return false; } diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index ccc338881ee8..f8097fc0d1d1 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -37,9 +37,7 @@ #include <linux/types.h> #include <crypto/sha.h> #include <crypto/sha256_base.h> -#include <asm/i387.h> -#include <asm/xcr.h> -#include <asm/xsave.h> +#include <asm/fpu/api.h> #include <linux/string.h> asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data, @@ -132,15 +130,9 @@ static struct shash_alg algs[] = { { #ifdef CONFIG_AS_AVX static bool __init avx_usable(void) { - u64 xcr0; - - if (!cpu_has_avx || !cpu_has_osxsave) - return false; - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); - + if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) { + if (cpu_has_avx) + pr_info("AVX detected but unusable.\n"); return false; } diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index d9fa4c1e063f..2edad7b81870 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -35,9 +35,7 @@ #include <linux/types.h> #include <crypto/sha.h> #include <crypto/sha512_base.h> -#include <asm/i387.h> -#include <asm/xcr.h> -#include <asm/xsave.h> +#include <asm/fpu/api.h> #include <linux/string.h> @@ -131,15 +129,9 @@ static struct shash_alg algs[] = { { #ifdef CONFIG_AS_AVX static bool __init avx_usable(void) { - u64 xcr0; - - if (!cpu_has_avx || !cpu_has_osxsave) - return false; - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); - + if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) { + if (cpu_has_avx) + pr_info("AVX detected but unusable.\n"); return false; } diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index b5e2d5651851..c2bd0ce718ee 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -36,9 +36,7 @@ #include <crypto/ctr.h> #include <crypto/lrw.h> #include <crypto/xts.h> -#include <asm/i387.h> -#include <asm/xcr.h> -#include <asm/xsave.h> +#include <asm/fpu/api.h> #include <asm/crypto/twofish.h> #include <asm/crypto/glue_helper.h> #include <crypto/scatterwalk.h> @@ -558,16 +556,10 @@ static struct crypto_alg twofish_algs[10] = { { static int __init twofish_init(void) { - u64 xcr0; + const char *feature_name; - if (!cpu_has_avx || !cpu_has_osxsave) { - printk(KERN_INFO "AVX instructions are not detected.\n"); - return -ENODEV; - } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - printk(KERN_INFO "AVX detected but unusable.\n"); + if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index c81d35e6c7f1..ae3a29ae875b 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -21,8 +21,8 @@ #include <linux/binfmts.h> #include <asm/ucontext.h> #include <asm/uaccess.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> +#include <asm/fpu/internal.h> +#include <asm/fpu/signal.h> #include <asm/ptrace.h> #include <asm/ia32_unistd.h> #include <asm/user32.h> @@ -198,7 +198,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, buf = compat_ptr(tmp); } get_user_catch(err); - err |= restore_xstate_sig(buf, 1); + err |= fpu__restore_sig(buf, 1); force_iret(); @@ -308,6 +308,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, void __user **fpstate) { + struct fpu *fpu = ¤t->thread.fpu; unsigned long sp; /* Default to using normal stack */ @@ -322,12 +323,12 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, ksig->ka.sa.sa_restorer) sp = (unsigned long) ksig->ka.sa.sa_restorer; - if (used_math()) { + if (fpu->fpstate_active) { unsigned long fx_aligned, math_size; - sp = alloc_mathframe(sp, 1, &fx_aligned, &math_size); + sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size); *fpstate = (struct _fpstate_ia32 __user *) sp; - if (save_xstate_sig(*fpstate, (void __user *)fx_aligned, + if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned, math_size) < 0) return (void __user *) -1L; } diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 72bf2680f819..63450a596800 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -77,12 +77,6 @@ ENTRY(native_usergs_sysret32) swapgs sysretl ENDPROC(native_usergs_sysret32) - -ENTRY(native_irq_enable_sysexit) - swapgs - sti - sysexit -ENDPROC(native_irq_enable_sysexit) #endif /* @@ -119,7 +113,7 @@ ENTRY(ia32_sysenter_target) * it is too small to ever cause noticeable irq latency. */ SWAPGS_UNSAFE_STACK - movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp ENABLE_INTERRUPTS(CLBR_NONE) /* Zero-extending 32-bit regs, do not remove */ @@ -142,7 +136,7 @@ ENTRY(ia32_sysenter_target) pushq_cfi_reg rsi /* pt_regs->si */ pushq_cfi_reg rdx /* pt_regs->dx */ pushq_cfi_reg rcx /* pt_regs->cx */ - pushq_cfi_reg rax /* pt_regs->ax */ + pushq_cfi $-ENOSYS /* pt_regs->ax */ cld sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ CFI_ADJUST_CFA_OFFSET 10*8 @@ -169,8 +163,6 @@ sysenter_flags_fixed: testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) CFI_REMEMBER_STATE jnz sysenter_tracesys - cmpq $(IA32_NR_syscalls-1),%rax - ja ia32_badsys sysenter_do_call: /* 32bit syscall -> 64bit C ABI argument conversion */ movl %edi,%r8d /* arg5 */ @@ -179,8 +171,11 @@ sysenter_do_call: movl %ebx,%edi /* arg1 */ movl %edx,%edx /* arg3 (zero extension) */ sysenter_dispatch: + cmpq $(IA32_NR_syscalls-1),%rax + ja 1f call *ia32_sys_call_table(,%rax,8) movq %rax,RAX(%rsp) +1: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) @@ -247,9 +242,7 @@ sysexit_from_sys_call: movl %ebx,%esi /* 2nd arg: 1st syscall arg */ movl %eax,%edi /* 1st arg: syscall number */ call __audit_syscall_entry - movl RAX(%rsp),%eax /* reload syscall number */ - cmpq $(IA32_NR_syscalls-1),%rax - ja ia32_badsys + movl ORIG_RAX(%rsp),%eax /* reload syscall number */ movl %ebx,%edi /* reload 1st syscall arg */ movl RCX(%rsp),%esi /* reload 2nd syscall arg */ movl RDX(%rsp),%edx /* reload 3rd syscall arg */ @@ -300,13 +293,10 @@ sysenter_tracesys: #endif SAVE_EXTRA_REGS CLEAR_RREGS - movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ RESTORE_EXTRA_REGS - cmpq $(IA32_NR_syscalls-1),%rax - ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ jmp sysenter_do_call CFI_ENDPROC ENDPROC(ia32_sysenter_target) @@ -356,7 +346,7 @@ ENTRY(ia32_cstar_target) SWAPGS_UNSAFE_STACK movl %esp,%r8d CFI_REGISTER rsp,r8 - movq PER_CPU_VAR(kernel_stack),%rsp + movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp ENABLE_INTERRUPTS(CLBR_NONE) /* Zero-extending 32-bit regs, do not remove */ @@ -376,7 +366,7 @@ ENTRY(ia32_cstar_target) pushq_cfi_reg rdx /* pt_regs->dx */ pushq_cfi_reg rbp /* pt_regs->cx */ movl %ebp,%ecx - pushq_cfi_reg rax /* pt_regs->ax */ + pushq_cfi $-ENOSYS /* pt_regs->ax */ sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ CFI_ADJUST_CFA_OFFSET 10*8 @@ -392,8 +382,6 @@ ENTRY(ia32_cstar_target) testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) CFI_REMEMBER_STATE jnz cstar_tracesys - cmpq $IA32_NR_syscalls-1,%rax - ja ia32_badsys cstar_do_call: /* 32bit syscall -> 64bit C ABI argument conversion */ movl %edi,%r8d /* arg5 */ @@ -402,8 +390,11 @@ cstar_do_call: movl %ebx,%edi /* arg1 */ movl %edx,%edx /* arg3 (zero extension) */ cstar_dispatch: + cmpq $(IA32_NR_syscalls-1),%rax + ja 1f call *ia32_sys_call_table(,%rax,8) movq %rax,RAX(%rsp) +1: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) @@ -457,14 +448,11 @@ cstar_tracesys: xchgl %r9d,%ebp SAVE_EXTRA_REGS CLEAR_RREGS r9 - movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter LOAD_ARGS32 1 /* reload args from stack in case ptrace changed it */ RESTORE_EXTRA_REGS xchgl %ebp,%r9d - cmpq $(IA32_NR_syscalls-1),%rax - ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ jmp cstar_do_call END(ia32_cstar_target) @@ -523,7 +511,7 @@ ENTRY(ia32_syscall) pushq_cfi_reg rsi /* pt_regs->si */ pushq_cfi_reg rdx /* pt_regs->dx */ pushq_cfi_reg rcx /* pt_regs->cx */ - pushq_cfi_reg rax /* pt_regs->ax */ + pushq_cfi $-ENOSYS /* pt_regs->ax */ cld sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ CFI_ADJUST_CFA_OFFSET 10*8 @@ -531,8 +519,6 @@ ENTRY(ia32_syscall) orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz ia32_tracesys - cmpq $(IA32_NR_syscalls-1),%rax - ja ia32_badsys ia32_do_call: /* 32bit syscall -> 64bit C ABI argument conversion */ movl %edi,%r8d /* arg5 */ @@ -540,9 +526,12 @@ ia32_do_call: xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ movl %ebx,%edi /* arg1 */ movl %edx,%edx /* arg3 (zero extension) */ + cmpq $(IA32_NR_syscalls-1),%rax + ja 1f call *ia32_sys_call_table(,%rax,8) # xxx: rip relative ia32_sysret: movq %rax,RAX(%rsp) +1: ia32_ret_from_sys_call: CLEAR_RREGS jmp int_ret_from_sys_call @@ -550,23 +539,14 @@ ia32_ret_from_sys_call: ia32_tracesys: SAVE_EXTRA_REGS CLEAR_RREGS - movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ RESTORE_EXTRA_REGS - cmpq $(IA32_NR_syscalls-1),%rax - ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */ jmp ia32_do_call + CFI_ENDPROC END(ia32_syscall) -ia32_badsys: - movq $0,ORIG_RAX(%rsp) - movq $-ENOSYS,%rax - jmp ia32_sysret - - CFI_ENDPROC - .macro PTREGSCALL label, func ALIGN GLOBAL(\label) diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index bdf02eeee765..e7636bac7372 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -18,6 +18,12 @@ .endm #endif +/* + * Issue one struct alt_instr descriptor entry (need to put it into + * the section .altinstructions, see below). This entry contains + * enough information for the alternatives patching code to patch an + * instruction. See apply_alternatives(). + */ .macro altinstruction_entry orig alt feature orig_len alt_len pad_len .long \orig - . .long \alt - . @@ -27,6 +33,12 @@ .byte \pad_len .endm +/* + * Define an alternative between two instructions. If @feature is + * present, early code in apply_alternatives() replaces @oldinstr with + * @newinstr. ".skip" directive takes care of proper instruction padding + * in case @newinstr is longer than @oldinstr. + */ .macro ALTERNATIVE oldinstr, newinstr, feature 140: \oldinstr @@ -55,6 +67,12 @@ */ #define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) + +/* + * Same as ALTERNATIVE macro above but for two alternatives. If CPU + * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has + * @feature2, it replaces @oldinstr with @feature2. + */ .macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 140: \oldinstr diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index ba32af062f61..7bfc85bbb8ff 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -52,6 +52,12 @@ struct alt_instr { u8 padlen; /* length of build-time padding */ } __packed; +/* + * Debug flag that can be tested to see whether alternative + * instructions were patched in already: + */ +extern int alternatives_patched; + extern void alternative_instructions(void); extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index aaac3b2fb746..1a5da2e63aee 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -98,11 +98,22 @@ static inline u16 amd_get_node_id(struct pci_dev *pdev) return 0; } +static inline bool amd_gart_present(void) +{ + /* GART present only on Fam15h, upto model 0fh */ + if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || + (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10)) + return true; + + return false; +} + #else #define amd_nb_num(x) 0 #define amd_nb_has_feature(x) false #define node_to_amd_nb(x) NULL +#define amd_gart_present(x) false #endif diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 976b86a325e5..c8393634ca0c 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -644,6 +644,12 @@ static inline void entering_ack_irq(void) entering_irq(); } +static inline void ipi_entering_ack_irq(void) +{ + ack_APIC_irq(); + irq_enter(); +} + static inline void exiting_irq(void) { irq_exit(); diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 7730c1c5c83a..189679aba703 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -63,6 +63,31 @@ _ASM_ALIGN ; \ _ASM_PTR (entry); \ .popsection + +.macro ALIGN_DESTINATION + /* check for bad alignment of destination */ + movl %edi,%ecx + andl $7,%ecx + jz 102f /* already aligned */ + subl $8,%ecx + negl %ecx + subl %ecx,%edx +100: movb (%rsi),%al +101: movb %al,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz 100b +102: + .section .fixup,"ax" +103: addl %ecx,%edx /* ecx is zerorest also */ + jmp copy_user_handle_tail + .previous + + _ASM_EXTABLE(100b,103b) + _ASM_EXTABLE(101b,103b) + .endm + #else # define _ASM_EXTABLE(from,to) \ " .pushsection \"__ex_table\",\"a\"\n" \ diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 5e5cd123fdfb..e9168955c42f 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -22,7 +22,7 @@ * * Atomically reads the value of @v. */ -static inline int atomic_read(const atomic_t *v) +static __always_inline int atomic_read(const atomic_t *v) { return ACCESS_ONCE((v)->counter); } @@ -34,7 +34,7 @@ static inline int atomic_read(const atomic_t *v) * * Atomically sets the value of @v to @i. */ -static inline void atomic_set(atomic_t *v, int i) +static __always_inline void atomic_set(atomic_t *v, int i) { v->counter = i; } @@ -46,7 +46,7 @@ static inline void atomic_set(atomic_t *v, int i) * * Atomically adds @i to @v. */ -static inline void atomic_add(int i, atomic_t *v) +static __always_inline void atomic_add(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "addl %1,%0" : "+m" (v->counter) @@ -60,7 +60,7 @@ static inline void atomic_add(int i, atomic_t *v) * * Atomically subtracts @i from @v. */ -static inline void atomic_sub(int i, atomic_t *v) +static __always_inline void atomic_sub(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "subl %1,%0" : "+m" (v->counter) @@ -76,7 +76,7 @@ static inline void atomic_sub(int i, atomic_t *v) * true if the result is zero, or false for all * other cases. */ -static inline int atomic_sub_and_test(int i, atomic_t *v) +static __always_inline int atomic_sub_and_test(int i, atomic_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", "e"); } @@ -87,7 +87,7 @@ static inline int atomic_sub_and_test(int i, atomic_t *v) * * Atomically increments @v by 1. */ -static inline void atomic_inc(atomic_t *v) +static __always_inline void atomic_inc(atomic_t *v) { asm volatile(LOCK_PREFIX "incl %0" : "+m" (v->counter)); @@ -99,7 +99,7 @@ static inline void atomic_inc(atomic_t *v) * * Atomically decrements @v by 1. */ -static inline void atomic_dec(atomic_t *v) +static __always_inline void atomic_dec(atomic_t *v) { asm volatile(LOCK_PREFIX "decl %0" : "+m" (v->counter)); @@ -113,7 +113,7 @@ static inline void atomic_dec(atomic_t *v) * returns true if the result is 0, or false for all other * cases. */ -static inline int atomic_dec_and_test(atomic_t *v) +static __always_inline int atomic_dec_and_test(atomic_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e"); } @@ -126,7 +126,7 @@ static inline int atomic_dec_and_test(atomic_t *v) * and returns true if the result is zero, or false for all * other cases. */ -static inline int atomic_inc_and_test(atomic_t *v) +static __always_inline int atomic_inc_and_test(atomic_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", "e"); } @@ -140,7 +140,7 @@ static inline int atomic_inc_and_test(atomic_t *v) * if the result is negative, or false when * result is greater than or equal to zero. */ -static inline int atomic_add_negative(int i, atomic_t *v) +static __always_inline int atomic_add_negative(int i, atomic_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", "s"); } @@ -152,7 +152,7 @@ static inline int atomic_add_negative(int i, atomic_t *v) * * Atomically adds @i to @v and returns @i + @v */ -static inline int atomic_add_return(int i, atomic_t *v) +static __always_inline int atomic_add_return(int i, atomic_t *v) { return i + xadd(&v->counter, i); } @@ -164,7 +164,7 @@ static inline int atomic_add_return(int i, atomic_t *v) * * Atomically subtracts @i from @v and returns @v - @i */ -static inline int atomic_sub_return(int i, atomic_t *v) +static __always_inline int atomic_sub_return(int i, atomic_t *v) { return atomic_add_return(-i, v); } @@ -172,7 +172,7 @@ static inline int atomic_sub_return(int i, atomic_t *v) #define atomic_inc_return(v) (atomic_add_return(1, v)) #define atomic_dec_return(v) (atomic_sub_return(1, v)) -static inline int atomic_cmpxchg(atomic_t *v, int old, int new) +static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) { return cmpxchg(&v->counter, old, new); } @@ -191,7 +191,7 @@ static inline int atomic_xchg(atomic_t *v, int new) * Atomically adds @a to @v, so long as @v was not already @u. * Returns the old value of @v. */ -static inline int __atomic_add_unless(atomic_t *v, int a, int u) +static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); @@ -213,7 +213,7 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u) * Atomically adds 1 to @v * Returns the new value of @u */ -static inline short int atomic_inc_short(short int *v) +static __always_inline short int atomic_inc_short(short int *v) { asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v)); return *v; diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index f8d273e18516..b965f9e03f2a 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -40,7 +40,7 @@ static inline void atomic64_set(atomic64_t *v, long i) * * Atomically adds @i to @v. */ -static inline void atomic64_add(long i, atomic64_t *v) +static __always_inline void atomic64_add(long i, atomic64_t *v) { asm volatile(LOCK_PREFIX "addq %1,%0" : "=m" (v->counter) @@ -81,7 +81,7 @@ static inline int atomic64_sub_and_test(long i, atomic64_t *v) * * Atomically increments @v by 1. */ -static inline void atomic64_inc(atomic64_t *v) +static __always_inline void atomic64_inc(atomic64_t *v) { asm volatile(LOCK_PREFIX "incq %0" : "=m" (v->counter) @@ -94,7 +94,7 @@ static inline void atomic64_inc(atomic64_t *v) * * Atomically decrements @v by 1. */ -static inline void atomic64_dec(atomic64_t *v) +static __always_inline void atomic64_dec(atomic64_t *v) { asm volatile(LOCK_PREFIX "decq %0" : "=m" (v->counter) @@ -148,7 +148,7 @@ static inline int atomic64_add_negative(long i, atomic64_t *v) * * Atomically adds @i to @v and returns @i + @v */ -static inline long atomic64_add_return(long i, atomic64_t *v) +static __always_inline long atomic64_add_return(long i, atomic64_t *v) { return i + xadd(&v->counter, i); } diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 959e45b81fe2..e51a8f803f55 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -35,12 +35,12 @@ #define smp_mb() mb() #define smp_rmb() dma_rmb() #define smp_wmb() barrier() -#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) +#define smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0) #else /* !SMP */ #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() -#define set_mb(var, value) do { var = value; barrier(); } while (0) +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0) #endif /* SMP */ #define read_barrier_depends() do { } while (0) diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 99c105d78b7e..ad19841eddfe 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -4,8 +4,6 @@ #include <linux/compiler.h> #include <asm/alternative.h> /* Provides LOCK_PREFIX */ -#define __HAVE_ARCH_CMPXCHG 1 - /* * Non-existant functions to indicate usage errors at link time * (or compile-time if the compiler implements __compiletime_error(). diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h index 1eef55596e82..03bb1065c335 100644 --- a/arch/x86/include/asm/crypto/glue_helper.h +++ b/arch/x86/include/asm/crypto/glue_helper.h @@ -7,7 +7,7 @@ #include <linux/kernel.h> #include <linux/crypto.h> -#include <asm/i387.h> +#include <asm/fpu/api.h> #include <crypto/b128ops.h> typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src); diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 808dae63eeea..1f5b7287d1ad 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -127,50 +127,14 @@ static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp) #define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL) -static inline void * +void * dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, struct dma_attrs *attrs) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - void *memory; - - gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); - - if (dma_alloc_from_coherent(dev, size, dma_handle, &memory)) - return memory; - - if (!dev) - dev = &x86_dma_fallback_dev; - - if (!is_device_dma_capable(dev)) - return NULL; - - if (!ops->alloc) - return NULL; - - memory = ops->alloc(dev, size, dma_handle, - dma_alloc_coherent_gfp_flags(dev, gfp), attrs); - debug_dma_alloc_coherent(dev, size, *dma_handle, memory); - - return memory; -} + gfp_t gfp, struct dma_attrs *attrs); #define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL) -static inline void dma_free_attrs(struct device *dev, size_t size, - void *vaddr, dma_addr_t bus, - struct dma_attrs *attrs) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - - WARN_ON(irqs_disabled()); /* for portability */ - - if (dma_release_from_coherent(dev, get_order(size), vaddr)) - return; - - debug_dma_free_coherent(dev, size, vaddr, bus); - if (ops->free) - ops->free(dev, size, vaddr, bus, attrs); -} +void dma_free_attrs(struct device *dev, size_t size, + void *vaddr, dma_addr_t bus, + struct dma_attrs *attrs); #endif diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 3738b138b843..155162ea0e00 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_EFI_H #define _ASM_X86_EFI_H -#include <asm/i387.h> +#include <asm/fpu/api.h> #include <asm/pgtable.h> /* diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index dc5fa661465f..df002992d8fd 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -23,6 +23,8 @@ BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) #ifdef CONFIG_HAVE_KVM BUILD_INTERRUPT3(kvm_posted_intr_ipi, POSTED_INTR_VECTOR, smp_kvm_posted_intr_ipi) +BUILD_INTERRUPT3(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR, + smp_kvm_posted_intr_wakeup_ipi) #endif /* @@ -50,4 +52,7 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) #endif +#ifdef CONFIG_X86_MCE_AMD +BUILD_INTERRUPT(deferred_error_interrupt, DEFERRED_ERROR_VECTOR) +#endif #endif diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h deleted file mode 100644 index da5e96756570..000000000000 --- a/arch/x86/include/asm/fpu-internal.h +++ /dev/null @@ -1,626 +0,0 @@ -/* - * Copyright (C) 1994 Linus Torvalds - * - * Pentium III FXSR, SSE support - * General FPU state handling cleanups - * Gareth Hughes <gareth@valinux.com>, May 2000 - * x86-64 work by Andi Kleen 2002 - */ - -#ifndef _FPU_INTERNAL_H -#define _FPU_INTERNAL_H - -#include <linux/kernel_stat.h> -#include <linux/regset.h> -#include <linux/compat.h> -#include <linux/slab.h> -#include <asm/asm.h> -#include <asm/cpufeature.h> -#include <asm/processor.h> -#include <asm/sigcontext.h> -#include <asm/user.h> -#include <asm/uaccess.h> -#include <asm/xsave.h> -#include <asm/smap.h> - -#ifdef CONFIG_X86_64 -# include <asm/sigcontext32.h> -# include <asm/user32.h> -struct ksignal; -int ia32_setup_rt_frame(int sig, struct ksignal *ksig, - compat_sigset_t *set, struct pt_regs *regs); -int ia32_setup_frame(int sig, struct ksignal *ksig, - compat_sigset_t *set, struct pt_regs *regs); -#else -# define user_i387_ia32_struct user_i387_struct -# define user32_fxsr_struct user_fxsr_struct -# define ia32_setup_frame __setup_frame -# define ia32_setup_rt_frame __setup_rt_frame -#endif - -extern unsigned int mxcsr_feature_mask; -extern void fpu_init(void); -extern void eager_fpu_init(void); - -DECLARE_PER_CPU(struct task_struct *, fpu_owner_task); - -extern void convert_from_fxsr(struct user_i387_ia32_struct *env, - struct task_struct *tsk); -extern void convert_to_fxsr(struct task_struct *tsk, - const struct user_i387_ia32_struct *env); - -extern user_regset_active_fn fpregs_active, xfpregs_active; -extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, - xstateregs_get; -extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, - xstateregs_set; - -/* - * xstateregs_active == fpregs_active. Please refer to the comment - * at the definition of fpregs_active. - */ -#define xstateregs_active fpregs_active - -#ifdef CONFIG_MATH_EMULATION -extern void finit_soft_fpu(struct i387_soft_struct *soft); -#else -static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} -#endif - -/* - * Must be run with preemption disabled: this clears the fpu_owner_task, - * on this CPU. - * - * This will disable any lazy FPU state restore of the current FPU state, - * but if the current thread owns the FPU, it will still be saved by. - */ -static inline void __cpu_disable_lazy_restore(unsigned int cpu) -{ - per_cpu(fpu_owner_task, cpu) = NULL; -} - -/* - * Used to indicate that the FPU state in memory is newer than the FPU - * state in registers, and the FPU state should be reloaded next time the - * task is run. Only safe on the current task, or non-running tasks. - */ -static inline void task_disable_lazy_fpu_restore(struct task_struct *tsk) -{ - tsk->thread.fpu.last_cpu = ~0; -} - -static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) -{ - return new == this_cpu_read_stable(fpu_owner_task) && - cpu == new->thread.fpu.last_cpu; -} - -static inline int is_ia32_compat_frame(void) -{ - return config_enabled(CONFIG_IA32_EMULATION) && - test_thread_flag(TIF_IA32); -} - -static inline int is_ia32_frame(void) -{ - return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame(); -} - -static inline int is_x32_frame(void) -{ - return config_enabled(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32); -} - -#define X87_FSW_ES (1 << 7) /* Exception Summary */ - -static __always_inline __pure bool use_eager_fpu(void) -{ - return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); -} - -static __always_inline __pure bool use_xsaveopt(void) -{ - return static_cpu_has_safe(X86_FEATURE_XSAVEOPT); -} - -static __always_inline __pure bool use_xsave(void) -{ - return static_cpu_has_safe(X86_FEATURE_XSAVE); -} - -static __always_inline __pure bool use_fxsr(void) -{ - return static_cpu_has_safe(X86_FEATURE_FXSR); -} - -static inline void fx_finit(struct i387_fxsave_struct *fx) -{ - fx->cwd = 0x37f; - fx->mxcsr = MXCSR_DEFAULT; -} - -extern void __sanitize_i387_state(struct task_struct *); - -static inline void sanitize_i387_state(struct task_struct *tsk) -{ - if (!use_xsaveopt()) - return; - __sanitize_i387_state(tsk); -} - -#define user_insn(insn, output, input...) \ -({ \ - int err; \ - asm volatile(ASM_STAC "\n" \ - "1:" #insn "\n\t" \ - "2: " ASM_CLAC "\n" \ - ".section .fixup,\"ax\"\n" \ - "3: movl $-1,%[err]\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err), output \ - : "0"(0), input); \ - err; \ -}) - -#define check_insn(insn, output, input...) \ -({ \ - int err; \ - asm volatile("1:" #insn "\n\t" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: movl $-1,%[err]\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err), output \ - : "0"(0), input); \ - err; \ -}) - -static inline int fsave_user(struct i387_fsave_struct __user *fx) -{ - return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); -} - -static inline int fxsave_user(struct i387_fxsave_struct __user *fx) -{ - if (config_enabled(CONFIG_X86_32)) - return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); - else if (config_enabled(CONFIG_AS_FXSAVEQ)) - return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); - - /* See comment in fpu_fxsave() below. */ - return user_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx)); -} - -static inline int fxrstor_checking(struct i387_fxsave_struct *fx) -{ - if (config_enabled(CONFIG_X86_32)) - return check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - else if (config_enabled(CONFIG_AS_FXSAVEQ)) - return check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); - - /* See comment in fpu_fxsave() below. */ - return check_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), - "m" (*fx)); -} - -static inline int fxrstor_user(struct i387_fxsave_struct __user *fx) -{ - if (config_enabled(CONFIG_X86_32)) - return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - else if (config_enabled(CONFIG_AS_FXSAVEQ)) - return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); - - /* See comment in fpu_fxsave() below. */ - return user_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), - "m" (*fx)); -} - -static inline int frstor_checking(struct i387_fsave_struct *fx) -{ - return check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline int frstor_user(struct i387_fsave_struct __user *fx) -{ - return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline void fpu_fxsave(struct fpu *fpu) -{ - if (config_enabled(CONFIG_X86_32)) - asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state->fxsave)); - else if (config_enabled(CONFIG_AS_FXSAVEQ)) - asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state->fxsave)); - else { - /* Using "rex64; fxsave %0" is broken because, if the memory - * operand uses any extended registers for addressing, a second - * REX prefix will be generated (to the assembler, rex64 - * followed by semicolon is a separate instruction), and hence - * the 64-bitness is lost. - * - * Using "fxsaveq %0" would be the ideal choice, but is only - * supported starting with gas 2.16. - * - * Using, as a workaround, the properly prefixed form below - * isn't accepted by any binutils version so far released, - * complaining that the same type of prefix is used twice if - * an extended register is needed for addressing (fix submitted - * to mainline 2005-11-21). - * - * asm volatile("rex64/fxsave %0" : "=m" (fpu->state->fxsave)); - * - * This, however, we can work around by forcing the compiler to - * select an addressing mode that doesn't require extended - * registers. - */ - asm volatile( "rex64/fxsave (%[fx])" - : "=m" (fpu->state->fxsave) - : [fx] "R" (&fpu->state->fxsave)); - } -} - -/* - * These must be called with preempt disabled. Returns - * 'true' if the FPU state is still intact. - */ -static inline int fpu_save_init(struct fpu *fpu) -{ - if (use_xsave()) { - fpu_xsave(fpu); - - /* - * xsave header may indicate the init state of the FP. - */ - if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP)) - return 1; - } else if (use_fxsr()) { - fpu_fxsave(fpu); - } else { - asm volatile("fnsave %[fx]; fwait" - : [fx] "=m" (fpu->state->fsave)); - return 0; - } - - /* - * If exceptions are pending, we need to clear them so - * that we don't randomly get exceptions later. - * - * FIXME! Is this perhaps only true for the old-style - * irq13 case? Maybe we could leave the x87 state - * intact otherwise? - */ - if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) { - asm volatile("fnclex"); - return 0; - } - return 1; -} - -static inline int __save_init_fpu(struct task_struct *tsk) -{ - return fpu_save_init(&tsk->thread.fpu); -} - -static inline int fpu_restore_checking(struct fpu *fpu) -{ - if (use_xsave()) - return fpu_xrstor_checking(&fpu->state->xsave); - else if (use_fxsr()) - return fxrstor_checking(&fpu->state->fxsave); - else - return frstor_checking(&fpu->state->fsave); -} - -static inline int restore_fpu_checking(struct task_struct *tsk) -{ - /* - * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is - * pending. Clear the x87 state here by setting it to fixed values. - * "m" is a random variable that should be in L1. - */ - if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) { - asm volatile( - "fnclex\n\t" - "emms\n\t" - "fildl %P[addr]" /* set F?P to defined value */ - : : [addr] "m" (tsk->thread.fpu.has_fpu)); - } - - return fpu_restore_checking(&tsk->thread.fpu); -} - -/* - * Software FPU state helpers. Careful: these need to - * be preemption protection *and* they need to be - * properly paired with the CR0.TS changes! - */ -static inline int __thread_has_fpu(struct task_struct *tsk) -{ - return tsk->thread.fpu.has_fpu; -} - -/* Must be paired with an 'stts' after! */ -static inline void __thread_clear_has_fpu(struct task_struct *tsk) -{ - tsk->thread.fpu.has_fpu = 0; - this_cpu_write(fpu_owner_task, NULL); -} - -/* Must be paired with a 'clts' before! */ -static inline void __thread_set_has_fpu(struct task_struct *tsk) -{ - tsk->thread.fpu.has_fpu = 1; - this_cpu_write(fpu_owner_task, tsk); -} - -/* - * Encapsulate the CR0.TS handling together with the - * software flag. - * - * These generally need preemption protection to work, - * do try to avoid using these on their own. - */ -static inline void __thread_fpu_end(struct task_struct *tsk) -{ - __thread_clear_has_fpu(tsk); - if (!use_eager_fpu()) - stts(); -} - -static inline void __thread_fpu_begin(struct task_struct *tsk) -{ - if (!use_eager_fpu()) - clts(); - __thread_set_has_fpu(tsk); -} - -static inline void drop_fpu(struct task_struct *tsk) -{ - /* - * Forget coprocessor state.. - */ - preempt_disable(); - tsk->thread.fpu_counter = 0; - - if (__thread_has_fpu(tsk)) { - /* Ignore delayed exceptions from user space */ - asm volatile("1: fwait\n" - "2:\n" - _ASM_EXTABLE(1b, 2b)); - __thread_fpu_end(tsk); - } - - clear_stopped_child_used_math(tsk); - preempt_enable(); -} - -static inline void restore_init_xstate(void) -{ - if (use_xsave()) - xrstor_state(init_xstate_buf, -1); - else - fxrstor_checking(&init_xstate_buf->i387); -} - -/* - * Reset the FPU state in the eager case and drop it in the lazy case (later use - * will reinit it). - */ -static inline void fpu_reset_state(struct task_struct *tsk) -{ - if (!use_eager_fpu()) - drop_fpu(tsk); - else - restore_init_xstate(); -} - -/* - * FPU state switching for scheduling. - * - * This is a two-stage process: - * - * - switch_fpu_prepare() saves the old state and - * sets the new state of the CR0.TS bit. This is - * done within the context of the old process. - * - * - switch_fpu_finish() restores the new state as - * necessary. - */ -typedef struct { int preload; } fpu_switch_t; - -static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu) -{ - fpu_switch_t fpu; - - /* - * If the task has used the math, pre-load the FPU on xsave processors - * or if the past 5 consecutive context-switches used math. - */ - fpu.preload = tsk_used_math(new) && - (use_eager_fpu() || new->thread.fpu_counter > 5); - - if (__thread_has_fpu(old)) { - if (!__save_init_fpu(old)) - task_disable_lazy_fpu_restore(old); - else - old->thread.fpu.last_cpu = cpu; - - /* But leave fpu_owner_task! */ - old->thread.fpu.has_fpu = 0; - - /* Don't change CR0.TS if we just switch! */ - if (fpu.preload) { - new->thread.fpu_counter++; - __thread_set_has_fpu(new); - prefetch(new->thread.fpu.state); - } else if (!use_eager_fpu()) - stts(); - } else { - old->thread.fpu_counter = 0; - task_disable_lazy_fpu_restore(old); - if (fpu.preload) { - new->thread.fpu_counter++; - if (fpu_lazy_restore(new, cpu)) - fpu.preload = 0; - else - prefetch(new->thread.fpu.state); - __thread_fpu_begin(new); - } - } - return fpu; -} - -/* - * By the time this gets called, we've already cleared CR0.TS and - * given the process the FPU if we are going to preload the FPU - * state - all we need to do is to conditionally restore the register - * state itself. - */ -static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu) -{ - if (fpu.preload) { - if (unlikely(restore_fpu_checking(new))) - fpu_reset_state(new); - } -} - -/* - * Signal frame handlers... - */ -extern int save_xstate_sig(void __user *buf, void __user *fx, int size); -extern int __restore_xstate_sig(void __user *buf, void __user *fx, int size); - -static inline int xstate_sigframe_size(void) -{ - return use_xsave() ? xstate_size + FP_XSTATE_MAGIC2_SIZE : xstate_size; -} - -static inline int restore_xstate_sig(void __user *buf, int ia32_frame) -{ - void __user *buf_fx = buf; - int size = xstate_sigframe_size(); - - if (ia32_frame && use_fxsr()) { - buf_fx = buf + sizeof(struct i387_fsave_struct); - size += sizeof(struct i387_fsave_struct); - } - - return __restore_xstate_sig(buf, buf_fx, size); -} - -/* - * Needs to be preemption-safe. - * - * NOTE! user_fpu_begin() must be used only immediately before restoring - * the save state. It does not do any saving/restoring on its own. In - * lazy FPU mode, it is just an optimization to avoid a #NM exception, - * the task can lose the FPU right after preempt_enable(). - */ -static inline void user_fpu_begin(void) -{ - preempt_disable(); - if (!user_has_fpu()) - __thread_fpu_begin(current); - preempt_enable(); -} - -static inline void __save_fpu(struct task_struct *tsk) -{ - if (use_xsave()) { - if (unlikely(system_state == SYSTEM_BOOTING)) - xsave_state_booting(&tsk->thread.fpu.state->xsave, -1); - else - xsave_state(&tsk->thread.fpu.state->xsave, -1); - } else - fpu_fxsave(&tsk->thread.fpu); -} - -/* - * i387 state interaction - */ -static inline unsigned short get_fpu_cwd(struct task_struct *tsk) -{ - if (cpu_has_fxsr) { - return tsk->thread.fpu.state->fxsave.cwd; - } else { - return (unsigned short)tsk->thread.fpu.state->fsave.cwd; - } -} - -static inline unsigned short get_fpu_swd(struct task_struct *tsk) -{ - if (cpu_has_fxsr) { - return tsk->thread.fpu.state->fxsave.swd; - } else { - return (unsigned short)tsk->thread.fpu.state->fsave.swd; - } -} - -static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) -{ - if (cpu_has_xmm) { - return tsk->thread.fpu.state->fxsave.mxcsr; - } else { - return MXCSR_DEFAULT; - } -} - -static bool fpu_allocated(struct fpu *fpu) -{ - return fpu->state != NULL; -} - -static inline int fpu_alloc(struct fpu *fpu) -{ - if (fpu_allocated(fpu)) - return 0; - fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL); - if (!fpu->state) - return -ENOMEM; - WARN_ON((unsigned long)fpu->state & 15); - return 0; -} - -static inline void fpu_free(struct fpu *fpu) -{ - if (fpu->state) { - kmem_cache_free(task_xstate_cachep, fpu->state); - fpu->state = NULL; - } -} - -static inline void fpu_copy(struct task_struct *dst, struct task_struct *src) -{ - if (use_eager_fpu()) { - memset(&dst->thread.fpu.state->xsave, 0, xstate_size); - __save_fpu(dst); - } else { - struct fpu *dfpu = &dst->thread.fpu; - struct fpu *sfpu = &src->thread.fpu; - - unlazy_fpu(src); - memcpy(dfpu->state, sfpu->state, xstate_size); - } -} - -static inline unsigned long -alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, - unsigned long *size) -{ - unsigned long frame_size = xstate_sigframe_size(); - - *buf_fx = sp = round_down(sp - frame_size, 64); - if (ia32_frame && use_fxsr()) { - frame_size += sizeof(struct i387_fsave_struct); - sp -= sizeof(struct i387_fsave_struct); - } - - *size = frame_size; - return sp; -} - -#endif diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h new file mode 100644 index 000000000000..1429a7c736db --- /dev/null +++ b/arch/x86/include/asm/fpu/api.h @@ -0,0 +1,48 @@ +/* + * Copyright (C) 1994 Linus Torvalds + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes <gareth@valinux.com>, May 2000 + * x86-64 work by Andi Kleen 2002 + */ + +#ifndef _ASM_X86_FPU_API_H +#define _ASM_X86_FPU_API_H + +/* + * Careful: __kernel_fpu_begin/end() must be called with preempt disabled + * and they don't touch the preempt state on their own. + * If you enable preemption after __kernel_fpu_begin(), preempt notifier + * should call the __kernel_fpu_end() to prevent the kernel/user FPU + * state from getting corrupted. KVM for example uses this model. + * + * All other cases use kernel_fpu_begin/end() which disable preemption + * during kernel FPU usage. + */ +extern void __kernel_fpu_begin(void); +extern void __kernel_fpu_end(void); +extern void kernel_fpu_begin(void); +extern void kernel_fpu_end(void); +extern bool irq_fpu_usable(void); + +/* + * Some instructions like VIA's padlock instructions generate a spurious + * DNA fault but don't modify SSE registers. And these instructions + * get used from interrupt context as well. To prevent these kernel instructions + * in interrupt context interacting wrongly with other user/kernel fpu usage, we + * should use them only in the context of irq_ts_save/restore() + */ +extern int irq_ts_save(void); +extern void irq_ts_restore(int TS_state); + +/* + * Query the presence of one or more xfeatures. Works on any legacy CPU as well. + * + * If 'feature_name' is set then put a human-readable description of + * the feature there as well - this can be used to print error (or success) + * messages. + */ +extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name); + +#endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h new file mode 100644 index 000000000000..99690bed920a --- /dev/null +++ b/arch/x86/include/asm/fpu/internal.h @@ -0,0 +1,689 @@ +/* + * Copyright (C) 1994 Linus Torvalds + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes <gareth@valinux.com>, May 2000 + * x86-64 work by Andi Kleen 2002 + */ + +#ifndef _ASM_X86_FPU_INTERNAL_H +#define _ASM_X86_FPU_INTERNAL_H + +#include <linux/compat.h> +#include <linux/sched.h> +#include <linux/slab.h> + +#include <asm/user.h> +#include <asm/fpu/api.h> +#include <asm/fpu/xstate.h> + +/* + * High level FPU state handling functions: + */ +extern void fpu__activate_curr(struct fpu *fpu); +extern void fpu__activate_stopped(struct fpu *fpu); +extern void fpu__save(struct fpu *fpu); +extern void fpu__restore(struct fpu *fpu); +extern int fpu__restore_sig(void __user *buf, int ia32_frame); +extern void fpu__drop(struct fpu *fpu); +extern int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu); +extern void fpu__clear(struct fpu *fpu); +extern int fpu__exception_code(struct fpu *fpu, int trap_nr); +extern int dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate); + +/* + * Boot time FPU initialization functions: + */ +extern void fpu__init_cpu(void); +extern void fpu__init_system_xstate(void); +extern void fpu__init_cpu_xstate(void); +extern void fpu__init_system(struct cpuinfo_x86 *c); +extern void fpu__init_check_bugs(void); +extern void fpu__resume_cpu(void); + +/* + * Debugging facility: + */ +#ifdef CONFIG_X86_DEBUG_FPU +# define WARN_ON_FPU(x) WARN_ON_ONCE(x) +#else +# define WARN_ON_FPU(x) ({ 0; }) +#endif + +/* + * FPU related CPU feature flag helper routines: + */ +static __always_inline __pure bool use_eager_fpu(void) +{ + return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); +} + +static __always_inline __pure bool use_xsaveopt(void) +{ + return static_cpu_has_safe(X86_FEATURE_XSAVEOPT); +} + +static __always_inline __pure bool use_xsave(void) +{ + return static_cpu_has_safe(X86_FEATURE_XSAVE); +} + +static __always_inline __pure bool use_fxsr(void) +{ + return static_cpu_has_safe(X86_FEATURE_FXSR); +} + +/* + * fpstate handling functions: + */ + +extern union fpregs_state init_fpstate; + +extern void fpstate_init(union fpregs_state *state); +#ifdef CONFIG_MATH_EMULATION +extern void fpstate_init_soft(struct swregs_state *soft); +#else +static inline void fpstate_init_soft(struct swregs_state *soft) {} +#endif +static inline void fpstate_init_fxstate(struct fxregs_state *fx) +{ + fx->cwd = 0x37f; + fx->mxcsr = MXCSR_DEFAULT; +} +extern void fpstate_sanitize_xstate(struct fpu *fpu); + +#define user_insn(insn, output, input...) \ +({ \ + int err; \ + asm volatile(ASM_STAC "\n" \ + "1:" #insn "\n\t" \ + "2: " ASM_CLAC "\n" \ + ".section .fixup,\"ax\"\n" \ + "3: movl $-1,%[err]\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [err] "=r" (err), output \ + : "0"(0), input); \ + err; \ +}) + +#define check_insn(insn, output, input...) \ +({ \ + int err; \ + asm volatile("1:" #insn "\n\t" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: movl $-1,%[err]\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [err] "=r" (err), output \ + : "0"(0), input); \ + err; \ +}) + +static inline int copy_fregs_to_user(struct fregs_state __user *fx) +{ + return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); +} + +static inline int copy_fxregs_to_user(struct fxregs_state __user *fx) +{ + if (config_enabled(CONFIG_X86_32)) + return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); + else if (config_enabled(CONFIG_AS_FXSAVEQ)) + return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); + + /* See comment in copy_fxregs_to_kernel() below. */ + return user_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx)); +} + +static inline int copy_kernel_to_fxregs(struct fxregs_state *fx) +{ + if (config_enabled(CONFIG_X86_32)) + return check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else if (config_enabled(CONFIG_AS_FXSAVEQ)) + return check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); + + /* See comment in copy_fxregs_to_kernel() below. */ + return check_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), + "m" (*fx)); +} + +static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) +{ + if (config_enabled(CONFIG_X86_32)) + return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else if (config_enabled(CONFIG_AS_FXSAVEQ)) + return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); + + /* See comment in copy_fxregs_to_kernel() below. */ + return user_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), + "m" (*fx)); +} + +static inline int copy_kernel_to_fregs(struct fregs_state *fx) +{ + return check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int copy_user_to_fregs(struct fregs_state __user *fx) +{ + return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline void copy_fxregs_to_kernel(struct fpu *fpu) +{ + if (config_enabled(CONFIG_X86_32)) + asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave)); + else if (config_enabled(CONFIG_AS_FXSAVEQ)) + asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave)); + else { + /* Using "rex64; fxsave %0" is broken because, if the memory + * operand uses any extended registers for addressing, a second + * REX prefix will be generated (to the assembler, rex64 + * followed by semicolon is a separate instruction), and hence + * the 64-bitness is lost. + * + * Using "fxsaveq %0" would be the ideal choice, but is only + * supported starting with gas 2.16. + * + * Using, as a workaround, the properly prefixed form below + * isn't accepted by any binutils version so far released, + * complaining that the same type of prefix is used twice if + * an extended register is needed for addressing (fix submitted + * to mainline 2005-11-21). + * + * asm volatile("rex64/fxsave %0" : "=m" (fpu->state.fxsave)); + * + * This, however, we can work around by forcing the compiler to + * select an addressing mode that doesn't require extended + * registers. + */ + asm volatile( "rex64/fxsave (%[fx])" + : "=m" (fpu->state.fxsave) + : [fx] "R" (&fpu->state.fxsave)); + } +} + +/* These macros all use (%edi)/(%rdi) as the single memory argument. */ +#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" +#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" +#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" +#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" +#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" + +/* xstate instruction fault handler: */ +#define xstate_fault(__err) \ + \ + ".section .fixup,\"ax\"\n" \ + \ + "3: movl $-2,%[_err]\n" \ + " jmp 2b\n" \ + \ + ".previous\n" \ + \ + _ASM_EXTABLE(1b, 3b) \ + : [_err] "=r" (__err) + +/* + * This function is called only during boot time when x86 caps are not set + * up and alternative can not be used yet. + */ +static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate) +{ + u64 mask = -1; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + WARN_ON(system_state != SYSTEM_BOOTING); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + asm volatile("1:"XSAVES"\n\t" + "2:\n\t" + xstate_fault(err) + : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err) + : "memory"); + else + asm volatile("1:"XSAVE"\n\t" + "2:\n\t" + xstate_fault(err) + : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err) + : "memory"); + + /* We should never fault when copying to a kernel buffer: */ + WARN_ON_FPU(err); +} + +/* + * This function is called only during boot time when x86 caps are not set + * up and alternative can not be used yet. + */ +static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + WARN_ON(system_state != SYSTEM_BOOTING); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + asm volatile("1:"XRSTORS"\n\t" + "2:\n\t" + xstate_fault(err) + : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err) + : "memory"); + else + asm volatile("1:"XRSTOR"\n\t" + "2:\n\t" + xstate_fault(err) + : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err) + : "memory"); + + /* We should never fault when copying from a kernel buffer: */ + WARN_ON_FPU(err); +} + +/* + * Save processor xstate to xsave area. + */ +static inline void copy_xregs_to_kernel(struct xregs_state *xstate) +{ + u64 mask = -1; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + WARN_ON(!alternatives_patched); + + /* + * If xsaves is enabled, xsaves replaces xsaveopt because + * it supports compact format and supervisor states in addition to + * modified optimization in xsaveopt. + * + * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave + * because xsaveopt supports modified optimization which is not + * supported by xsave. + * + * If none of xsaves and xsaveopt is enabled, use xsave. + */ + alternative_input_2( + "1:"XSAVE, + XSAVEOPT, + X86_FEATURE_XSAVEOPT, + XSAVES, + X86_FEATURE_XSAVES, + [xstate] "D" (xstate), "a" (lmask), "d" (hmask) : + "memory"); + asm volatile("2:\n\t" + xstate_fault(err) + : "0" (err) + : "memory"); + + /* We should never fault when copying to a kernel buffer: */ + WARN_ON_FPU(err); +} + +/* + * Restore processor xstate from xsave area. + */ +static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + /* + * Use xrstors to restore context if it is enabled. xrstors supports + * compacted format of xsave area which is not supported by xrstor. + */ + alternative_input( + "1: " XRSTOR, + XRSTORS, + X86_FEATURE_XSAVES, + "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask) + : "memory"); + + asm volatile("2:\n" + xstate_fault(err) + : "0" (err) + : "memory"); + + /* We should never fault when copying from a kernel buffer: */ + WARN_ON_FPU(err); +} + +/* + * Save xstate to user space xsave area. + * + * We don't use modified optimization because xrstor/xrstors might track + * a different application. + * + * We don't use compacted format xsave area for + * backward compatibility for old applications which don't understand + * compacted format of xsave area. + */ +static inline int copy_xregs_to_user(struct xregs_state __user *buf) +{ + int err; + + /* + * Clear the xsave header first, so that reserved fields are + * initialized to zero. + */ + err = __clear_user(&buf->header, sizeof(buf->header)); + if (unlikely(err)) + return -EFAULT; + + __asm__ __volatile__(ASM_STAC "\n" + "1:"XSAVE"\n" + "2: " ASM_CLAC "\n" + xstate_fault(err) + : "D" (buf), "a" (-1), "d" (-1), "0" (err) + : "memory"); + return err; +} + +/* + * Restore xstate from user space xsave area. + */ +static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask) +{ + struct xregs_state *xstate = ((__force struct xregs_state *)buf); + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + __asm__ __volatile__(ASM_STAC "\n" + "1:"XRSTOR"\n" + "2: " ASM_CLAC "\n" + xstate_fault(err) + : "D" (xstate), "a" (lmask), "d" (hmask), "0" (err) + : "memory"); /* memory required? */ + return err; +} + +/* + * These must be called with preempt disabled. Returns + * 'true' if the FPU state is still intact and we can + * keep registers active. + * + * The legacy FNSAVE instruction cleared all FPU state + * unconditionally, so registers are essentially destroyed. + * Modern FPU state can be kept in registers, if there are + * no pending FP exceptions. + */ +static inline int copy_fpregs_to_fpstate(struct fpu *fpu) +{ + if (likely(use_xsave())) { + copy_xregs_to_kernel(&fpu->state.xsave); + return 1; + } + + if (likely(use_fxsr())) { + copy_fxregs_to_kernel(fpu); + return 1; + } + + /* + * Legacy FPU register saving, FNSAVE always clears FPU registers, + * so we have to mark them inactive: + */ + asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); + + return 0; +} + +static inline int __copy_fpstate_to_fpregs(struct fpu *fpu) +{ + if (use_xsave()) { + copy_kernel_to_xregs(&fpu->state.xsave, -1); + return 0; + } else { + if (use_fxsr()) + return copy_kernel_to_fxregs(&fpu->state.fxsave); + else + return copy_kernel_to_fregs(&fpu->state.fsave); + } +} + +static inline int copy_fpstate_to_fpregs(struct fpu *fpu) +{ + /* + * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is + * pending. Clear the x87 state here by setting it to fixed values. + * "m" is a random variable that should be in L1. + */ + if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) { + asm volatile( + "fnclex\n\t" + "emms\n\t" + "fildl %P[addr]" /* set F?P to defined value */ + : : [addr] "m" (fpu->fpregs_active)); + } + + return __copy_fpstate_to_fpregs(fpu); +} + +extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); + +/* + * FPU context switch related helper methods: + */ + +DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); + +/* + * Must be run with preemption disabled: this clears the fpu_fpregs_owner_ctx, + * on this CPU. + * + * This will disable any lazy FPU state restore of the current FPU state, + * but if the current thread owns the FPU, it will still be saved by. + */ +static inline void __cpu_disable_lazy_restore(unsigned int cpu) +{ + per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; +} + +static inline int fpu_want_lazy_restore(struct fpu *fpu, unsigned int cpu) +{ + return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; +} + + +/* + * Wrap lazy FPU TS handling in a 'hw fpregs activation/deactivation' + * idiom, which is then paired with the sw-flag (fpregs_active) later on: + */ + +static inline void __fpregs_activate_hw(void) +{ + if (!use_eager_fpu()) + clts(); +} + +static inline void __fpregs_deactivate_hw(void) +{ + if (!use_eager_fpu()) + stts(); +} + +/* Must be paired with an 'stts' (fpregs_deactivate_hw()) after! */ +static inline void __fpregs_deactivate(struct fpu *fpu) +{ + WARN_ON_FPU(!fpu->fpregs_active); + + fpu->fpregs_active = 0; + this_cpu_write(fpu_fpregs_owner_ctx, NULL); +} + +/* Must be paired with a 'clts' (fpregs_activate_hw()) before! */ +static inline void __fpregs_activate(struct fpu *fpu) +{ + WARN_ON_FPU(fpu->fpregs_active); + + fpu->fpregs_active = 1; + this_cpu_write(fpu_fpregs_owner_ctx, fpu); +} + +/* + * The question "does this thread have fpu access?" + * is slightly racy, since preemption could come in + * and revoke it immediately after the test. + * + * However, even in that very unlikely scenario, + * we can just assume we have FPU access - typically + * to save the FP state - we'll just take a #NM + * fault and get the FPU access back. + */ +static inline int fpregs_active(void) +{ + return current->thread.fpu.fpregs_active; +} + +/* + * Encapsulate the CR0.TS handling together with the + * software flag. + * + * These generally need preemption protection to work, + * do try to avoid using these on their own. + */ +static inline void fpregs_activate(struct fpu *fpu) +{ + __fpregs_activate_hw(); + __fpregs_activate(fpu); +} + +static inline void fpregs_deactivate(struct fpu *fpu) +{ + __fpregs_deactivate(fpu); + __fpregs_deactivate_hw(); +} + +/* + * FPU state switching for scheduling. + * + * This is a two-stage process: + * + * - switch_fpu_prepare() saves the old state and + * sets the new state of the CR0.TS bit. This is + * done within the context of the old process. + * + * - switch_fpu_finish() restores the new state as + * necessary. + */ +typedef struct { int preload; } fpu_switch_t; + +static inline fpu_switch_t +switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) +{ + fpu_switch_t fpu; + + /* + * If the task has used the math, pre-load the FPU on xsave processors + * or if the past 5 consecutive context-switches used math. + */ + fpu.preload = new_fpu->fpstate_active && + (use_eager_fpu() || new_fpu->counter > 5); + + if (old_fpu->fpregs_active) { + if (!copy_fpregs_to_fpstate(old_fpu)) + old_fpu->last_cpu = -1; + else + old_fpu->last_cpu = cpu; + + /* But leave fpu_fpregs_owner_ctx! */ + old_fpu->fpregs_active = 0; + + /* Don't change CR0.TS if we just switch! */ + if (fpu.preload) { + new_fpu->counter++; + __fpregs_activate(new_fpu); + prefetch(&new_fpu->state); + } else { + __fpregs_deactivate_hw(); + } + } else { + old_fpu->counter = 0; + old_fpu->last_cpu = -1; + if (fpu.preload) { + new_fpu->counter++; + if (fpu_want_lazy_restore(new_fpu, cpu)) + fpu.preload = 0; + else + prefetch(&new_fpu->state); + fpregs_activate(new_fpu); + } + } + return fpu; +} + +/* + * Misc helper functions: + */ + +/* + * By the time this gets called, we've already cleared CR0.TS and + * given the process the FPU if we are going to preload the FPU + * state - all we need to do is to conditionally restore the register + * state itself. + */ +static inline void switch_fpu_finish(struct fpu *new_fpu, fpu_switch_t fpu_switch) +{ + if (fpu_switch.preload) { + if (unlikely(copy_fpstate_to_fpregs(new_fpu))) { + WARN_ON_FPU(1); + fpu__clear(new_fpu); + } + } +} + +/* + * Needs to be preemption-safe. + * + * NOTE! user_fpu_begin() must be used only immediately before restoring + * the save state. It does not do any saving/restoring on its own. In + * lazy FPU mode, it is just an optimization to avoid a #NM exception, + * the task can lose the FPU right after preempt_enable(). + */ +static inline void user_fpu_begin(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + + preempt_disable(); + if (!fpregs_active()) + fpregs_activate(fpu); + preempt_enable(); +} + +/* + * MXCSR and XCR definitions: + */ + +extern unsigned int mxcsr_feature_mask; + +#define XCR_XFEATURE_ENABLED_MASK 0x00000000 + +static inline u64 xgetbv(u32 index) +{ + u32 eax, edx; + + asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */ + : "=a" (eax), "=d" (edx) + : "c" (index)); + return eax + ((u64)edx << 32); +} + +static inline void xsetbv(u32 index, u64 value) +{ + u32 eax = value; + u32 edx = value >> 32; + + asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */ + : : "a" (eax), "d" (edx), "c" (index)); +} + +#endif /* _ASM_X86_FPU_INTERNAL_H */ diff --git a/arch/x86/include/asm/fpu/regset.h b/arch/x86/include/asm/fpu/regset.h new file mode 100644 index 000000000000..39d3107ac6c7 --- /dev/null +++ b/arch/x86/include/asm/fpu/regset.h @@ -0,0 +1,21 @@ +/* + * FPU regset handling methods: + */ +#ifndef _ASM_X86_FPU_REGSET_H +#define _ASM_X86_FPU_REGSET_H + +#include <linux/regset.h> + +extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active; +extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, + xstateregs_get; +extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, + xstateregs_set; + +/* + * xstateregs_active == regset_fpregs_active. Please refer to the comment + * at the definition of regset_fpregs_active. + */ +#define xstateregs_active regset_fpregs_active + +#endif /* _ASM_X86_FPU_REGSET_H */ diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h new file mode 100644 index 000000000000..7358e9d61f1e --- /dev/null +++ b/arch/x86/include/asm/fpu/signal.h @@ -0,0 +1,33 @@ +/* + * x86 FPU signal frame handling methods: + */ +#ifndef _ASM_X86_FPU_SIGNAL_H +#define _ASM_X86_FPU_SIGNAL_H + +#ifdef CONFIG_X86_64 +# include <asm/sigcontext32.h> +# include <asm/user32.h> +struct ksignal; +int ia32_setup_rt_frame(int sig, struct ksignal *ksig, + compat_sigset_t *set, struct pt_regs *regs); +int ia32_setup_frame(int sig, struct ksignal *ksig, + compat_sigset_t *set, struct pt_regs *regs); +#else +# define user_i387_ia32_struct user_i387_struct +# define user32_fxsr_struct user_fxsr_struct +# define ia32_setup_frame __setup_frame +# define ia32_setup_rt_frame __setup_rt_frame +#endif + +extern void convert_from_fxsr(struct user_i387_ia32_struct *env, + struct task_struct *tsk); +extern void convert_to_fxsr(struct task_struct *tsk, + const struct user_i387_ia32_struct *env); + +unsigned long +fpu__alloc_mathframe(unsigned long sp, int ia32_frame, + unsigned long *buf_fx, unsigned long *size); + +extern void fpu__init_prepare_fx_sw_frame(void); + +#endif /* _ASM_X86_FPU_SIGNAL_H */ diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h new file mode 100644 index 000000000000..0637826292de --- /dev/null +++ b/arch/x86/include/asm/fpu/types.h @@ -0,0 +1,293 @@ +/* + * FPU data structures: + */ +#ifndef _ASM_X86_FPU_H +#define _ASM_X86_FPU_H + +/* + * The legacy x87 FPU state format, as saved by FSAVE and + * restored by the FRSTOR instructions: + */ +struct fregs_state { + u32 cwd; /* FPU Control Word */ + u32 swd; /* FPU Status Word */ + u32 twd; /* FPU Tag Word */ + u32 fip; /* FPU IP Offset */ + u32 fcs; /* FPU IP Selector */ + u32 foo; /* FPU Operand Pointer Offset */ + u32 fos; /* FPU Operand Pointer Selector */ + + /* 8*10 bytes for each FP-reg = 80 bytes: */ + u32 st_space[20]; + + /* Software status information [not touched by FSAVE]: */ + u32 status; +}; + +/* + * The legacy fx SSE/MMX FPU state format, as saved by FXSAVE and + * restored by the FXRSTOR instructions. It's similar to the FSAVE + * format, but differs in some areas, plus has extensions at + * the end for the XMM registers. + */ +struct fxregs_state { + u16 cwd; /* Control Word */ + u16 swd; /* Status Word */ + u16 twd; /* Tag Word */ + u16 fop; /* Last Instruction Opcode */ + union { + struct { + u64 rip; /* Instruction Pointer */ + u64 rdp; /* Data Pointer */ + }; + struct { + u32 fip; /* FPU IP Offset */ + u32 fcs; /* FPU IP Selector */ + u32 foo; /* FPU Operand Offset */ + u32 fos; /* FPU Operand Selector */ + }; + }; + u32 mxcsr; /* MXCSR Register State */ + u32 mxcsr_mask; /* MXCSR Mask */ + + /* 8*16 bytes for each FP-reg = 128 bytes: */ + u32 st_space[32]; + + /* 16*16 bytes for each XMM-reg = 256 bytes: */ + u32 xmm_space[64]; + + u32 padding[12]; + + union { + u32 padding1[12]; + u32 sw_reserved[12]; + }; + +} __attribute__((aligned(16))); + +/* Default value for fxregs_state.mxcsr: */ +#define MXCSR_DEFAULT 0x1f80 + +/* + * Software based FPU emulation state. This is arbitrary really, + * it matches the x87 format to make it easier to understand: + */ +struct swregs_state { + u32 cwd; + u32 swd; + u32 twd; + u32 fip; + u32 fcs; + u32 foo; + u32 fos; + /* 8*10 bytes for each FP-reg = 80 bytes: */ + u32 st_space[20]; + u8 ftop; + u8 changed; + u8 lookahead; + u8 no_update; + u8 rm; + u8 alimit; + struct math_emu_info *info; + u32 entry_eip; +}; + +/* + * List of XSAVE features Linux knows about: + */ +enum xfeature_bit { + XSTATE_BIT_FP, + XSTATE_BIT_SSE, + XSTATE_BIT_YMM, + XSTATE_BIT_BNDREGS, + XSTATE_BIT_BNDCSR, + XSTATE_BIT_OPMASK, + XSTATE_BIT_ZMM_Hi256, + XSTATE_BIT_Hi16_ZMM, + + XFEATURES_NR_MAX, +}; + +#define XSTATE_FP (1 << XSTATE_BIT_FP) +#define XSTATE_SSE (1 << XSTATE_BIT_SSE) +#define XSTATE_YMM (1 << XSTATE_BIT_YMM) +#define XSTATE_BNDREGS (1 << XSTATE_BIT_BNDREGS) +#define XSTATE_BNDCSR (1 << XSTATE_BIT_BNDCSR) +#define XSTATE_OPMASK (1 << XSTATE_BIT_OPMASK) +#define XSTATE_ZMM_Hi256 (1 << XSTATE_BIT_ZMM_Hi256) +#define XSTATE_Hi16_ZMM (1 << XSTATE_BIT_Hi16_ZMM) + +#define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) +#define XSTATE_AVX512 (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) + +/* + * There are 16x 256-bit AVX registers named YMM0-YMM15. + * The low 128 bits are aliased to the 16 SSE registers (XMM0-XMM15) + * and are stored in 'struct fxregs_state::xmm_space[]'. + * + * The high 128 bits are stored here: + * 16x 128 bits == 256 bytes. + */ +struct ymmh_struct { + u8 ymmh_space[256]; +}; + +/* We don't support LWP yet: */ +struct lwp_struct { + u8 reserved[128]; +}; + +/* Intel MPX support: */ +struct bndreg { + u64 lower_bound; + u64 upper_bound; +} __packed; + +struct bndcsr { + u64 bndcfgu; + u64 bndstatus; +} __packed; + +struct mpx_struct { + struct bndreg bndreg[4]; + struct bndcsr bndcsr; +}; + +struct xstate_header { + u64 xfeatures; + u64 xcomp_bv; + u64 reserved[6]; +} __attribute__((packed)); + +/* New processor state extensions should be added here: */ +#define XSTATE_RESERVE (sizeof(struct ymmh_struct) + \ + sizeof(struct lwp_struct) + \ + sizeof(struct mpx_struct) ) +/* + * This is our most modern FPU state format, as saved by the XSAVE + * and restored by the XRSTOR instructions. + * + * It consists of a legacy fxregs portion, an xstate header and + * subsequent fixed size areas as defined by the xstate header. + * Not all CPUs support all the extensions. + */ +struct xregs_state { + struct fxregs_state i387; + struct xstate_header header; + u8 __reserved[XSTATE_RESERVE]; +} __attribute__ ((packed, aligned (64))); + +/* + * This is a union of all the possible FPU state formats + * put together, so that we can pick the right one runtime. + * + * The size of the structure is determined by the largest + * member - which is the xsave area: + */ +union fpregs_state { + struct fregs_state fsave; + struct fxregs_state fxsave; + struct swregs_state soft; + struct xregs_state xsave; +}; + +/* + * Highest level per task FPU state data structure that + * contains the FPU register state plus various FPU + * state fields: + */ +struct fpu { + /* + * @state: + * + * In-memory copy of all FPU registers that we save/restore + * over context switches. If the task is using the FPU then + * the registers in the FPU are more recent than this state + * copy. If the task context-switches away then they get + * saved here and represent the FPU state. + * + * After context switches there may be a (short) time period + * during which the in-FPU hardware registers are unchanged + * and still perfectly match this state, if the tasks + * scheduled afterwards are not using the FPU. + * + * This is the 'lazy restore' window of optimization, which + * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'. + * + * We detect whether a subsequent task uses the FPU via setting + * CR0::TS to 1, which causes any FPU use to raise a #NM fault. + * + * During this window, if the task gets scheduled again, we + * might be able to skip having to do a restore from this + * memory buffer to the hardware registers - at the cost of + * incurring the overhead of #NM fault traps. + * + * Note that on modern CPUs that support the XSAVEOPT (or other + * optimized XSAVE instructions), we don't use #NM traps anymore, + * as the hardware can track whether FPU registers need saving + * or not. On such CPUs we activate the non-lazy ('eagerfpu') + * logic, which unconditionally saves/restores all FPU state + * across context switches. (if FPU state exists.) + */ + union fpregs_state state; + + /* + * @last_cpu: + * + * Records the last CPU on which this context was loaded into + * FPU registers. (In the lazy-restore case we might be + * able to reuse FPU registers across multiple context switches + * this way, if no intermediate task used the FPU.) + * + * A value of -1 is used to indicate that the FPU state in context + * memory is newer than the FPU state in registers, and that the + * FPU state should be reloaded next time the task is run. + */ + unsigned int last_cpu; + + /* + * @fpstate_active: + * + * This flag indicates whether this context is active: if the task + * is not running then we can restore from this context, if the task + * is running then we should save into this context. + */ + unsigned char fpstate_active; + + /* + * @fpregs_active: + * + * This flag determines whether a given context is actively + * loaded into the FPU's registers and that those registers + * represent the task's current FPU state. + * + * Note the interaction with fpstate_active: + * + * # task does not use the FPU: + * fpstate_active == 0 + * + * # task uses the FPU and regs are active: + * fpstate_active == 1 && fpregs_active == 1 + * + * # the regs are inactive but still match fpstate: + * fpstate_active == 1 && fpregs_active == 0 && fpregs_owner == fpu + * + * The third state is what we use for the lazy restore optimization + * on lazy-switching CPUs. + */ + unsigned char fpregs_active; + + /* + * @counter: + * + * This counter contains the number of consecutive context switches + * during which the FPU stays used. If this is over a threshold, the + * lazy FPU restore logic becomes eager, to save the trap overhead. + * This is an unsigned char so that after 256 iterations the counter + * wraps and the context switch behavior turns lazy again; this is to + * deal with bursty apps that only use the FPU for a short time: + */ + unsigned char counter; +}; + +#endif /* _ASM_X86_FPU_H */ diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h new file mode 100644 index 000000000000..339894669117 --- /dev/null +++ b/arch/x86/include/asm/fpu/xstate.h @@ -0,0 +1,45 @@ +#ifndef __ASM_X86_XSAVE_H +#define __ASM_X86_XSAVE_H + +#include <linux/types.h> +#include <asm/processor.h> +#include <linux/uaccess.h> + +/* Bit 63 of XCR0 is reserved for future expansion */ +#define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63))) + +#define XSTATE_CPUID 0x0000000d + +#define FXSAVE_SIZE 512 + +#define XSAVE_HDR_SIZE 64 +#define XSAVE_HDR_OFFSET FXSAVE_SIZE + +#define XSAVE_YMM_SIZE 256 +#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) + +/* Supported features which support lazy state saving */ +#define XSTATE_LAZY (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ + | XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) + +/* Supported features which require eager state saving */ +#define XSTATE_EAGER (XSTATE_BNDREGS | XSTATE_BNDCSR) + +/* All currently supported features */ +#define XCNTXT_MASK (XSTATE_LAZY | XSTATE_EAGER) + +#ifdef CONFIG_X86_64 +#define REX_PREFIX "0x48, " +#else +#define REX_PREFIX +#endif + +extern unsigned int xstate_size; +extern u64 xfeatures_mask; +extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; + +extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); + +void *get_xsave_addr(struct xregs_state *xsave, int xstate); + +#endif diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 0f5fb6b6567e..7178043b0e1d 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -14,6 +14,7 @@ typedef struct { #endif #ifdef CONFIG_HAVE_KVM unsigned int kvm_posted_intr_ipis; + unsigned int kvm_posted_intr_wakeup_ipis; #endif unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; @@ -33,6 +34,9 @@ typedef struct { #ifdef CONFIG_X86_MCE_THRESHOLD unsigned int irq_threshold_count; #endif +#ifdef CONFIG_X86_MCE_AMD + unsigned int irq_deferred_error_count; +#endif #if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) unsigned int irq_hv_callback_count; #endif diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 36f7125945e3..5fa9fb0f8809 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -74,20 +74,16 @@ extern unsigned int hpet_readl(unsigned int a); extern void force_hpet_resume(void); struct irq_data; +struct hpet_dev; +struct irq_domain; + extern void hpet_msi_unmask(struct irq_data *data); extern void hpet_msi_mask(struct irq_data *data); -struct hpet_dev; extern void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg); extern void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg); - -#ifdef CONFIG_PCI_MSI -extern int default_setup_hpet_msi(unsigned int irq, unsigned int id); -#else -static inline int default_setup_hpet_msi(unsigned int irq, unsigned int id) -{ - return -EINVAL; -} -#endif +extern struct irq_domain *hpet_create_irq_domain(int hpet_id); +extern int hpet_assign_irq(struct irq_domain *domain, + struct hpet_dev *dev, int dev_num); #ifdef CONFIG_HPET_EMULATE_RTC diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index e9571ddabc4f..6615032e19c8 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -29,6 +29,7 @@ extern asmlinkage void apic_timer_interrupt(void); extern asmlinkage void x86_platform_ipi(void); extern asmlinkage void kvm_posted_intr_ipi(void); +extern asmlinkage void kvm_posted_intr_wakeup_ipi(void); extern asmlinkage void error_interrupt(void); extern asmlinkage void irq_work_interrupt(void); @@ -36,43 +37,10 @@ extern asmlinkage void spurious_interrupt(void); extern asmlinkage void thermal_interrupt(void); extern asmlinkage void reschedule_interrupt(void); -extern asmlinkage void invalidate_interrupt(void); -extern asmlinkage void invalidate_interrupt0(void); -extern asmlinkage void invalidate_interrupt1(void); -extern asmlinkage void invalidate_interrupt2(void); -extern asmlinkage void invalidate_interrupt3(void); -extern asmlinkage void invalidate_interrupt4(void); -extern asmlinkage void invalidate_interrupt5(void); -extern asmlinkage void invalidate_interrupt6(void); -extern asmlinkage void invalidate_interrupt7(void); -extern asmlinkage void invalidate_interrupt8(void); -extern asmlinkage void invalidate_interrupt9(void); -extern asmlinkage void invalidate_interrupt10(void); -extern asmlinkage void invalidate_interrupt11(void); -extern asmlinkage void invalidate_interrupt12(void); -extern asmlinkage void invalidate_interrupt13(void); -extern asmlinkage void invalidate_interrupt14(void); -extern asmlinkage void invalidate_interrupt15(void); -extern asmlinkage void invalidate_interrupt16(void); -extern asmlinkage void invalidate_interrupt17(void); -extern asmlinkage void invalidate_interrupt18(void); -extern asmlinkage void invalidate_interrupt19(void); -extern asmlinkage void invalidate_interrupt20(void); -extern asmlinkage void invalidate_interrupt21(void); -extern asmlinkage void invalidate_interrupt22(void); -extern asmlinkage void invalidate_interrupt23(void); -extern asmlinkage void invalidate_interrupt24(void); -extern asmlinkage void invalidate_interrupt25(void); -extern asmlinkage void invalidate_interrupt26(void); -extern asmlinkage void invalidate_interrupt27(void); -extern asmlinkage void invalidate_interrupt28(void); -extern asmlinkage void invalidate_interrupt29(void); -extern asmlinkage void invalidate_interrupt30(void); -extern asmlinkage void invalidate_interrupt31(void); - extern asmlinkage void irq_move_cleanup_interrupt(void); extern asmlinkage void reboot_interrupt(void); extern asmlinkage void threshold_interrupt(void); +extern asmlinkage void deferred_error_interrupt(void); extern asmlinkage void call_function_interrupt(void); extern asmlinkage void call_function_single_interrupt(void); @@ -87,60 +55,93 @@ extern void trace_spurious_interrupt(void); extern void trace_thermal_interrupt(void); extern void trace_reschedule_interrupt(void); extern void trace_threshold_interrupt(void); +extern void trace_deferred_error_interrupt(void); extern void trace_call_function_interrupt(void); extern void trace_call_function_single_interrupt(void); #define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt #define trace_reboot_interrupt reboot_interrupt #define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi +#define trace_kvm_posted_intr_wakeup_ipi kvm_posted_intr_wakeup_ipi #endif /* CONFIG_TRACING */ -#ifdef CONFIG_IRQ_REMAP -/* Intel specific interrupt remapping information */ -struct irq_2_iommu { - struct intel_iommu *iommu; - u16 irte_index; - u16 sub_handle; - u8 irte_mask; -}; - -/* AMD specific interrupt remapping information */ -struct irq_2_irte { - u16 devid; /* Device ID for IRTE table */ - u16 index; /* Index into IRTE table*/ -}; -#endif /* CONFIG_IRQ_REMAP */ - #ifdef CONFIG_X86_LOCAL_APIC struct irq_data; +struct pci_dev; +struct msi_desc; + +enum irq_alloc_type { + X86_IRQ_ALLOC_TYPE_IOAPIC = 1, + X86_IRQ_ALLOC_TYPE_HPET, + X86_IRQ_ALLOC_TYPE_MSI, + X86_IRQ_ALLOC_TYPE_MSIX, + X86_IRQ_ALLOC_TYPE_DMAR, + X86_IRQ_ALLOC_TYPE_UV, +}; -struct irq_cfg { - cpumask_var_t domain; - cpumask_var_t old_domain; - u8 vector; - u8 move_in_progress : 1; -#ifdef CONFIG_IRQ_REMAP - u8 remapped : 1; +struct irq_alloc_info { + enum irq_alloc_type type; + u32 flags; + const struct cpumask *mask; /* CPU mask for vector allocation */ union { - struct irq_2_iommu irq_2_iommu; - struct irq_2_irte irq_2_irte; - }; + int unused; +#ifdef CONFIG_HPET_TIMER + struct { + int hpet_id; + int hpet_index; + void *hpet_data; + }; #endif - union { -#ifdef CONFIG_X86_IO_APIC +#ifdef CONFIG_PCI_MSI struct { - struct list_head irq_2_pin; + struct pci_dev *msi_dev; + irq_hw_number_t msi_hwirq; + }; +#endif +#ifdef CONFIG_X86_IO_APIC + struct { + int ioapic_id; + int ioapic_pin; + int ioapic_node; + u32 ioapic_trigger : 1; + u32 ioapic_polarity : 1; + u32 ioapic_valid : 1; + struct IO_APIC_route_entry *ioapic_entry; + }; +#endif +#ifdef CONFIG_DMAR_TABLE + struct { + int dmar_id; + void *dmar_data; + }; +#endif +#ifdef CONFIG_HT_IRQ + struct { + int ht_pos; + int ht_idx; + struct pci_dev *ht_dev; + void *ht_update; + }; +#endif +#ifdef CONFIG_X86_UV + struct { + int uv_limit; + int uv_blade; + unsigned long uv_offset; + char *uv_name; }; #endif }; }; +struct irq_cfg { + unsigned int dest_apicid; + u8 vector; +}; + extern struct irq_cfg *irq_cfg(unsigned int irq); extern struct irq_cfg *irqd_cfg(struct irq_data *irq_data); -extern struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node); extern void lock_vector_lock(void); extern void unlock_vector_lock(void); -extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *); -extern void clear_irq_vector(int irq, struct irq_cfg *cfg); extern void setup_vector_irq(int cpu); #ifdef CONFIG_SMP extern void send_cleanup_vector(struct irq_cfg *); @@ -150,10 +151,7 @@ static inline void send_cleanup_vector(struct irq_cfg *c) { } static inline void irq_complete_move(struct irq_cfg *c) { } #endif -extern int apic_retrigger_irq(struct irq_data *data); extern void apic_ack_edge(struct irq_data *data); -extern int apic_set_affinity(struct irq_data *data, const struct cpumask *mask, - unsigned int *dest_id); #else /* CONFIG_X86_LOCAL_APIC */ static inline void lock_vector_lock(void) {} static inline void unlock_vector_lock(void) {} @@ -163,8 +161,7 @@ static inline void unlock_vector_lock(void) {} extern atomic_t irq_err_count; extern atomic_t irq_mis_count; -/* EISA */ -extern void eisa_set_level_irq(unsigned int irq); +extern void elcr_set_level_irq(unsigned int irq); /* SMP */ extern __visible void smp_apic_timer_interrupt(struct pt_regs *); @@ -178,7 +175,6 @@ extern asmlinkage void smp_irq_move_cleanup_interrupt(void); extern __visible void smp_reschedule_interrupt(struct pt_regs *); extern __visible void smp_call_function_interrupt(struct pt_regs *); extern __visible void smp_call_function_single_interrupt(struct pt_regs *); -extern __visible void smp_invalidate_interrupt(struct pt_regs *); #endif extern char irq_entries_start[]; diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h deleted file mode 100644 index 6eb6fcb83f63..000000000000 --- a/arch/x86/include/asm/i387.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (C) 1994 Linus Torvalds - * - * Pentium III FXSR, SSE support - * General FPU state handling cleanups - * Gareth Hughes <gareth@valinux.com>, May 2000 - * x86-64 work by Andi Kleen 2002 - */ - -#ifndef _ASM_X86_I387_H -#define _ASM_X86_I387_H - -#ifndef __ASSEMBLY__ - -#include <linux/sched.h> -#include <linux/hardirq.h> - -struct pt_regs; -struct user_i387_struct; - -extern int init_fpu(struct task_struct *child); -extern void fpu_finit(struct fpu *fpu); -extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); -extern void math_state_restore(void); - -extern bool irq_fpu_usable(void); - -/* - * Careful: __kernel_fpu_begin/end() must be called with preempt disabled - * and they don't touch the preempt state on their own. - * If you enable preemption after __kernel_fpu_begin(), preempt notifier - * should call the __kernel_fpu_end() to prevent the kernel/user FPU - * state from getting corrupted. KVM for example uses this model. - * - * All other cases use kernel_fpu_begin/end() which disable preemption - * during kernel FPU usage. - */ -extern void __kernel_fpu_begin(void); -extern void __kernel_fpu_end(void); - -static inline void kernel_fpu_begin(void) -{ - preempt_disable(); - WARN_ON_ONCE(!irq_fpu_usable()); - __kernel_fpu_begin(); -} - -static inline void kernel_fpu_end(void) -{ - __kernel_fpu_end(); - preempt_enable(); -} - -/* Must be called with preempt disabled */ -extern void kernel_fpu_disable(void); -extern void kernel_fpu_enable(void); - -/* - * Some instructions like VIA's padlock instructions generate a spurious - * DNA fault but don't modify SSE registers. And these instructions - * get used from interrupt context as well. To prevent these kernel instructions - * in interrupt context interacting wrongly with other user/kernel fpu usage, we - * should use them only in the context of irq_ts_save/restore() - */ -static inline int irq_ts_save(void) -{ - /* - * If in process context and not atomic, we can take a spurious DNA fault. - * Otherwise, doing clts() in process context requires disabling preemption - * or some heavy lifting like kernel_fpu_begin() - */ - if (!in_atomic()) - return 0; - - if (read_cr0() & X86_CR0_TS) { - clts(); - return 1; - } - - return 0; -} - -static inline void irq_ts_restore(int TS_state) -{ - if (TS_state) - stts(); -} - -/* - * The question "does this thread have fpu access?" - * is slightly racy, since preemption could come in - * and revoke it immediately after the test. - * - * However, even in that very unlikely scenario, - * we can just assume we have FPU access - typically - * to save the FP state - we'll just take a #NM - * fault and get the FPU access back. - */ -static inline int user_has_fpu(void) -{ - return current->thread.fpu.has_fpu; -} - -extern void unlazy_fpu(struct task_struct *tsk); - -#endif /* __ASSEMBLY__ */ - -#endif /* _ASM_X86_I387_H */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 34a5b93704d3..4afc05ffa566 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -177,6 +177,7 @@ static inline unsigned int isa_virt_to_bus(volatile void *address) * look at pci_iomap(). */ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 2f91685fe1cd..6cbf2cfb3f8a 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -95,9 +95,22 @@ struct IR_IO_APIC_route_entry { index : 15; } __attribute__ ((packed)); -#define IOAPIC_AUTO -1 -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 +struct irq_alloc_info; +struct ioapic_domain_cfg; + +#define IOAPIC_AUTO -1 +#define IOAPIC_EDGE 0 +#define IOAPIC_LEVEL 1 + +#define IOAPIC_MASKED 1 +#define IOAPIC_UNMASKED 0 + +#define IOAPIC_POL_HIGH 0 +#define IOAPIC_POL_LOW 1 + +#define IOAPIC_DEST_MODE_PHYSICAL 0 +#define IOAPIC_DEST_MODE_LOGICAL 1 + #define IOAPIC_MAP_ALLOC 0x1 #define IOAPIC_MAP_CHECK 0x2 @@ -110,9 +123,6 @@ extern int nr_ioapics; extern int mpc_ioapic_id(int ioapic); extern unsigned int mpc_ioapic_addr(int ioapic); -extern struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic); - -#define MP_MAX_IOAPIC_PIN 127 /* # of MP IRQ source entries */ extern int mp_irq_entries; @@ -120,9 +130,6 @@ extern int mp_irq_entries; /* MP IRQ source entries */ extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; -/* Older SiS APIC requires we rewrite the index register */ -extern int sis_apic_bug; - /* 1 if "noapic" boot option passed */ extern int skip_ioapic_setup; @@ -132,6 +139,8 @@ extern int noioapicquirk; /* -1 if "noapic" boot option passed */ extern int noioapicreroute; +extern u32 gsi_top; + extern unsigned long io_apic_irqs; #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1 << (x)) & io_apic_irqs)) @@ -147,13 +156,6 @@ struct irq_cfg; extern void ioapic_insert_resources(void); extern int arch_early_ioapic_init(void); -extern int native_setup_ioapic_entry(int, struct IO_APIC_route_entry *, - unsigned int, int, - struct io_apic_irq_attr *); -extern void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg); - -extern void native_eoi_ioapic_pin(int apic, int pin, int vector); - extern int save_ioapic_entries(void); extern void mask_ioapic_entries(void); extern int restore_ioapic_entries(void); @@ -161,82 +163,32 @@ extern int restore_ioapic_entries(void); extern void setup_ioapic_ids_from_mpc(void); extern void setup_ioapic_ids_from_mpc_nocheck(void); -struct io_apic_irq_attr { - int ioapic; - int ioapic_pin; - int trigger; - int polarity; -}; - -enum ioapic_domain_type { - IOAPIC_DOMAIN_INVALID, - IOAPIC_DOMAIN_LEGACY, - IOAPIC_DOMAIN_STRICT, - IOAPIC_DOMAIN_DYNAMIC, -}; - -struct device_node; -struct irq_domain; -struct irq_domain_ops; - -struct ioapic_domain_cfg { - enum ioapic_domain_type type; - const struct irq_domain_ops *ops; - struct device_node *dev; -}; - -struct mp_ioapic_gsi{ - u32 gsi_base; - u32 gsi_end; -}; -extern u32 gsi_top; - extern int mp_find_ioapic(u32 gsi); extern int mp_find_ioapic_pin(int ioapic, u32 gsi); -extern u32 mp_pin_to_gsi(int ioapic, int pin); -extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags); +extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, + struct irq_alloc_info *info); extern void mp_unmap_irq(int irq); extern int mp_register_ioapic(int id, u32 address, u32 gsi_base, struct ioapic_domain_cfg *cfg); extern int mp_unregister_ioapic(u32 gsi_base); extern int mp_ioapic_registered(u32 gsi_base); -extern int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq); -extern void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq); -extern int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node); -extern void __init pre_init_apic_IRQ0(void); + +extern void ioapic_set_alloc_attr(struct irq_alloc_info *info, + int node, int trigger, int polarity); extern void mp_save_irq(struct mpc_intsrc *m); extern void disable_ioapic_support(void); -extern void __init native_io_apic_init_mappings(void); +extern void __init io_apic_init_mappings(void); extern unsigned int native_io_apic_read(unsigned int apic, unsigned int reg); -extern void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int val); -extern void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val); extern void native_disable_io_apic(void); -extern void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries); -extern void intel_ir_io_apic_print_entries(unsigned int apic, unsigned int nr_entries); -extern int native_ioapic_set_affinity(struct irq_data *, - const struct cpumask *, - bool); static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) { return x86_io_apic_ops.read(apic, reg); } -static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) -{ - x86_io_apic_ops.write(apic, reg, value); -} -static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) -{ - x86_io_apic_ops.modify(apic, reg, value); -} - -extern void io_apic_eoi(unsigned int apic, unsigned int vector); - extern void setup_IO_APIC(void); extern void enable_IO_APIC(void); extern void disable_IO_APIC(void); @@ -253,8 +205,12 @@ static inline int arch_early_ioapic_init(void) { return 0; } static inline void print_IO_APICs(void) {} #define gsi_top (NR_IRQS_LEGACY) static inline int mp_find_ioapic(u32 gsi) { return 0; } -static inline u32 mp_pin_to_gsi(int ioapic, int pin) { return UINT_MAX; } -static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) { return gsi; } +static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, + struct irq_alloc_info *info) +{ + return gsi; +} + static inline void mp_unmap_irq(int irq) { } static inline int save_ioapic_entries(void) @@ -268,17 +224,11 @@ static inline int restore_ioapic_entries(void) return -ENOMEM; } -static inline void mp_save_irq(struct mpc_intsrc *m) { }; +static inline void mp_save_irq(struct mpc_intsrc *m) { } static inline void disable_ioapic_support(void) { } -#define native_io_apic_init_mappings NULL +static inline void io_apic_init_mappings(void) { } #define native_io_apic_read NULL -#define native_io_apic_write NULL -#define native_io_apic_modify NULL #define native_disable_io_apic NULL -#define native_io_apic_print_entries NULL -#define native_ioapic_set_affinity NULL -#define native_setup_ioapic_entry NULL -#define native_eoi_ioapic_pin NULL static inline void setup_IO_APIC(void) { } static inline void enable_IO_APIC(void) { } diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index a80cbb88ea91..8008d06581c7 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -30,6 +30,10 @@ extern void fixup_irqs(void); extern void irq_force_complete_move(int); #endif +#ifdef CONFIG_HAVE_KVM +extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)); +#endif + extern void (*x86_platform_ipi_callback)(void); extern void native_init_IRQ(void); extern bool handle_irq(unsigned irq, struct pt_regs *regs); diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 6224d316c405..78974fbc33b4 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -22,14 +22,12 @@ #ifndef __X86_IRQ_REMAPPING_H #define __X86_IRQ_REMAPPING_H +#include <asm/irqdomain.h> +#include <asm/hw_irq.h> #include <asm/io_apic.h> -struct IO_APIC_route_entry; -struct io_apic_irq_attr; -struct irq_chip; struct msi_msg; -struct pci_dev; -struct irq_cfg; +struct irq_alloc_info; #ifdef CONFIG_IRQ_REMAP @@ -39,22 +37,21 @@ extern int irq_remapping_enable(void); extern void irq_remapping_disable(void); extern int irq_remapping_reenable(int); extern int irq_remap_enable_fault_handling(void); -extern int setup_ioapic_remapped_entry(int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, - int vector, - struct io_apic_irq_attr *attr); -extern void free_remapped_irq(int irq); -extern void compose_remapped_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id); -extern int setup_hpet_msi_remapped(unsigned int irq, unsigned int id); extern void panic_if_irq_remap(const char *msg); -extern bool setup_remapped_irq(int irq, - struct irq_cfg *cfg, - struct irq_chip *chip); -void irq_remap_modify_chip_defaults(struct irq_chip *chip); +extern struct irq_domain * +irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info); +extern struct irq_domain * +irq_remapping_get_irq_domain(struct irq_alloc_info *info); + +/* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. */ +extern struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent); + +/* Get parent irqdomain for interrupt remapping irqdomain */ +static inline struct irq_domain *arch_get_ir_parent_domain(void) +{ + return x86_vector_domain; +} #else /* CONFIG_IRQ_REMAP */ @@ -64,42 +61,22 @@ static inline int irq_remapping_enable(void) { return -ENODEV; } static inline void irq_remapping_disable(void) { } static inline int irq_remapping_reenable(int eim) { return -ENODEV; } static inline int irq_remap_enable_fault_handling(void) { return -ENODEV; } -static inline int setup_ioapic_remapped_entry(int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, - int vector, - struct io_apic_irq_attr *attr) -{ - return -ENODEV; -} -static inline void free_remapped_irq(int irq) { } -static inline void compose_remapped_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id) -{ -} -static inline int setup_hpet_msi_remapped(unsigned int irq, unsigned int id) -{ - return -ENODEV; -} static inline void panic_if_irq_remap(const char *msg) { } -static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip) +static inline struct irq_domain * +irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info) { + return NULL; } -static inline bool setup_remapped_irq(int irq, - struct irq_cfg *cfg, - struct irq_chip *chip) +static inline struct irq_domain * +irq_remapping_get_irq_domain(struct irq_alloc_info *info) { - return false; + return NULL; } -#endif /* CONFIG_IRQ_REMAP */ - -#define dmar_alloc_hwirq() irq_alloc_hwirq(-1) -#define dmar_free_hwirq irq_free_hwirq +#endif /* CONFIG_IRQ_REMAP */ #endif /* __X86_IRQ_REMAPPING_H */ diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 666c89ec4bd7..4c2d2eb2060a 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -47,31 +47,12 @@ #define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR #define IA32_SYSCALL_VECTOR 0x80 -#ifdef CONFIG_X86_32 -# define SYSCALL_VECTOR 0x80 -#endif /* * Vectors 0x30-0x3f are used for ISA interrupts. * round up to the next 16-vector boundary */ -#define IRQ0_VECTOR ((FIRST_EXTERNAL_VECTOR + 16) & ~15) - -#define IRQ1_VECTOR (IRQ0_VECTOR + 1) -#define IRQ2_VECTOR (IRQ0_VECTOR + 2) -#define IRQ3_VECTOR (IRQ0_VECTOR + 3) -#define IRQ4_VECTOR (IRQ0_VECTOR + 4) -#define IRQ5_VECTOR (IRQ0_VECTOR + 5) -#define IRQ6_VECTOR (IRQ0_VECTOR + 6) -#define IRQ7_VECTOR (IRQ0_VECTOR + 7) -#define IRQ8_VECTOR (IRQ0_VECTOR + 8) -#define IRQ9_VECTOR (IRQ0_VECTOR + 9) -#define IRQ10_VECTOR (IRQ0_VECTOR + 10) -#define IRQ11_VECTOR (IRQ0_VECTOR + 11) -#define IRQ12_VECTOR (IRQ0_VECTOR + 12) -#define IRQ13_VECTOR (IRQ0_VECTOR + 13) -#define IRQ14_VECTOR (IRQ0_VECTOR + 14) -#define IRQ15_VECTOR (IRQ0_VECTOR + 15) +#define ISA_IRQ_VECTOR(irq) (((FIRST_EXTERNAL_VECTOR + 16) & ~15) + irq) /* * Special IRQ vectors used by the SMP architecture, 0xf0-0xff @@ -102,21 +83,23 @@ */ #define X86_PLATFORM_IPI_VECTOR 0xf7 -/* Vector for KVM to deliver posted interrupt IPI */ -#ifdef CONFIG_HAVE_KVM -#define POSTED_INTR_VECTOR 0xf2 -#endif - +#define POSTED_INTR_WAKEUP_VECTOR 0xf1 /* * IRQ work vector: */ #define IRQ_WORK_VECTOR 0xf6 #define UV_BAU_MESSAGE 0xf5 +#define DEFERRED_ERROR_VECTOR 0xf4 /* Vector on which hypervisor callbacks will be delivered */ #define HYPERVISOR_CALLBACK_VECTOR 0xf3 +/* Vector for KVM to deliver posted interrupt IPI */ +#ifdef CONFIG_HAVE_KVM +#define POSTED_INTR_VECTOR 0xf2 +#endif + /* * Local APIC timer IRQ vector is on a different priority level, * to work around the 'lost local interrupt if more than 2 IRQ @@ -155,18 +138,22 @@ static inline int invalid_vm86_irq(int irq) * static arrays. */ -#define NR_IRQS_LEGACY 16 +#define NR_IRQS_LEGACY 16 -#define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS ) +#define CPU_VECTOR_LIMIT (64 * NR_CPUS) +#define IO_APIC_VECTOR_LIMIT (32 * MAX_IO_APICS) -#ifdef CONFIG_X86_IO_APIC -# define CPU_VECTOR_LIMIT (64 * NR_CPUS) -# define NR_IRQS \ +#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_PCI_MSI) +#define NR_IRQS \ (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \ (NR_VECTORS + CPU_VECTOR_LIMIT) : \ (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) -#else /* !CONFIG_X86_IO_APIC: */ -# define NR_IRQS NR_IRQS_LEGACY +#elif defined(CONFIG_X86_IO_APIC) +#define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT) +#elif defined(CONFIG_PCI_MSI) +#define NR_IRQS (NR_VECTORS + CPU_VECTOR_LIMIT) +#else +#define NR_IRQS NR_IRQS_LEGACY #endif #endif /* _ASM_X86_IRQ_VECTORS_H */ diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h new file mode 100644 index 000000000000..d26075b52885 --- /dev/null +++ b/arch/x86/include/asm/irqdomain.h @@ -0,0 +1,63 @@ +#ifndef _ASM_IRQDOMAIN_H +#define _ASM_IRQDOMAIN_H + +#include <linux/irqdomain.h> +#include <asm/hw_irq.h> + +#ifdef CONFIG_X86_LOCAL_APIC +enum { + /* Allocate contiguous CPU vectors */ + X86_IRQ_ALLOC_CONTIGUOUS_VECTORS = 0x1, +}; + +extern struct irq_domain *x86_vector_domain; + +extern void init_irq_alloc_info(struct irq_alloc_info *info, + const struct cpumask *mask); +extern void copy_irq_alloc_info(struct irq_alloc_info *dst, + struct irq_alloc_info *src); +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_X86_IO_APIC +struct device_node; +struct irq_data; + +enum ioapic_domain_type { + IOAPIC_DOMAIN_INVALID, + IOAPIC_DOMAIN_LEGACY, + IOAPIC_DOMAIN_STRICT, + IOAPIC_DOMAIN_DYNAMIC, +}; + +struct ioapic_domain_cfg { + enum ioapic_domain_type type; + const struct irq_domain_ops *ops; + struct device_node *dev; +}; + +extern const struct irq_domain_ops mp_ioapic_irqdomain_ops; + +extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg); +extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs); +extern void mp_irqdomain_activate(struct irq_domain *domain, + struct irq_data *irq_data); +extern void mp_irqdomain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data); +extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain); +#endif /* CONFIG_X86_IO_APIC */ + +#ifdef CONFIG_PCI_MSI +extern void arch_init_msi_domain(struct irq_domain *domain); +#else +static inline void arch_init_msi_domain(struct irq_domain *domain) { } +#endif + +#ifdef CONFIG_HT_IRQ +extern void arch_init_htirq_domain(struct irq_domain *domain); +#else +static inline void arch_init_htirq_domain(struct irq_domain *domain) { } +#endif + +#endif diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f4a555beef19..f8c0ec3a4a97 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1002,8 +1002,6 @@ void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); void kvm_inject_nmi(struct kvm_vcpu *vcpu); -int fx_init(struct kvm_vcpu *vcpu); - void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes); int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 1f5a86d518db..6a3034a0a072 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -117,8 +117,19 @@ struct mca_config { }; struct mce_vendor_flags { - __u64 overflow_recov : 1, /* cpuid_ebx(80000007) */ - __reserved_0 : 63; + /* + * overflow recovery cpuid bit indicates that overflow + * conditions are not fatal + */ + __u64 overflow_recov : 1, + + /* + * SUCCOR stands for S/W UnCorrectable error COntainment + * and Recovery. It indicates support for data poisoning + * in HW and deferred error interrupts. + */ + succor : 1, + __reserved_0 : 62; }; extern struct mce_vendor_flags mce_flags; @@ -223,6 +234,9 @@ void do_machine_check(struct pt_regs *, long); extern void (*mce_threshold_vector)(void); extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); +/* Deferred error interrupt handler */ +extern void (*deferred_error_int_vector)(void); + /* * Thermal handler */ diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 2fb20d6f7e23..9e6278c7140e 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_MICROCODE_H #define _ASM_X86_MICROCODE_H +#include <linux/earlycpio.h> + #define native_rdmsr(msr, val1, val2) \ do { \ u64 __val = native_read_msr((msr)); \ @@ -152,6 +154,7 @@ extern void __init load_ucode_bsp(void); extern void load_ucode_ap(void); extern int __init save_microcode_in_initrd(void); void reload_early_microcode(void); +extern bool get_builtin_firmware(struct cpio_data *cd, const char *name); #else static inline void __init load_ucode_bsp(void) {} static inline void load_ucode_ap(void) {} @@ -160,6 +163,9 @@ static inline int __init save_microcode_in_initrd(void) return 0; } static inline void reload_early_microcode(void) {} +static inline bool get_builtin_firmware(struct cpio_data *cd, const char *name) +{ + return false; +} #endif - #endif /* _ASM_X86_MICROCODE_H */ diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index af935397e053..b8438543f340 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -65,12 +65,12 @@ extern enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, s extern u8 amd_ucode_patch[PATCH_MAX_SIZE]; #ifdef CONFIG_MICROCODE_AMD_EARLY -extern void __init load_ucode_amd_bsp(void); +extern void __init load_ucode_amd_bsp(int family); extern void load_ucode_amd_ap(void); extern int __init save_microcode_in_initrd_amd(void); void reload_ucode_amd(void); #else -static inline void __init load_ucode_amd_bsp(void) {} +static inline void __init load_ucode_amd_bsp(int family) {} static inline void load_ucode_amd_ap(void) {} static inline int __init save_microcode_in_initrd_amd(void) { return -EINVAL; } void reload_ucode_amd(void) {} diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index 2b9209c46ca9..7991c606125d 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -51,20 +51,11 @@ struct extended_sigtable { (((struct microcode_intel *)mc)->hdr.datasize ? \ ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) -#define sigmatch(s1, s2, p1, p2) \ - (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0)))) - #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) -extern int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc); +extern int has_newer_microcode(void *mc, unsigned int csig, int cpf, int rev); extern int microcode_sanity_check(void *mc, int print_err); -extern int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc); - -static inline int -revision_is_newer(struct microcode_header_intel *mc_header, int rev) -{ - return (mc_header->rev <= rev) ? 0 : 1; -} +extern int find_matching_signature(void *mc, unsigned int csig, int cpf); #ifdef CONFIG_MICROCODE_INTEL_EARLY extern void __init load_ucode_intel_bsp(void); diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h index a952a13d59a7..f3c1b71d4fae 100644 --- a/arch/x86/include/asm/mpx.h +++ b/arch/x86/include/asm/mpx.h @@ -60,8 +60,8 @@ #ifdef CONFIG_X86_INTEL_MPX siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, - struct xsave_struct *xsave_buf); -int mpx_handle_bd_fault(struct xsave_struct *xsave_buf); + struct xregs_state *xsave_buf); +int mpx_handle_bd_fault(struct xregs_state *xsave_buf); static inline int kernel_managing_mpx_tables(struct mm_struct *mm) { return (mm->bd_addr != MPX_INVALID_BOUNDS_DIR); @@ -78,11 +78,11 @@ void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start, unsigned long end); #else static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, - struct xsave_struct *xsave_buf) + struct xregs_state *xsave_buf) { return NULL; } -static inline int mpx_handle_bd_fault(struct xsave_struct *xsave_buf) +static inline int mpx_handle_bd_fault(struct xregs_state *xsave_buf) { return -EINVAL; } diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h new file mode 100644 index 000000000000..93724cc62177 --- /dev/null +++ b/arch/x86/include/asm/msi.h @@ -0,0 +1,7 @@ +#ifndef _ASM_X86_MSI_H +#define _ASM_X86_MSI_H +#include <asm/hw_irq.h> + +typedef struct irq_alloc_info msi_alloc_info_t; + +#endif /* _ASM_X86_MSI_H */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 8957810ad7d1..d143bfad45d7 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -712,6 +712,31 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) +#ifdef CONFIG_QUEUED_SPINLOCKS + +static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock, + u32 val) +{ + PVOP_VCALL2(pv_lock_ops.queued_spin_lock_slowpath, lock, val); +} + +static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock) +{ + PVOP_VCALLEE1(pv_lock_ops.queued_spin_unlock, lock); +} + +static __always_inline void pv_wait(u8 *ptr, u8 val) +{ + PVOP_VCALL2(pv_lock_ops.wait, ptr, val); +} + +static __always_inline void pv_kick(int cpu) +{ + PVOP_VCALL1(pv_lock_ops.kick, cpu); +} + +#else /* !CONFIG_QUEUED_SPINLOCKS */ + static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock, __ticket_t ticket) { @@ -724,7 +749,9 @@ static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock, PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket); } -#endif +#endif /* CONFIG_QUEUED_SPINLOCKS */ + +#endif /* SMP && PARAVIRT_SPINLOCKS */ #ifdef CONFIG_X86_32 #define PV_SAVE_REGS "pushl %ecx; pushl %edx;" diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index f7b0b5c112f2..a6b8f9fadb06 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -160,13 +160,14 @@ struct pv_cpu_ops { u64 (*read_pmc)(int counter); unsigned long long (*read_tscp)(unsigned int *aux); +#ifdef CONFIG_X86_32 /* * Atomically enable interrupts and return to userspace. This - * is only ever used to return to 32-bit processes; in a - * 64-bit kernel, it's used for 32-on-64 compat processes, but - * never native 64-bit processes. (Jump, not call.) + * is only used in 32-bit kernels. 64-bit kernels use + * usergs_sysret32 instead. */ void (*irq_enable_sysexit)(void); +#endif /* * Switch to usermode gs and return to 64-bit usermode using @@ -333,9 +334,19 @@ struct arch_spinlock; typedef u16 __ticket_t; #endif +struct qspinlock; + struct pv_lock_ops { +#ifdef CONFIG_QUEUED_SPINLOCKS + void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val); + struct paravirt_callee_save queued_spin_unlock; + + void (*wait)(u8 *ptr, u8 val); + void (*kick)(int cpu); +#else /* !CONFIG_QUEUED_SPINLOCKS */ struct paravirt_callee_save lock_spinning; void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket); +#endif /* !CONFIG_QUEUED_SPINLOCKS */ }; /* This contains all the paravirt structures: we get a convenient diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 81da003df21b..e894fade26fe 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -96,15 +96,10 @@ extern void pci_iommu_alloc(void); #ifdef CONFIG_PCI_MSI /* implemented in arch/x86/kernel/apic/io_apic. */ struct msi_desc; -void native_compose_msi_msg(struct pci_dev *pdev, unsigned int irq, - unsigned int dest, struct msi_msg *msg, u8 hpet_id); int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); void native_teardown_msi_irq(unsigned int irq); void native_restore_msi_irqs(struct pci_dev *dev); -int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, - unsigned int irq_base, unsigned int irq_offset); #else -#define native_compose_msi_msg NULL #define native_setup_msi_irqs NULL #define native_teardown_msi_irq NULL #endif diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 23ba6765b718..8e04f51d6bea 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -21,6 +21,7 @@ struct mm_struct; #include <asm/desc_defs.h> #include <asm/nops.h> #include <asm/special_insns.h> +#include <asm/fpu/types.h> #include <linux/personality.h> #include <linux/cpumask.h> @@ -52,11 +53,16 @@ static inline void *current_text_addr(void) return pc; } +/* + * These alignment constraints are for performance in the vSMP case, + * but in the task_struct case we must also meet hardware imposed + * alignment requirements of the FPU state: + */ #ifdef CONFIG_X86_VSMP # define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT) # define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT) #else -# define ARCH_MIN_TASKALIGN 16 +# define ARCH_MIN_TASKALIGN __alignof__(union fpregs_state) # define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif @@ -166,7 +172,6 @@ extern const struct seq_operations cpuinfo_op; #define cache_line_size() (boot_cpu_data.x86_cache_alignment) extern void cpu_detect(struct cpuinfo_x86 *c); -extern void fpu_detect(struct cpuinfo_x86 *c); extern void early_cpu_init(void); extern void identify_boot_cpu(void); @@ -313,128 +318,6 @@ struct orig_ist { unsigned long ist[7]; }; -#define MXCSR_DEFAULT 0x1f80 - -struct i387_fsave_struct { - u32 cwd; /* FPU Control Word */ - u32 swd; /* FPU Status Word */ - u32 twd; /* FPU Tag Word */ - u32 fip; /* FPU IP Offset */ - u32 fcs; /* FPU IP Selector */ - u32 foo; /* FPU Operand Pointer Offset */ - u32 fos; /* FPU Operand Pointer Selector */ - - /* 8*10 bytes for each FP-reg = 80 bytes: */ - u32 st_space[20]; - - /* Software status information [not touched by FSAVE ]: */ - u32 status; -}; - -struct i387_fxsave_struct { - u16 cwd; /* Control Word */ - u16 swd; /* Status Word */ - u16 twd; /* Tag Word */ - u16 fop; /* Last Instruction Opcode */ - union { - struct { - u64 rip; /* Instruction Pointer */ - u64 rdp; /* Data Pointer */ - }; - struct { - u32 fip; /* FPU IP Offset */ - u32 fcs; /* FPU IP Selector */ - u32 foo; /* FPU Operand Offset */ - u32 fos; /* FPU Operand Selector */ - }; - }; - u32 mxcsr; /* MXCSR Register State */ - u32 mxcsr_mask; /* MXCSR Mask */ - - /* 8*16 bytes for each FP-reg = 128 bytes: */ - u32 st_space[32]; - - /* 16*16 bytes for each XMM-reg = 256 bytes: */ - u32 xmm_space[64]; - - u32 padding[12]; - - union { - u32 padding1[12]; - u32 sw_reserved[12]; - }; - -} __attribute__((aligned(16))); - -struct i387_soft_struct { - u32 cwd; - u32 swd; - u32 twd; - u32 fip; - u32 fcs; - u32 foo; - u32 fos; - /* 8*10 bytes for each FP-reg = 80 bytes: */ - u32 st_space[20]; - u8 ftop; - u8 changed; - u8 lookahead; - u8 no_update; - u8 rm; - u8 alimit; - struct math_emu_info *info; - u32 entry_eip; -}; - -struct ymmh_struct { - /* 16 * 16 bytes for each YMMH-reg = 256 bytes */ - u32 ymmh_space[64]; -}; - -/* We don't support LWP yet: */ -struct lwp_struct { - u8 reserved[128]; -}; - -struct bndreg { - u64 lower_bound; - u64 upper_bound; -} __packed; - -struct bndcsr { - u64 bndcfgu; - u64 bndstatus; -} __packed; - -struct xsave_hdr_struct { - u64 xstate_bv; - u64 xcomp_bv; - u64 reserved[6]; -} __attribute__((packed)); - -struct xsave_struct { - struct i387_fxsave_struct i387; - struct xsave_hdr_struct xsave_hdr; - struct ymmh_struct ymmh; - struct lwp_struct lwp; - struct bndreg bndreg[4]; - struct bndcsr bndcsr; - /* new processor state extensions will go here */ -} __attribute__ ((packed, aligned (64))); - -union thread_xstate { - struct i387_fsave_struct fsave; - struct i387_fxsave_struct fxsave; - struct i387_soft_struct soft; - struct xsave_struct xsave; -}; - -struct fpu { - unsigned int last_cpu; - unsigned int has_fpu; - union thread_xstate *state; -}; - #ifdef CONFIG_X86_64 DECLARE_PER_CPU(struct orig_ist, orig_ist); @@ -483,8 +366,6 @@ DECLARE_PER_CPU(struct irq_stack *, softirq_stack); #endif /* X86_64 */ extern unsigned int xstate_size; -extern void free_thread_xstate(struct task_struct *); -extern struct kmem_cache *task_xstate_cachep; struct perf_event; @@ -508,6 +389,10 @@ struct thread_struct { unsigned long fs; #endif unsigned long gs; + + /* Floating point and extended processor state */ + struct fpu fpu; + /* Save middle states of ptrace breakpoints */ struct perf_event *ptrace_bps[HBP_NUM]; /* Debug status used for traps, single steps, etc... */ @@ -518,8 +403,6 @@ struct thread_struct { unsigned long cr2; unsigned long trap_nr; unsigned long error_code; - /* floating point and extended processor state */ - struct fpu fpu; #ifdef CONFIG_X86_32 /* Virtual 86 mode info */ struct vm86_struct __user *vm86_info; @@ -535,15 +418,6 @@ struct thread_struct { unsigned long iopl; /* Max allowed port in the bitmap, in bytes: */ unsigned io_bitmap_max; - /* - * fpu_counter contains the number of consecutive context switches - * that the FPU is used. If this is over a threshold, the lazy fpu - * saving becomes unlazy to save the trap. This is an unsigned char - * so that after 256 times the counter wraps and the behavior turns - * lazy again; this to deal with bursty apps that only use FPU for - * a short time - */ - unsigned char fpu_counter; }; /* diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h new file mode 100644 index 000000000000..9d51fae1cba3 --- /dev/null +++ b/arch/x86/include/asm/qspinlock.h @@ -0,0 +1,57 @@ +#ifndef _ASM_X86_QSPINLOCK_H +#define _ASM_X86_QSPINLOCK_H + +#include <asm/cpufeature.h> +#include <asm-generic/qspinlock_types.h> +#include <asm/paravirt.h> + +#define queued_spin_unlock queued_spin_unlock +/** + * queued_spin_unlock - release a queued spinlock + * @lock : Pointer to queued spinlock structure + * + * A smp_store_release() on the least-significant byte. + */ +static inline void native_queued_spin_unlock(struct qspinlock *lock) +{ + smp_store_release((u8 *)lock, 0); +} + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); +extern void __pv_init_lock_hash(void); +extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); +extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock); + +static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) +{ + pv_queued_spin_lock_slowpath(lock, val); +} + +static inline void queued_spin_unlock(struct qspinlock *lock) +{ + pv_queued_spin_unlock(lock); +} +#else +static inline void queued_spin_unlock(struct qspinlock *lock) +{ + native_queued_spin_unlock(lock); +} +#endif + +#define virt_queued_spin_lock virt_queued_spin_lock + +static inline bool virt_queued_spin_lock(struct qspinlock *lock) +{ + if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) + return false; + + while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0) + cpu_relax(); + + return true; +} + +#include <asm-generic/qspinlock.h> + +#endif /* _ASM_X86_QSPINLOCK_H */ diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h new file mode 100644 index 000000000000..b002e711ba88 --- /dev/null +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -0,0 +1,6 @@ +#ifndef __ASM_QSPINLOCK_PARAVIRT_H +#define __ASM_QSPINLOCK_PARAVIRT_H + +PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock); + +#endif diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 5a9856eb12ba..7d5a1929d76b 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -231,11 +231,21 @@ #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8) #ifdef __KERNEL__ + +/* + * early_idt_handler_array is an array of entry points referenced in the + * early IDT. For simplicity, it's a real array with one entry point + * every nine bytes. That leaves room for an optional 'push $0' if the + * vector has no error code (two bytes), a 'push $vector_number' (two + * bytes), and a jump to the common entry code (up to five bytes). + */ +#define EARLY_IDT_HANDLER_SIZE 9 + #ifndef __ASSEMBLY__ -extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5]; +extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE]; #ifdef CONFIG_TRACING -# define trace_early_idt_handlers early_idt_handlers +# define trace_early_idt_handler_array early_idt_handler_array #endif /* diff --git a/arch/x86/include/asm/simd.h b/arch/x86/include/asm/simd.h index ee80b92f0096..6c8a7ed13365 100644 --- a/arch/x86/include/asm/simd.h +++ b/arch/x86/include/asm/simd.h @@ -1,5 +1,5 @@ -#include <asm/i387.h> +#include <asm/fpu/api.h> /* * may_use_simd - whether it is allowable at this time to issue SIMD diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index aeb4666e0c0a..2270e41b32fd 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -215,6 +215,44 @@ static inline void clwb(volatile void *__p) : [pax] "a" (p)); } +/** + * pcommit_sfence() - persistent commit and fence + * + * The PCOMMIT instruction ensures that data that has been flushed from the + * processor's cache hierarchy with CLWB, CLFLUSHOPT or CLFLUSH is accepted to + * memory and is durable on the DIMM. The primary use case for this is + * persistent memory. + * + * This function shows how to properly use CLWB/CLFLUSHOPT/CLFLUSH and PCOMMIT + * with appropriate fencing. + * + * Example: + * void flush_and_commit_buffer(void *vaddr, unsigned int size) + * { + * unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1; + * void *vend = vaddr + size; + * void *p; + * + * for (p = (void *)((unsigned long)vaddr & ~clflush_mask); + * p < vend; p += boot_cpu_data.x86_clflush_size) + * clwb(p); + * + * // SFENCE to order CLWB/CLFLUSHOPT/CLFLUSH cache flushes + * // MFENCE via mb() also works + * wmb(); + * + * // PCOMMIT and the required SFENCE for ordering + * pcommit_sfence(); + * } + * + * After this function completes the data pointed to by 'vaddr' has been + * accepted to memory and will be durable if the 'vaddr' points to persistent + * memory. + * + * PCOMMIT must always be ordered by an MFENCE or SFENCE, so to help simplify + * things we include both the PCOMMIT and the required SFENCE in the + * alternatives generated by pcommit_sfence(). + */ static inline void pcommit_sfence(void) { alternative(ASM_NOP7, diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 64b611782ef0..be0a05913b91 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -42,6 +42,10 @@ extern struct static_key paravirt_ticketlocks_enabled; static __always_inline bool static_key_false(struct static_key *key); +#ifdef CONFIG_QUEUED_SPINLOCKS +#include <asm/qspinlock.h> +#else + #ifdef CONFIG_PARAVIRT_SPINLOCKS static inline void __ticket_enter_slowpath(arch_spinlock_t *lock) @@ -196,6 +200,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) cpu_relax(); } } +#endif /* CONFIG_QUEUED_SPINLOCKS */ /* * Read-write spinlocks, allowing multiple readers diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index 5f9d7572d82b..65c3e37f879a 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -23,6 +23,9 @@ typedef u32 __ticketpair_t; #define TICKET_SHIFT (sizeof(__ticket_t) * 8) +#ifdef CONFIG_QUEUED_SPINLOCKS +#include <asm-generic/qspinlock_types.h> +#else typedef struct arch_spinlock { union { __ticketpair_t head_tail; @@ -33,6 +36,7 @@ typedef struct arch_spinlock { } arch_spinlock_t; #define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } } +#endif /* CONFIG_QUEUED_SPINLOCKS */ #include <asm-generic/qrwlock_types.h> diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 6a998598f172..c2e00bb2a136 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -39,7 +39,9 @@ #include <asm/processor.h> #include <asm/percpu.h> #include <asm/desc.h> + #include <linux/random.h> +#include <linux/sched.h> /* * 24 byte read-only segment initializer for stack canary. Linker diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index 552d6c90a6d4..d1793f06854d 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -7,7 +7,7 @@ #define _ASM_X86_SUSPEND_32_H #include <asm/desc.h> -#include <asm/i387.h> +#include <asm/fpu/api.h> /* image of the saved processor state */ struct saved_context { diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h index bc6232834bab..7ebf0ebe4e68 100644 --- a/arch/x86/include/asm/suspend_64.h +++ b/arch/x86/include/asm/suspend_64.h @@ -7,7 +7,7 @@ #define _ASM_X86_SUSPEND_64_H #include <asm/desc.h> -#include <asm/i387.h> +#include <asm/fpu/api.h> /* * Image of the saved processor state, used by the low level ACPI suspend to diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index b4bdec3e9523..225ee545e1a0 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -177,8 +177,6 @@ struct thread_info { */ #ifndef __ASSEMBLY__ -DECLARE_PER_CPU(unsigned long, kernel_stack); - static inline struct thread_info *current_thread_info(void) { return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE); @@ -197,9 +195,13 @@ static inline unsigned long current_stack_pointer(void) #else /* !__ASSEMBLY__ */ +#ifdef CONFIG_X86_64 +# define cpu_current_top_of_stack (cpu_tss + TSS_sp0) +#endif + /* Load thread_info address into "reg" */ #define GET_THREAD_INFO(reg) \ - _ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \ + _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \ _ASM_SUB $(THREAD_SIZE),reg ; /* diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 4cab890007a7..38a09a13a9bc 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -101,6 +101,12 @@ DEFINE_IRQ_VECTOR_EVENT(call_function_single); DEFINE_IRQ_VECTOR_EVENT(threshold_apic); /* + * deferred_error_apic - called when entering/exiting a deferred apic interrupt + * vector handler + */ +DEFINE_IRQ_VECTOR_EVENT(deferred_error_apic); + +/* * thermal_apic - called when entering/exiting a thermal apic interrupt * vector handler */ diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 4e49d7dff78e..c5380bea2a36 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -108,7 +108,8 @@ extern int panic_on_unrecovered_nmi; void math_emulate(struct math_emu_info *); #ifndef CONFIG_X86_32 asmlinkage void smp_thermal_interrupt(void); -asmlinkage void mce_threshold_interrupt(void); +asmlinkage void smp_threshold_interrupt(void); +asmlinkage void smp_deferred_error_interrupt(void); #endif extern enum ctx_state ist_enter(struct pt_regs *regs); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index ace9dec050b1..a8df874f3e88 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -74,7 +74,8 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un * @addr: User space pointer to start of block to check * @size: Size of block to check * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Checks if a pointer to a block of memory in user space is valid. * @@ -145,7 +146,8 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) * @x: Variable to store result. * @ptr: Source address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple variable from user space to kernel * space. It supports simple types like char and int, but not larger @@ -240,7 +242,8 @@ extern void __put_user_8(void); * @x: Value to copy to user space. * @ptr: Destination address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple value from kernel space to user * space. It supports simple types like char and int, but not larger @@ -455,7 +458,8 @@ struct __large_struct { unsigned long buf[100]; }; * @x: Variable to store result. * @ptr: Source address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple variable from user space to kernel * space. It supports simple types like char and int, but not larger @@ -479,7 +483,8 @@ struct __large_struct { unsigned long buf[100]; }; * @x: Value to copy to user space. * @ptr: Destination address, in user space. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * This macro copies a single simple value from kernel space to user * space. It supports simple types like char and int, but not larger diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 3c03a5de64d3..f5dcb5204dcd 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -59,6 +59,10 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) __put_user_size(*(u32 *)from, (u32 __user *)to, 4, ret, 4); return ret; + case 8: + __put_user_size(*(u64 *)from, (u64 __user *)to, + 8, ret, 8); + return ret; } } return __copy_to_user_ll(to, from, n); @@ -70,7 +74,8 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) * @from: Source address, in kernel space. * @n: Number of bytes to copy. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Copy data from kernel space to user space. Caller must check * the specified block with access_ok() before calling this function. @@ -117,7 +122,8 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) * @from: Source address, in user space. * @n: Number of bytes to copy. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Copy data from user space to kernel space. Caller must check * the specified block with access_ok() before calling this function. diff --git a/arch/x86/include/asm/user.h b/arch/x86/include/asm/user.h index ccab4af1646d..59a54e869f15 100644 --- a/arch/x86/include/asm/user.h +++ b/arch/x86/include/asm/user.h @@ -14,8 +14,8 @@ struct user_ymmh_regs { __u32 ymmh_space[64]; }; -struct user_xsave_hdr { - __u64 xstate_bv; +struct user_xstate_header { + __u64 xfeatures; __u64 reserved1[2]; __u64 reserved2[5]; }; @@ -41,11 +41,11 @@ struct user_xsave_hdr { * particular process/thread. * * Also when the user modifies certain state FP/SSE/etc through the - * ptrace interface, they must ensure that the xsave_hdr.xstate_bv + * ptrace interface, they must ensure that the header.xfeatures * bytes[512..519] of the memory layout are updated correspondingly. * i.e., for example when FP state is modified to a non-init state, - * xsave_hdr.xstate_bv's bit 0 must be set to '1', when SSE is modified to - * non-init state, xsave_hdr.xstate_bv's bit 1 must to be set to '1', etc. + * header.xfeatures's bit 0 must be set to '1', when SSE is modified to + * non-init state, header.xfeatures's bit 1 must to be set to '1', etc. */ #define USER_XSTATE_FX_SW_WORDS 6 #define USER_XSTATE_XCR0_WORD 0 @@ -55,7 +55,7 @@ struct user_xstateregs { __u64 fpx_space[58]; __u64 xstate_fx_sw[USER_XSTATE_FX_SW_WORDS]; } i387; - struct user_xsave_hdr xsave_hdr; + struct user_xstate_header header; struct user_ymmh_regs ymmh; /* further processor state extensions go here */ }; diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index f58a9c7a3c86..48d34d28f5a6 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -171,38 +171,17 @@ struct x86_platform_ops { }; struct pci_dev; -struct msi_msg; struct x86_msi_ops { int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type); - void (*compose_msi_msg)(struct pci_dev *dev, unsigned int irq, - unsigned int dest, struct msi_msg *msg, - u8 hpet_id); void (*teardown_msi_irq)(unsigned int irq); void (*teardown_msi_irqs)(struct pci_dev *dev); void (*restore_msi_irqs)(struct pci_dev *dev); - int (*setup_hpet_msi)(unsigned int irq, unsigned int id); }; -struct IO_APIC_route_entry; -struct io_apic_irq_attr; -struct irq_data; -struct cpumask; - struct x86_io_apic_ops { - void (*init) (void); unsigned int (*read) (unsigned int apic, unsigned int reg); - void (*write) (unsigned int apic, unsigned int reg, unsigned int value); - void (*modify) (unsigned int apic, unsigned int reg, unsigned int value); void (*disable)(void); - void (*print_entries)(unsigned int apic, unsigned int nr_entries); - int (*set_affinity)(struct irq_data *data, - const struct cpumask *mask, - bool force); - int (*setup_entry)(int irq, struct IO_APIC_route_entry *entry, - unsigned int destination, int vector, - struct io_apic_irq_attr *attr); - void (*eoi_ioapic_pin)(int apic, int pin, int vector); }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/include/asm/xcr.h b/arch/x86/include/asm/xcr.h deleted file mode 100644 index f2cba4e79a23..000000000000 --- a/arch/x86/include/asm/xcr.h +++ /dev/null @@ -1,49 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright 2008 rPath, Inc. - All Rights Reserved - * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2 or (at your - * option) any later version; incorporated herein by reference. - * - * ----------------------------------------------------------------------- */ - -/* - * asm-x86/xcr.h - * - * Definitions for the eXtended Control Register instructions - */ - -#ifndef _ASM_X86_XCR_H -#define _ASM_X86_XCR_H - -#define XCR_XFEATURE_ENABLED_MASK 0x00000000 - -#ifdef __KERNEL__ -# ifndef __ASSEMBLY__ - -#include <linux/types.h> - -static inline u64 xgetbv(u32 index) -{ - u32 eax, edx; - - asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */ - : "=a" (eax), "=d" (edx) - : "c" (index)); - return eax + ((u64)edx << 32); -} - -static inline void xsetbv(u32 index, u64 value) -{ - u32 eax = value; - u32 edx = value >> 32; - - asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */ - : : "a" (eax), "d" (edx), "c" (index)); -} - -# endif /* __ASSEMBLY__ */ -#endif /* __KERNEL__ */ - -#endif /* _ASM_X86_XCR_H */ diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h index d8829751b3f8..1f5c5161ead6 100644 --- a/arch/x86/include/asm/xor.h +++ b/arch/x86/include/asm/xor.h @@ -36,7 +36,7 @@ * no advantages to be gotten from x86-64 here anyways. */ -#include <asm/i387.h> +#include <asm/fpu/api.h> #ifdef CONFIG_X86_32 /* reduce register pressure */ diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h index ce05722e3c68..5a08bc8bff33 100644 --- a/arch/x86/include/asm/xor_32.h +++ b/arch/x86/include/asm/xor_32.h @@ -26,7 +26,7 @@ #define XO3(x, y) " pxor 8*("#x")(%4), %%mm"#y" ;\n" #define XO4(x, y) " pxor 8*("#x")(%5), %%mm"#y" ;\n" -#include <asm/i387.h> +#include <asm/fpu/api.h> static void xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h index 492b29802f57..7c0a517ec751 100644 --- a/arch/x86/include/asm/xor_avx.h +++ b/arch/x86/include/asm/xor_avx.h @@ -18,7 +18,7 @@ #ifdef CONFIG_AS_AVX #include <linux/compiler.h> -#include <asm/i387.h> +#include <asm/fpu/api.h> #define BLOCK4(i) \ BLOCK(32 * i, 0) \ diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h deleted file mode 100644 index c9a6d68b8d62..000000000000 --- a/arch/x86/include/asm/xsave.h +++ /dev/null @@ -1,257 +0,0 @@ -#ifndef __ASM_X86_XSAVE_H -#define __ASM_X86_XSAVE_H - -#include <linux/types.h> -#include <asm/processor.h> - -#define XSTATE_CPUID 0x0000000d - -#define XSTATE_FP 0x1 -#define XSTATE_SSE 0x2 -#define XSTATE_YMM 0x4 -#define XSTATE_BNDREGS 0x8 -#define XSTATE_BNDCSR 0x10 -#define XSTATE_OPMASK 0x20 -#define XSTATE_ZMM_Hi256 0x40 -#define XSTATE_Hi16_ZMM 0x80 - -#define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) -#define XSTATE_AVX512 (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) -/* Bit 63 of XCR0 is reserved for future expansion */ -#define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63))) - -#define FXSAVE_SIZE 512 - -#define XSAVE_HDR_SIZE 64 -#define XSAVE_HDR_OFFSET FXSAVE_SIZE - -#define XSAVE_YMM_SIZE 256 -#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) - -/* Supported features which support lazy state saving */ -#define XSTATE_LAZY (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ - | XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) - -/* Supported features which require eager state saving */ -#define XSTATE_EAGER (XSTATE_BNDREGS | XSTATE_BNDCSR) - -/* All currently supported features */ -#define XCNTXT_MASK (XSTATE_LAZY | XSTATE_EAGER) - -#ifdef CONFIG_X86_64 -#define REX_PREFIX "0x48, " -#else -#define REX_PREFIX -#endif - -extern unsigned int xstate_size; -extern u64 pcntxt_mask; -extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; -extern struct xsave_struct *init_xstate_buf; - -extern void xsave_init(void); -extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); -extern int init_fpu(struct task_struct *child); - -/* These macros all use (%edi)/(%rdi) as the single memory argument. */ -#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" -#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" -#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" -#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" -#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" - -#define xstate_fault ".section .fixup,\"ax\"\n" \ - "3: movl $-1,%[err]\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err) - -/* - * This function is called only during boot time when x86 caps are not set - * up and alternative can not be used yet. - */ -static inline int xsave_state_booting(struct xsave_struct *fx, u64 mask) -{ - u32 lmask = mask; - u32 hmask = mask >> 32; - int err = 0; - - WARN_ON(system_state != SYSTEM_BOOTING); - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - asm volatile("1:"XSAVES"\n\t" - "2:\n\t" - xstate_fault - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - else - asm volatile("1:"XSAVE"\n\t" - "2:\n\t" - xstate_fault - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - return err; -} - -/* - * This function is called only during boot time when x86 caps are not set - * up and alternative can not be used yet. - */ -static inline int xrstor_state_booting(struct xsave_struct *fx, u64 mask) -{ - u32 lmask = mask; - u32 hmask = mask >> 32; - int err = 0; - - WARN_ON(system_state != SYSTEM_BOOTING); - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - asm volatile("1:"XRSTORS"\n\t" - "2:\n\t" - xstate_fault - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - else - asm volatile("1:"XRSTOR"\n\t" - "2:\n\t" - xstate_fault - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - return err; -} - -/* - * Save processor xstate to xsave area. - */ -static inline int xsave_state(struct xsave_struct *fx, u64 mask) -{ - u32 lmask = mask; - u32 hmask = mask >> 32; - int err = 0; - - /* - * If xsaves is enabled, xsaves replaces xsaveopt because - * it supports compact format and supervisor states in addition to - * modified optimization in xsaveopt. - * - * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave - * because xsaveopt supports modified optimization which is not - * supported by xsave. - * - * If none of xsaves and xsaveopt is enabled, use xsave. - */ - alternative_input_2( - "1:"XSAVE, - XSAVEOPT, - X86_FEATURE_XSAVEOPT, - XSAVES, - X86_FEATURE_XSAVES, - [fx] "D" (fx), "a" (lmask), "d" (hmask) : - "memory"); - asm volatile("2:\n\t" - xstate_fault - : "0" (0) - : "memory"); - - return err; -} - -/* - * Restore processor xstate from xsave area. - */ -static inline int xrstor_state(struct xsave_struct *fx, u64 mask) -{ - int err = 0; - u32 lmask = mask; - u32 hmask = mask >> 32; - - /* - * Use xrstors to restore context if it is enabled. xrstors supports - * compacted format of xsave area which is not supported by xrstor. - */ - alternative_input( - "1: " XRSTOR, - XRSTORS, - X86_FEATURE_XSAVES, - "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - - asm volatile("2:\n" - xstate_fault - : "0" (0) - : "memory"); - - return err; -} - -/* - * Save xstate context for old process during context switch. - */ -static inline void fpu_xsave(struct fpu *fpu) -{ - xsave_state(&fpu->state->xsave, -1); -} - -/* - * Restore xstate context for new process during context switch. - */ -static inline int fpu_xrstor_checking(struct xsave_struct *fx) -{ - return xrstor_state(fx, -1); -} - -/* - * Save xstate to user space xsave area. - * - * We don't use modified optimization because xrstor/xrstors might track - * a different application. - * - * We don't use compacted format xsave area for - * backward compatibility for old applications which don't understand - * compacted format of xsave area. - */ -static inline int xsave_user(struct xsave_struct __user *buf) -{ - int err; - - /* - * Clear the xsave header first, so that reserved fields are - * initialized to zero. - */ - err = __clear_user(&buf->xsave_hdr, sizeof(buf->xsave_hdr)); - if (unlikely(err)) - return -EFAULT; - - __asm__ __volatile__(ASM_STAC "\n" - "1:"XSAVE"\n" - "2: " ASM_CLAC "\n" - xstate_fault - : "D" (buf), "a" (-1), "d" (-1), "0" (0) - : "memory"); - return err; -} - -/* - * Restore xstate from user space xsave area. - */ -static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask) -{ - int err = 0; - struct xsave_struct *xstate = ((__force struct xsave_struct *)buf); - u32 lmask = mask; - u32 hmask = mask >> 32; - - __asm__ __volatile__(ASM_STAC "\n" - "1:"XRSTOR"\n" - "2: " ASM_CLAC "\n" - xstate_fault - : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0) - : "memory"); /* memory required? */ - return err; -} - -void *get_xsave_addr(struct xsave_struct *xsave, int xstate); -void setup_xstate_comp(void); - -#endif diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index 16dc4e8a2cd3..0e8a973de9ee 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -25,7 +25,7 @@ struct _fpx_sw_bytes { __u32 extended_size; /* total size of the layout referred by * fpstate pointer in the sigcontext. */ - __u64 xstate_bv; + __u64 xfeatures; /* feature bit mask (including fp/sse/extended * state) that is present in the memory * layout. @@ -209,8 +209,8 @@ struct sigcontext { #endif /* !__i386__ */ -struct _xsave_hdr { - __u64 xstate_bv; +struct _header { + __u64 xfeatures; __u64 reserved1[2]; __u64 reserved2[5]; }; @@ -228,7 +228,7 @@ struct _ymmh_state { */ struct _xstate { struct _fpstate fpstate; - struct _xsave_hdr xstate_hdr; + struct _header xstate_hdr; struct _ymmh_state ymmh; /* new processor state extensions go here */ }; diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9bcd0b56ca17..febaf180621b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -44,7 +44,7 @@ obj-y += pci-iommu_table.o obj-y += resource.o obj-y += process.o -obj-y += i387.o xsave.o +obj-y += fpu/ obj-y += ptrace.o obj-$(CONFIG_X86_32) += tls.o obj-$(CONFIG_IA32_EMULATION) += tls.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index dbe76a14c3c9..e49ee24da85e 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -31,12 +31,12 @@ #include <linux/module.h> #include <linux/dmi.h> #include <linux/irq.h> -#include <linux/irqdomain.h> #include <linux/slab.h> #include <linux/bootmem.h> #include <linux/ioport.h> #include <linux/pci.h> +#include <asm/irqdomain.h> #include <asm/pci_x86.h> #include <asm/pgtable.h> #include <asm/io_apic.h> @@ -400,57 +400,13 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, return 0; } -static int mp_register_gsi(struct device *dev, u32 gsi, int trigger, - int polarity) -{ - int irq, node; - - if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) - return gsi; - - trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1; - polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1; - node = dev ? dev_to_node(dev) : NUMA_NO_NODE; - if (mp_set_gsi_attr(gsi, trigger, polarity, node)) { - pr_warn("Failed to set pin attr for GSI%d\n", gsi); - return -1; - } - - irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC); - if (irq < 0) - return irq; - - /* Don't set up the ACPI SCI because it's already set up */ - if (enable_update_mptable && acpi_gbl_FADT.sci_interrupt != gsi) - mp_config_acpi_gsi(dev, gsi, trigger, polarity); - - return irq; -} - -static void mp_unregister_gsi(u32 gsi) -{ - int irq; - - if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) - return; - - irq = mp_map_gsi_to_irq(gsi, 0); - if (irq > 0) - mp_unmap_irq(irq); -} - -static struct irq_domain_ops acpi_irqdomain_ops = { - .map = mp_irqdomain_map, - .unmap = mp_irqdomain_unmap, -}; - static int __init acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end) { struct acpi_madt_io_apic *ioapic = NULL; struct ioapic_domain_cfg cfg = { .type = IOAPIC_DOMAIN_DYNAMIC, - .ops = &acpi_irqdomain_ops, + .ops = &mp_ioapic_irqdomain_ops, }; ioapic = (struct acpi_madt_io_apic *)header; @@ -652,7 +608,7 @@ static int acpi_register_gsi_pic(struct device *dev, u32 gsi, * Make sure all (legacy) PCI IRQs are set as level-triggered. */ if (trigger == ACPI_LEVEL_SENSITIVE) - eisa_set_level_irq(gsi); + elcr_set_level_irq(gsi); #endif return gsi; @@ -663,10 +619,21 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, int trigger, int polarity) { int irq = gsi; - #ifdef CONFIG_X86_IO_APIC + int node; + struct irq_alloc_info info; + + node = dev ? dev_to_node(dev) : NUMA_NO_NODE; + trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1; + polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1; + ioapic_set_alloc_attr(&info, node, trigger, polarity); + mutex_lock(&acpi_ioapic_lock); - irq = mp_register_gsi(dev, gsi, trigger, polarity); + irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); + /* Don't set up the ACPI SCI because it's already set up */ + if (irq >= 0 && enable_update_mptable && + acpi_gbl_FADT.sci_interrupt != gsi) + mp_config_acpi_gsi(dev, gsi, trigger, polarity); mutex_unlock(&acpi_ioapic_lock); #endif @@ -676,8 +643,12 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, static void acpi_unregister_gsi_ioapic(u32 gsi) { #ifdef CONFIG_X86_IO_APIC + int irq; + mutex_lock(&acpi_ioapic_lock); - mp_unregister_gsi(gsi); + irq = mp_map_gsi_to_irq(gsi, 0, NULL); + if (irq > 0) + mp_unmap_irq(irq); mutex_unlock(&acpi_ioapic_lock); #endif } @@ -786,7 +757,7 @@ int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base) u64 addr; struct ioapic_domain_cfg cfg = { .type = IOAPIC_DOMAIN_DYNAMIC, - .ops = &acpi_irqdomain_ops, + .ops = &mp_ioapic_irqdomain_ops, }; ioapic_id = acpi_get_ioapic_id(handle, gsi_base, &addr); diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index ae693b51ed8e..8c35df468104 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -62,7 +62,7 @@ ENTRY(do_suspend_lowlevel) pushfq popq pt_regs_flags(%rax) - movq $resume_point, saved_rip(%rip) + movq $.Lresume_point, saved_rip(%rip) movq %rsp, saved_rsp movq %rbp, saved_rbp @@ -75,10 +75,10 @@ ENTRY(do_suspend_lowlevel) xorl %eax, %eax call x86_acpi_enter_sleep_state /* in case something went wrong, restore the machine status and go on */ - jmp resume_point + jmp .Lresume_point .align 4 -resume_point: +.Lresume_point: /* We don't restore %rax, it must be 0 anyway */ movq $saved_context, %rax movq saved_context_cr4(%rax), %rbx diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index aef653193160..c42827eb86cf 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -21,6 +21,10 @@ #include <asm/io.h> #include <asm/fixmap.h> +int __read_mostly alternatives_patched; + +EXPORT_SYMBOL_GPL(alternatives_patched); + #define MAX_PATCH_LEN (255-1) static int __initdata_or_module debug_alternative; @@ -227,6 +231,15 @@ void __init arch_init_ideal_nops(void) #endif } break; + + case X86_VENDOR_AMD: + if (boot_cpu_data.x86 > 0xf) { + ideal_nops = p6_nops; + return; + } + + /* fall through */ + default: #ifdef CONFIG_X86_64 ideal_nops = k8_nops; @@ -627,6 +640,7 @@ void __init alternative_instructions(void) apply_paravirt(__parainstructions, __parainstructions_end); restart_nmi(); + alternatives_patched = 1; } /** diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 5caed1dd7ccf..29fa475ec518 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -89,9 +89,7 @@ int amd_cache_northbridges(void) next_northbridge(link, amd_nb_link_ids); } - /* GART present only on Fam15h upto model 0fh */ - if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || - (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10)) + if (amd_gart_present()) amd_northbridges.flags |= AMD_NB_GART; /* diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 6a7c23ff21d3..ede92c3364d3 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -171,10 +171,6 @@ static int __init apbt_clockevent_register(void) static void apbt_setup_irq(struct apbt_dev *adev) { - /* timer0 irq has been setup early */ - if (adev->irq == 0) - return; - irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); } diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 76164e173a24..6e85f713641d 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -262,6 +262,9 @@ void __init early_gart_iommu_check(void) u64 aper_base = 0, last_aper_base = 0; int aper_enabled = 0, last_aper_enabled = 0, last_valid = 0; + if (!amd_gart_present()) + return; + if (!early_pci_allowed()) return; @@ -355,6 +358,9 @@ int __init gart_iommu_hole_init(void) int fix, slot, valid_agp = 0; int i, node; + if (!amd_gart_present()) + return -ENODEV; + if (gart_iommu_aperture_disabled || !fix_aperture || !early_pci_allowed()) return -ENODEV; @@ -452,7 +458,7 @@ out: force_iommu || valid_agp || fallback_aper_force) { - pr_info("Your BIOS doesn't leave a aperture memory hole\n"); + pr_info("Your BIOS doesn't leave an aperture memory hole\n"); pr_info("Please enable the IOMMU option in the BIOS setup\n"); pr_info("This costs you %dMB of RAM\n", 32 << fallback_aper_order); diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c index 816f36e979ad..ae50d3454d78 100644 --- a/arch/x86/kernel/apic/htirq.c +++ b/arch/x86/kernel/apic/htirq.c @@ -3,6 +3,8 @@ * * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo * Moved from arch/x86/kernel/apic/io_apic.c. + * Jiang Liu <jiang.liu@linux.intel.com> + * Add support of hierarchical irqdomain * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -14,78 +16,112 @@ #include <linux/device.h> #include <linux/pci.h> #include <linux/htirq.h> +#include <asm/irqdomain.h> #include <asm/hw_irq.h> #include <asm/apic.h> #include <asm/hypertransport.h> +static struct irq_domain *htirq_domain; + /* * Hypertransport interrupt support */ -static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) -{ - struct ht_irq_msg msg; - - fetch_ht_irq_msg(irq, &msg); - - msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); - msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); - - msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); - msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); - - write_ht_irq_msg(irq, &msg); -} - static int ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { - struct irq_cfg *cfg = irqd_cfg(data); - unsigned int dest; + struct irq_data *parent = data->parent_data; int ret; - ret = apic_set_affinity(data, mask, &dest); - if (ret) - return ret; - - target_ht_irq(data->irq, dest, cfg->vector); - return IRQ_SET_MASK_OK_NOCOPY; + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret >= 0) { + struct ht_irq_msg msg; + struct irq_cfg *cfg = irqd_cfg(data); + + fetch_ht_irq_msg(data->irq, &msg); + msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | + HT_IRQ_LOW_DEST_ID_MASK); + msg.address_lo |= HT_IRQ_LOW_VECTOR(cfg->vector) | + HT_IRQ_LOW_DEST_ID(cfg->dest_apicid); + msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); + msg.address_hi |= HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid); + write_ht_irq_msg(data->irq, &msg); + } + + return ret; } static struct irq_chip ht_irq_chip = { .name = "PCI-HT", .irq_mask = mask_ht_irq, .irq_unmask = unmask_ht_irq, - .irq_ack = apic_ack_edge, + .irq_ack = irq_chip_ack_parent, .irq_set_affinity = ht_set_affinity, - .irq_retrigger = apic_retrigger_irq, + .irq_retrigger = irq_chip_retrigger_hierarchy, .flags = IRQCHIP_SKIP_SET_WAKE, }; -int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) +static int htirq_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) { - struct irq_cfg *cfg; - struct ht_irq_msg msg; - unsigned dest; - int err; + struct ht_irq_cfg *ht_cfg; + struct irq_alloc_info *info = arg; + struct pci_dev *dev; + irq_hw_number_t hwirq; + int ret; - if (disable_apic) - return -ENXIO; + if (nr_irqs > 1 || !info) + return -EINVAL; - cfg = irq_cfg(irq); - err = assign_irq_vector(irq, cfg, apic->target_cpus()); - if (err) - return err; + dev = info->ht_dev; + hwirq = (info->ht_idx & 0xFF) | + PCI_DEVID(dev->bus->number, dev->devfn) << 8 | + (pci_domain_nr(dev->bus) & 0xFFFFFFFF) << 24; + if (irq_find_mapping(domain, hwirq) > 0) + return -EEXIST; - err = apic->cpu_mask_to_apicid_and(cfg->domain, - apic->target_cpus(), &dest); - if (err) - return err; + ht_cfg = kmalloc(sizeof(*ht_cfg), GFP_KERNEL); + if (!ht_cfg) + return -ENOMEM; - msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); + if (ret < 0) { + kfree(ht_cfg); + return ret; + } + + /* Initialize msg to a value that will never match the first write. */ + ht_cfg->msg.address_lo = 0xffffffff; + ht_cfg->msg.address_hi = 0xffffffff; + ht_cfg->dev = info->ht_dev; + ht_cfg->update = info->ht_update; + ht_cfg->pos = info->ht_pos; + ht_cfg->idx = 0x10 + (info->ht_idx * 2); + irq_domain_set_info(domain, virq, hwirq, &ht_irq_chip, ht_cfg, + handle_edge_irq, ht_cfg, "edge"); + + return 0; +} + +static void htirq_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); + + BUG_ON(nr_irqs != 1); + kfree(irq_data->chip_data); + irq_domain_free_irqs_top(domain, virq, nr_irqs); +} +static void htirq_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct ht_irq_msg msg; + struct irq_cfg *cfg = irqd_cfg(irq_data); + + msg.address_hi = HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid); msg.address_lo = HT_IRQ_LOW_BASE | - HT_IRQ_LOW_DEST_ID(dest) | + HT_IRQ_LOW_DEST_ID(cfg->dest_apicid) | HT_IRQ_LOW_VECTOR(cfg->vector) | ((apic->irq_dest_mode == 0) ? HT_IRQ_LOW_DM_PHYSICAL : @@ -95,13 +131,56 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) HT_IRQ_LOW_MT_FIXED : HT_IRQ_LOW_MT_ARBITRATED) | HT_IRQ_LOW_IRQ_MASKED; + write_ht_irq_msg(irq_data->irq, &msg); +} - write_ht_irq_msg(irq, &msg); +static void htirq_domain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct ht_irq_msg msg; - irq_set_chip_and_handler_name(irq, &ht_irq_chip, - handle_edge_irq, "edge"); + memset(&msg, 0, sizeof(msg)); + write_ht_irq_msg(irq_data->irq, &msg); +} - dev_dbg(&dev->dev, "irq %d for HT\n", irq); +static const struct irq_domain_ops htirq_domain_ops = { + .alloc = htirq_domain_alloc, + .free = htirq_domain_free, + .activate = htirq_domain_activate, + .deactivate = htirq_domain_deactivate, +}; - return 0; +void arch_init_htirq_domain(struct irq_domain *parent) +{ + if (disable_apic) + return; + + htirq_domain = irq_domain_add_tree(NULL, &htirq_domain_ops, NULL); + if (!htirq_domain) + pr_warn("failed to initialize irqdomain for HTIRQ.\n"); + else + htirq_domain->parent = parent; +} + +int arch_setup_ht_irq(int idx, int pos, struct pci_dev *dev, + ht_irq_update_t *update) +{ + struct irq_alloc_info info; + + if (!htirq_domain) + return -ENOSYS; + + init_irq_alloc_info(&info, NULL); + info.ht_idx = idx; + info.ht_pos = pos; + info.ht_dev = dev; + info.ht_update = update; + + return irq_domain_alloc_irqs(htirq_domain, 1, dev_to_node(&dev->dev), + &info); +} + +void arch_teardown_ht_irq(unsigned int irq) +{ + irq_domain_free_irqs(irq, 1); } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index f4dc2462a1ac..845dc0df2002 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -18,6 +18,16 @@ * and Rolf G. Tews * for testing these extensively * Paul Diefenbaugh : Added full ACPI support + * + * Historical information which is worth to be preserved: + * + * - SiS APIC rmw bug: + * + * We used to have a workaround for a bug in SiS chips which + * required to rewrite the index register for a read-modify-write + * operation as the chip lost the index information which was + * setup for the read already. We cache the data now, so that + * workaround has been removed. */ #include <linux/mm.h> @@ -31,13 +41,13 @@ #include <linux/acpi.h> #include <linux/module.h> #include <linux/syscore_ops.h> -#include <linux/irqdomain.h> #include <linux/freezer.h> #include <linux/kthread.h> #include <linux/jiffies.h> /* time_after() */ #include <linux/slab.h> #include <linux/bootmem.h> +#include <asm/irqdomain.h> #include <asm/idle.h> #include <asm/io.h> #include <asm/smp.h> @@ -63,27 +73,31 @@ #define for_each_ioapic_pin(idx, pin) \ for_each_ioapic((idx)) \ for_each_pin((idx), (pin)) - #define for_each_irq_pin(entry, head) \ list_for_each_entry(entry, &head, list) -/* - * Is the SiS APIC rmw bug present ? - * -1 = don't know, 0 = no, 1 = yes - */ -int sis_apic_bug = -1; - static DEFINE_RAW_SPINLOCK(ioapic_lock); static DEFINE_MUTEX(ioapic_mutex); static unsigned int ioapic_dynirq_base; static int ioapic_initialized; -struct mp_pin_info { +struct irq_pin_list { + struct list_head list; + int apic, pin; +}; + +struct mp_chip_data { + struct list_head irq_2_pin; + struct IO_APIC_route_entry entry; int trigger; int polarity; - int node; - int set; u32 count; + bool isa_irq; +}; + +struct mp_ioapic_gsi { + u32 gsi_base; + u32 gsi_end; }; static struct ioapic { @@ -101,7 +115,6 @@ static struct ioapic { struct mp_ioapic_gsi gsi_config; struct ioapic_domain_cfg irqdomain_cfg; struct irq_domain *irqdomain; - struct mp_pin_info *pin_info; struct resource *iomem_res; } ioapics[MAX_IO_APICS]; @@ -117,7 +130,7 @@ unsigned int mpc_ioapic_addr(int ioapic_idx) return ioapics[ioapic_idx].mp_config.apicaddr; } -struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx) +static inline struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx) { return &ioapics[ioapic_idx].gsi_config; } @@ -129,11 +142,16 @@ static inline int mp_ioapic_pin_count(int ioapic) return gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1; } -u32 mp_pin_to_gsi(int ioapic, int pin) +static inline u32 mp_pin_to_gsi(int ioapic, int pin) { return mp_ioapic_gsi_routing(ioapic)->gsi_base + pin; } +static inline bool mp_is_legacy_irq(int irq) +{ + return irq >= 0 && irq < nr_legacy_irqs(); +} + /* * Initialize all legacy IRQs and all pins on the first IOAPIC * if we have legacy interrupt controller. Kernel boot option "pirq=" @@ -144,12 +162,7 @@ static inline int mp_init_irq_at_boot(int ioapic, int irq) if (!nr_legacy_irqs()) return 0; - return ioapic == 0 || (irq >= 0 && irq < nr_legacy_irqs()); -} - -static inline struct mp_pin_info *mp_pin_info(int ioapic_idx, int pin) -{ - return ioapics[ioapic_idx].pin_info + pin; + return ioapic == 0 || mp_is_legacy_irq(irq); } static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic) @@ -216,16 +229,6 @@ void mp_save_irq(struct mpc_intsrc *m) panic("Max # of irq sources exceeded!!\n"); } -struct irq_pin_list { - struct list_head list; - int apic, pin; -}; - -static struct irq_pin_list *alloc_irq_pin_list(int node) -{ - return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node); -} - static void alloc_ioapic_saved_registers(int idx) { size_t size; @@ -247,8 +250,7 @@ static void free_ioapic_saved_registers(int idx) int __init arch_early_ioapic_init(void) { - struct irq_cfg *cfg; - int i, node = cpu_to_node(0); + int i; if (!nr_legacy_irqs()) io_apic_irqs = ~0UL; @@ -256,16 +258,6 @@ int __init arch_early_ioapic_init(void) for_each_ioapic(i) alloc_ioapic_saved_registers(i); - /* - * For legacy IRQ's, start with assigning irq0 to irq15 to - * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. - */ - for (i = 0; i < nr_legacy_irqs(); i++) { - cfg = alloc_irq_and_cfg_at(i, node); - cfg->vector = IRQ0_VECTOR + i; - cpumask_setall(cfg->domain); - } - return 0; } @@ -283,7 +275,7 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) + (mpc_ioapic_addr(idx) & ~PAGE_MASK); } -void io_apic_eoi(unsigned int apic, unsigned int vector) +static inline void io_apic_eoi(unsigned int apic, unsigned int vector) { struct io_apic __iomem *io_apic = io_apic_base(apic); writel(vector, &io_apic->eoi); @@ -296,7 +288,8 @@ unsigned int native_io_apic_read(unsigned int apic, unsigned int reg) return readl(&io_apic->data); } -void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +static void io_apic_write(unsigned int apic, unsigned int reg, + unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); @@ -304,21 +297,6 @@ void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int valu writel(value, &io_apic->data); } -/* - * Re-write a value: to be used for read-modify-write - * cycles where the read already set up the index register. - * - * Older SiS APIC requires we rewrite the index register - */ -void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - - if (sis_apic_bug) - writel(reg, &io_apic->index); - writel(value, &io_apic->data); -} - union entry_union { struct { u32 w1, w2; }; struct IO_APIC_route_entry entry; @@ -378,7 +356,7 @@ static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) static void ioapic_mask_entry(int apic, int pin) { unsigned long flags; - union entry_union eu = { .entry.mask = 1 }; + union entry_union eu = { .entry.mask = IOAPIC_MASKED }; raw_spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic, 0x10 + 2*pin, eu.w1); @@ -391,16 +369,17 @@ static void ioapic_mask_entry(int apic, int pin) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +static int __add_pin_to_irq_node(struct mp_chip_data *data, + int node, int apic, int pin) { struct irq_pin_list *entry; /* don't allow duplicates */ - for_each_irq_pin(entry, cfg->irq_2_pin) + for_each_irq_pin(entry, data->irq_2_pin) if (entry->apic == apic && entry->pin == pin) return 0; - entry = alloc_irq_pin_list(node); + entry = kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node); if (!entry) { pr_err("can not alloc irq_pin_list (%d,%d,%d)\n", node, apic, pin); @@ -408,16 +387,16 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi } entry->apic = apic; entry->pin = pin; + list_add_tail(&entry->list, &data->irq_2_pin); - list_add_tail(&entry->list, &cfg->irq_2_pin); return 0; } -static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin) +static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin) { struct irq_pin_list *tmp, *entry; - list_for_each_entry_safe(entry, tmp, &cfg->irq_2_pin, list) + list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list) if (entry->apic == apic && entry->pin == pin) { list_del(&entry->list); kfree(entry); @@ -425,22 +404,23 @@ static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin) } } -static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +static void add_pin_to_irq_node(struct mp_chip_data *data, + int node, int apic, int pin) { - if (__add_pin_to_irq_node(cfg, node, apic, pin)) + if (__add_pin_to_irq_node(data, node, apic, pin)) panic("IO-APIC: failed to add irq-pin. Can not proceed\n"); } /* * Reroute an IRQ to a different pin. */ -static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, +static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node, int oldapic, int oldpin, int newapic, int newpin) { struct irq_pin_list *entry; - for_each_irq_pin(entry, cfg->irq_2_pin) { + for_each_irq_pin(entry, data->irq_2_pin) { if (entry->apic == oldapic && entry->pin == oldpin) { entry->apic = newapic; entry->pin = newpin; @@ -450,32 +430,26 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, } /* old apic/pin didn't exist, so just add new ones */ - add_pin_to_irq_node(cfg, node, newapic, newpin); -} - -static void __io_apic_modify_irq(struct irq_pin_list *entry, - int mask_and, int mask_or, - void (*final)(struct irq_pin_list *entry)) -{ - unsigned int reg, pin; - - pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin * 2); - reg &= mask_and; - reg |= mask_or; - io_apic_modify(entry->apic, 0x10 + pin * 2, reg); - if (final) - final(entry); + add_pin_to_irq_node(data, node, newapic, newpin); } -static void io_apic_modify_irq(struct irq_cfg *cfg, +static void io_apic_modify_irq(struct mp_chip_data *data, int mask_and, int mask_or, void (*final)(struct irq_pin_list *entry)) { + union entry_union eu; struct irq_pin_list *entry; - for_each_irq_pin(entry, cfg->irq_2_pin) - __io_apic_modify_irq(entry, mask_and, mask_or, final); + eu.entry = data->entry; + eu.w1 &= mask_and; + eu.w1 |= mask_or; + data->entry = eu.entry; + + for_each_irq_pin(entry, data->irq_2_pin) { + io_apic_write(entry->apic, 0x10 + 2 * entry->pin, eu.w1); + if (final) + final(entry); + } } static void io_apic_sync(struct irq_pin_list *entry) @@ -490,39 +464,31 @@ static void io_apic_sync(struct irq_pin_list *entry) readl(&io_apic->data); } -static void mask_ioapic(struct irq_cfg *cfg) +static void mask_ioapic_irq(struct irq_data *irq_data) { + struct mp_chip_data *data = irq_data->chip_data; unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); + io_apic_modify_irq(data, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } -static void mask_ioapic_irq(struct irq_data *data) +static void __unmask_ioapic(struct mp_chip_data *data) { - mask_ioapic(irqd_cfg(data)); + io_apic_modify_irq(data, ~IO_APIC_REDIR_MASKED, 0, NULL); } -static void __unmask_ioapic(struct irq_cfg *cfg) -{ - io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); -} - -static void unmask_ioapic(struct irq_cfg *cfg) +static void unmask_ioapic_irq(struct irq_data *irq_data) { + struct mp_chip_data *data = irq_data->chip_data; unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - __unmask_ioapic(cfg); + __unmask_ioapic(data); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } -static void unmask_ioapic_irq(struct irq_data *data) -{ - unmask_ioapic(irqd_cfg(data)); -} - /* * IO-APIC versions below 0x20 don't support EOI register. * For the record, here is the information about various versions: @@ -539,7 +505,7 @@ static void unmask_ioapic_irq(struct irq_data *data) * Otherwise, we simulate the EOI message manually by changing the trigger * mode to edge and then back to level, with RTE being masked during this. */ -void native_eoi_ioapic_pin(int apic, int pin, int vector) +static void __eoi_ioapic_pin(int apic, int pin, int vector) { if (mpc_ioapic_ver(apic) >= 0x20) { io_apic_eoi(apic, vector); @@ -551,7 +517,7 @@ void native_eoi_ioapic_pin(int apic, int pin, int vector) /* * Mask the entry and change the trigger mode to edge. */ - entry1.mask = 1; + entry1.mask = IOAPIC_MASKED; entry1.trigger = IOAPIC_EDGE; __ioapic_write_entry(apic, pin, entry1); @@ -563,15 +529,14 @@ void native_eoi_ioapic_pin(int apic, int pin, int vector) } } -void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +void eoi_ioapic_pin(int vector, struct mp_chip_data *data) { - struct irq_pin_list *entry; unsigned long flags; + struct irq_pin_list *entry; raw_spin_lock_irqsave(&ioapic_lock, flags); - for_each_irq_pin(entry, cfg->irq_2_pin) - x86_io_apic_ops.eoi_ioapic_pin(entry->apic, entry->pin, - cfg->vector); + for_each_irq_pin(entry, data->irq_2_pin) + __eoi_ioapic_pin(entry->apic, entry->pin, vector); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -588,8 +553,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) * Make sure the entry is masked and re-read the contents to check * if it is a level triggered pin and if the remote-IRR is set. */ - if (!entry.mask) { - entry.mask = 1; + if (entry.mask == IOAPIC_UNMASKED) { + entry.mask = IOAPIC_MASKED; ioapic_write_entry(apic, pin, entry); entry = ioapic_read_entry(apic, pin); } @@ -602,13 +567,12 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) * doesn't clear the remote-IRR if the trigger mode is not * set to level. */ - if (!entry.trigger) { + if (entry.trigger == IOAPIC_EDGE) { entry.trigger = IOAPIC_LEVEL; ioapic_write_entry(apic, pin, entry); } - raw_spin_lock_irqsave(&ioapic_lock, flags); - x86_io_apic_ops.eoi_ioapic_pin(apic, pin, entry.vector); + __eoi_ioapic_pin(apic, pin, entry.vector); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -706,8 +670,8 @@ void mask_ioapic_entries(void) struct IO_APIC_route_entry entry; entry = ioapics[apic].saved_registers[pin]; - if (!entry.mask) { - entry.mask = 1; + if (entry.mask == IOAPIC_UNMASKED) { + entry.mask = IOAPIC_MASKED; ioapic_write_entry(apic, pin, entry); } } @@ -809,11 +773,11 @@ static int EISA_ELCR(unsigned int irq) #endif -/* ISA interrupts are always polarity zero edge triggered, +/* ISA interrupts are always active high edge triggered, * when listed as conforming in the MP table. */ -#define default_ISA_trigger(idx) (0) -#define default_ISA_polarity(idx) (0) +#define default_ISA_trigger(idx) (IOAPIC_EDGE) +#define default_ISA_polarity(idx) (IOAPIC_POL_HIGH) /* EISA interrupts are always polarity zero and can be edge or level * trigger depending on the ELCR value. If an interrupt is listed as @@ -823,53 +787,55 @@ static int EISA_ELCR(unsigned int irq) #define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq)) #define default_EISA_polarity(idx) default_ISA_polarity(idx) -/* PCI interrupts are always polarity one level triggered, +/* PCI interrupts are always active low level triggered, * when listed as conforming in the MP table. */ -#define default_PCI_trigger(idx) (1) -#define default_PCI_polarity(idx) (1) +#define default_PCI_trigger(idx) (IOAPIC_LEVEL) +#define default_PCI_polarity(idx) (IOAPIC_POL_LOW) static int irq_polarity(int idx) { int bus = mp_irqs[idx].srcbus; - int polarity; /* * Determine IRQ line polarity (high active or low active): */ - switch (mp_irqs[idx].irqflag & 3) - { - case 0: /* conforms, ie. bus-type dependent polarity */ - if (test_bit(bus, mp_bus_not_pci)) - polarity = default_ISA_polarity(idx); - else - polarity = default_PCI_polarity(idx); - break; - case 1: /* high active */ - { - polarity = 0; - break; - } - case 2: /* reserved */ - { - pr_warn("broken BIOS!!\n"); - polarity = 1; - break; - } - case 3: /* low active */ - { - polarity = 1; - break; - } - default: /* invalid */ - { - pr_warn("broken BIOS!!\n"); - polarity = 1; - break; - } + switch (mp_irqs[idx].irqflag & 0x03) { + case 0: + /* conforms to spec, ie. bus-type dependent polarity */ + if (test_bit(bus, mp_bus_not_pci)) + return default_ISA_polarity(idx); + else + return default_PCI_polarity(idx); + case 1: + return IOAPIC_POL_HIGH; + case 2: + pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n"); + case 3: + default: /* Pointless default required due to do gcc stupidity */ + return IOAPIC_POL_LOW; + } +} + +#ifdef CONFIG_EISA +static int eisa_irq_trigger(int idx, int bus, int trigger) +{ + switch (mp_bus_id_to_type[bus]) { + case MP_BUS_PCI: + case MP_BUS_ISA: + return trigger; + case MP_BUS_EISA: + return default_EISA_trigger(idx); } - return polarity; + pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus); + return IOAPIC_LEVEL; } +#else +static inline int eisa_irq_trigger(int idx, int bus, int trigger) +{ + return trigger; +} +#endif static int irq_trigger(int idx) { @@ -879,153 +845,227 @@ static int irq_trigger(int idx) /* * Determine IRQ trigger mode (edge or level sensitive): */ - switch ((mp_irqs[idx].irqflag>>2) & 3) - { - case 0: /* conforms, ie. bus-type dependent */ - if (test_bit(bus, mp_bus_not_pci)) - trigger = default_ISA_trigger(idx); - else - trigger = default_PCI_trigger(idx); -#ifdef CONFIG_EISA - switch (mp_bus_id_to_type[bus]) { - case MP_BUS_ISA: /* ISA pin */ - { - /* set before the switch */ - break; - } - case MP_BUS_EISA: /* EISA pin */ - { - trigger = default_EISA_trigger(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - /* set before the switch */ - break; - } - default: - { - pr_warn("broken BIOS!!\n"); - trigger = 1; - break; - } - } + switch ((mp_irqs[idx].irqflag >> 2) & 0x03) { + case 0: + /* conforms to spec, ie. bus-type dependent trigger mode */ + if (test_bit(bus, mp_bus_not_pci)) + trigger = default_ISA_trigger(idx); + else + trigger = default_PCI_trigger(idx); + /* Take EISA into account */ + return eisa_irq_trigger(idx, bus, trigger); + case 1: + return IOAPIC_EDGE; + case 2: + pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n"); + case 3: + default: /* Pointless default required due to do gcc stupidity */ + return IOAPIC_LEVEL; + } +} + +void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, + int trigger, int polarity) +{ + init_irq_alloc_info(info, NULL); + info->type = X86_IRQ_ALLOC_TYPE_IOAPIC; + info->ioapic_node = node; + info->ioapic_trigger = trigger; + info->ioapic_polarity = polarity; + info->ioapic_valid = 1; +} + +#ifndef CONFIG_ACPI +int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity); #endif - break; - case 1: /* edge */ - { - trigger = 0; - break; - } - case 2: /* reserved */ - { - pr_warn("broken BIOS!!\n"); - trigger = 1; - break; - } - case 3: /* level */ - { - trigger = 1; - break; - } - default: /* invalid */ - { - pr_warn("broken BIOS!!\n"); - trigger = 0; - break; + +static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst, + struct irq_alloc_info *src, + u32 gsi, int ioapic_idx, int pin) +{ + int trigger, polarity; + + copy_irq_alloc_info(dst, src); + dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC; + dst->ioapic_id = mpc_ioapic_id(ioapic_idx); + dst->ioapic_pin = pin; + dst->ioapic_valid = 1; + if (src && src->ioapic_valid) { + dst->ioapic_node = src->ioapic_node; + dst->ioapic_trigger = src->ioapic_trigger; + dst->ioapic_polarity = src->ioapic_polarity; + } else { + dst->ioapic_node = NUMA_NO_NODE; + if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) { + dst->ioapic_trigger = trigger; + dst->ioapic_polarity = polarity; + } else { + /* + * PCI interrupts are always active low level + * triggered. + */ + dst->ioapic_trigger = IOAPIC_LEVEL; + dst->ioapic_polarity = IOAPIC_POL_LOW; } } - return trigger; } -static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin) +static int ioapic_alloc_attr_node(struct irq_alloc_info *info) +{ + return (info && info->ioapic_valid) ? info->ioapic_node : NUMA_NO_NODE; +} + +static void mp_register_handler(unsigned int irq, unsigned long trigger) +{ + irq_flow_handler_t hdl; + bool fasteoi; + + if (trigger) { + irq_set_status_flags(irq, IRQ_LEVEL); + fasteoi = true; + } else { + irq_clear_status_flags(irq, IRQ_LEVEL); + fasteoi = false; + } + + hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq; + __irq_set_handler(irq, hdl, 0, fasteoi ? "fasteoi" : "edge"); +} + +static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) { + struct mp_chip_data *data = irq_get_chip_data(irq); + + /* + * setup_IO_APIC_irqs() programs all legacy IRQs with default trigger + * and polarity attirbutes. So allow the first user to reprogram the + * pin with real trigger and polarity attributes. + */ + if (irq < nr_legacy_irqs() && data->count == 1) { + if (info->ioapic_trigger != data->trigger) + mp_register_handler(irq, data->trigger); + data->entry.trigger = data->trigger = info->ioapic_trigger; + data->entry.polarity = data->polarity = info->ioapic_polarity; + } + + return data->trigger == info->ioapic_trigger && + data->polarity == info->ioapic_polarity; +} + +static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, + struct irq_alloc_info *info) +{ + bool legacy = false; int irq = -1; - int ioapic = (int)(long)domain->host_data; int type = ioapics[ioapic].irqdomain_cfg.type; switch (type) { case IOAPIC_DOMAIN_LEGACY: /* - * Dynamically allocate IRQ number for non-ISA IRQs in the first 16 - * GSIs on some weird platforms. + * Dynamically allocate IRQ number for non-ISA IRQs in the first + * 16 GSIs on some weird platforms. */ - if (gsi < nr_legacy_irqs()) - irq = irq_create_mapping(domain, pin); - else if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0) + if (!ioapic_initialized || gsi >= nr_legacy_irqs()) irq = gsi; + legacy = mp_is_legacy_irq(irq); break; case IOAPIC_DOMAIN_STRICT: - if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0) - irq = gsi; + irq = gsi; break; case IOAPIC_DOMAIN_DYNAMIC: - irq = irq_create_mapping(domain, pin); break; default: WARN(1, "ioapic: unknown irqdomain type %d\n", type); - break; + return -1; + } + + return __irq_domain_alloc_irqs(domain, irq, 1, + ioapic_alloc_attr_node(info), + info, legacy); +} + +/* + * Need special handling for ISA IRQs because there may be multiple IOAPIC pins + * sharing the same ISA IRQ number and irqdomain only supports 1:1 mapping + * between IOAPIC pin and IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are + * used for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H). + * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are available, and + * some BIOSes may use MP Interrupt Source records to override IRQ numbers for + * PIRQs instead of reprogramming the interrupt routing logic. Thus there may be + * multiple pins sharing the same legacy IRQ number when ACPI is disabled. + */ +static int alloc_isa_irq_from_domain(struct irq_domain *domain, + int irq, int ioapic, int pin, + struct irq_alloc_info *info) +{ + struct mp_chip_data *data; + struct irq_data *irq_data = irq_get_irq_data(irq); + int node = ioapic_alloc_attr_node(info); + + /* + * Legacy ISA IRQ has already been allocated, just add pin to + * the pin list assoicated with this IRQ and program the IOAPIC + * entry. The IOAPIC entry + */ + if (irq_data && irq_data->parent_data) { + if (!mp_check_pin_attr(irq, info)) + return -EBUSY; + if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic, + info->ioapic_pin)) + return -ENOMEM; + } else { + irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true); + if (irq >= 0) { + irq_data = irq_domain_get_irq_data(domain, irq); + data = irq_data->chip_data; + data->isa_irq = true; + } } - return irq > 0 ? irq : -1; + return irq; } static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, - unsigned int flags) + unsigned int flags, struct irq_alloc_info *info) { int irq; + bool legacy = false; + struct irq_alloc_info tmp; + struct mp_chip_data *data; struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); - struct mp_pin_info *info = mp_pin_info(ioapic, pin); if (!domain) - return -1; + return -ENOSYS; - mutex_lock(&ioapic_mutex); - - /* - * Don't use irqdomain to manage ISA IRQs because there may be - * multiple IOAPIC pins sharing the same ISA IRQ number and - * irqdomain only supports 1:1 mapping between IOAPIC pin and - * IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are used - * for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H). - * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are - * available, and some BIOSes may use MP Interrupt Source records - * to override IRQ numbers for PIRQs instead of reprogramming - * the interrupt routing logic. Thus there may be multiple pins - * sharing the same legacy IRQ number when ACPI is disabled. - */ if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) { irq = mp_irqs[idx].srcbusirq; - if (flags & IOAPIC_MAP_ALLOC) { - if (info->count == 0 && - mp_irqdomain_map(domain, irq, pin) != 0) - irq = -1; + legacy = mp_is_legacy_irq(irq); + } - /* special handling for timer IRQ0 */ + mutex_lock(&ioapic_mutex); + if (!(flags & IOAPIC_MAP_ALLOC)) { + if (!legacy) { + irq = irq_find_mapping(domain, pin); if (irq == 0) - info->count++; + irq = -ENOENT; } } else { - irq = irq_find_mapping(domain, pin); - if (irq <= 0 && (flags & IOAPIC_MAP_ALLOC)) - irq = alloc_irq_from_domain(domain, gsi, pin); - } - - if (flags & IOAPIC_MAP_ALLOC) { - /* special handling for legacy IRQs */ - if (irq < nr_legacy_irqs() && info->count == 1 && - mp_irqdomain_map(domain, irq, pin) != 0) - irq = -1; - - if (irq > 0) - info->count++; - else if (info->count == 0) - info->set = 0; + ioapic_copy_alloc_attr(&tmp, info, gsi, ioapic, pin); + if (legacy) + irq = alloc_isa_irq_from_domain(domain, irq, + ioapic, pin, &tmp); + else if ((irq = irq_find_mapping(domain, pin)) == 0) + irq = alloc_irq_from_domain(domain, ioapic, gsi, &tmp); + else if (!mp_check_pin_attr(irq, &tmp)) + irq = -EBUSY; + if (irq >= 0) { + data = irq_get_chip_data(irq); + data->count++; + } } - mutex_unlock(&ioapic_mutex); - return irq > 0 ? irq : -1; + return irq; } static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags) @@ -1058,10 +1098,10 @@ static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags) } #endif - return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags); + return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, NULL); } -int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) +int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info) { int ioapic, pin, idx; @@ -1074,31 +1114,24 @@ int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) if ((flags & IOAPIC_MAP_CHECK) && idx < 0) return -1; - return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags); + return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, info); } void mp_unmap_irq(int irq) { - struct irq_data *data = irq_get_irq_data(irq); - struct mp_pin_info *info; - int ioapic, pin; + struct irq_data *irq_data = irq_get_irq_data(irq); + struct mp_chip_data *data; - if (!data || !data->domain) + if (!irq_data || !irq_data->domain) return; - ioapic = (int)(long)data->domain->host_data; - pin = (int)data->hwirq; - info = mp_pin_info(ioapic, pin); + data = irq_data->chip_data; + if (!data || data->isa_irq) + return; mutex_lock(&ioapic_mutex); - if (--info->count == 0) { - info->set = 0; - if (irq < nr_legacy_irqs() && - ioapics[ioapic].irqdomain_cfg.type == IOAPIC_DOMAIN_LEGACY) - mp_irqdomain_unmap(data->domain, irq); - else - irq_dispose_mapping(irq); - } + if (--data->count == 0) + irq_domain_free_irqs(irq, 1); mutex_unlock(&ioapic_mutex); } @@ -1165,7 +1198,7 @@ out: } EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); -static struct irq_chip ioapic_chip; +static struct irq_chip ioapic_chip, ioapic_ir_chip; #ifdef CONFIG_X86_32 static inline int IO_APIC_irq_trigger(int irq) @@ -1189,96 +1222,6 @@ static inline int IO_APIC_irq_trigger(int irq) } #endif -static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg, - unsigned long trigger) -{ - struct irq_chip *chip = &ioapic_chip; - irq_flow_handler_t hdl; - bool fasteoi; - - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) { - irq_set_status_flags(irq, IRQ_LEVEL); - fasteoi = true; - } else { - irq_clear_status_flags(irq, IRQ_LEVEL); - fasteoi = false; - } - - if (setup_remapped_irq(irq, cfg, chip)) - fasteoi = trigger != 0; - - hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq; - irq_set_chip_and_handler_name(irq, chip, hdl, - fasteoi ? "fasteoi" : "edge"); -} - -int native_setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, - unsigned int destination, int vector, - struct io_apic_irq_attr *attr) -{ - memset(entry, 0, sizeof(*entry)); - - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->dest = destination; - entry->vector = vector; - entry->mask = 0; /* enable IRQ */ - entry->trigger = attr->trigger; - entry->polarity = attr->polarity; - - /* - * Mask level triggered irqs. - * Use IRQ_DELAYED_DISABLE for edge triggered irqs. - */ - if (attr->trigger) - entry->mask = 1; - - return 0; -} - -static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, - struct io_apic_irq_attr *attr) -{ - struct IO_APIC_route_entry entry; - unsigned int dest; - - if (!IO_APIC_IRQ(irq)) - return; - - if (assign_irq_vector(irq, cfg, apic->target_cpus())) - return; - - if (apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus(), - &dest)) { - pr_warn("Failed to obtain apicid for ioapic %d, pin %d\n", - mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); - clear_irq_vector(irq, cfg); - - return; - } - - apic_printk(APIC_VERBOSE,KERN_DEBUG - "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " - "IRQ %d Mode:%i Active:%i Dest:%d)\n", - attr->ioapic, mpc_ioapic_id(attr->ioapic), attr->ioapic_pin, - cfg->vector, irq, attr->trigger, attr->polarity, dest); - - if (x86_io_apic_ops.setup_entry(irq, &entry, dest, cfg->vector, attr)) { - pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n", - mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); - clear_irq_vector(irq, cfg); - - return; - } - - ioapic_register_intr(irq, cfg, attr->trigger); - if (irq < nr_legacy_irqs()) - legacy_pic->mask(irq); - - ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry); -} - static void __init setup_IO_APIC_irqs(void) { unsigned int ioapic, pin; @@ -1298,106 +1241,41 @@ static void __init setup_IO_APIC_irqs(void) } } -/* - * Set up the timer pin, possibly with the 8259A-master behind. - */ -static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, - unsigned int pin, int vector) -{ - struct IO_APIC_route_entry entry; - unsigned int dest; - - memset(&entry, 0, sizeof(entry)); - - /* - * We use logical delivery to get the timer IRQ - * to the first CPU. - */ - if (unlikely(apic->cpu_mask_to_apicid_and(apic->target_cpus(), - apic->target_cpus(), &dest))) - dest = BAD_APICID; - - entry.dest_mode = apic->irq_dest_mode; - entry.mask = 0; /* don't mask IRQ for edge */ - entry.dest = dest; - entry.delivery_mode = apic->irq_delivery_mode; - entry.polarity = 0; - entry.trigger = 0; - entry.vector = vector; - - /* - * The timer IRQ doesn't have to know that behind the - * scene we may have a 8259A-master in AEOI mode ... - */ - irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, - "edge"); - - /* - * Add it to the IO-APIC irq-routing table: - */ - ioapic_write_entry(ioapic_idx, pin, entry); -} - -void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries) +void ioapic_zap_locks(void) { - int i; - - pr_debug(" NR Dst Mask Trig IRR Pol Stat Dmod Deli Vect:\n"); - - for (i = 0; i <= nr_entries; i++) { - struct IO_APIC_route_entry entry; - - entry = ioapic_read_entry(apic, i); - - pr_debug(" %02x %02X ", i, entry.dest); - pr_cont("%1d %1d %1d %1d %1d " - "%1d %1d %02X\n", - entry.mask, - entry.trigger, - entry.irr, - entry.polarity, - entry.delivery_status, - entry.dest_mode, - entry.delivery_mode, - entry.vector); - } + raw_spin_lock_init(&ioapic_lock); } -void intel_ir_io_apic_print_entries(unsigned int apic, - unsigned int nr_entries) +static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) { int i; + char buf[256]; + struct IO_APIC_route_entry entry; + struct IR_IO_APIC_route_entry *ir_entry = (void *)&entry; - pr_debug(" NR Indx Fmt Mask Trig IRR Pol Stat Indx2 Zero Vect:\n"); - + printk(KERN_DEBUG "IOAPIC %d:\n", apic); for (i = 0; i <= nr_entries; i++) { - struct IR_IO_APIC_route_entry *ir_entry; - struct IO_APIC_route_entry entry; - entry = ioapic_read_entry(apic, i); - - ir_entry = (struct IR_IO_APIC_route_entry *)&entry; - - pr_debug(" %02x %04X ", i, ir_entry->index); - pr_cont("%1d %1d %1d %1d %1d " - "%1d %1d %X %02X\n", - ir_entry->format, - ir_entry->mask, - ir_entry->trigger, - ir_entry->irr, - ir_entry->polarity, - ir_entry->delivery_status, - ir_entry->index2, - ir_entry->zero, - ir_entry->vector); + snprintf(buf, sizeof(buf), + " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)", + i, + entry.mask == IOAPIC_MASKED ? "disabled" : "enabled ", + entry.trigger == IOAPIC_LEVEL ? "level" : "edge ", + entry.polarity == IOAPIC_POL_LOW ? "low " : "high", + entry.vector, entry.irr, entry.delivery_status); + if (ir_entry->format) + printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n", + buf, (ir_entry->index << 15) | ir_entry->index, + ir_entry->zero); + else + printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n", + buf, + entry.dest_mode == IOAPIC_DEST_MODE_LOGICAL ? + "logical " : "physical", + entry.dest, entry.delivery_mode); } } -void ioapic_zap_locks(void) -{ - raw_spin_lock_init(&ioapic_lock); -} - static void __init print_IO_APIC(int ioapic_idx) { union IO_APIC_reg_00 reg_00; @@ -1451,16 +1329,13 @@ static void __init print_IO_APIC(int ioapic_idx) } printk(KERN_DEBUG ".... IRQ redirection table:\n"); - - x86_io_apic_ops.print_entries(ioapic_idx, reg_01.bits.entries); + io_apic_print_entries(ioapic_idx, reg_01.bits.entries); } void __init print_IO_APICs(void) { int ioapic_idx; - struct irq_cfg *cfg; unsigned int irq; - struct irq_chip *chip; printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); for_each_ioapic(ioapic_idx) @@ -1480,18 +1355,20 @@ void __init print_IO_APICs(void) printk(KERN_DEBUG "IRQ to pin mappings:\n"); for_each_active_irq(irq) { struct irq_pin_list *entry; + struct irq_chip *chip; + struct mp_chip_data *data; chip = irq_get_chip(irq); - if (chip != &ioapic_chip) + if (chip != &ioapic_chip && chip != &ioapic_ir_chip) continue; - - cfg = irq_cfg(irq); - if (!cfg) + data = irq_get_chip_data(irq); + if (!data) continue; - if (list_empty(&cfg->irq_2_pin)) + if (list_empty(&data->irq_2_pin)) continue; + printk(KERN_DEBUG "IRQ%d ", irq); - for_each_irq_pin(entry, cfg->irq_2_pin) + for_each_irq_pin(entry, data->irq_2_pin) pr_cont("-> %d:%d", entry->apic, entry->pin); pr_cont("\n"); } @@ -1564,15 +1441,12 @@ void native_disable_io_apic(void) struct IO_APIC_route_entry entry; memset(&entry, 0, sizeof(entry)); - entry.mask = 0; /* Enabled */ - entry.trigger = 0; /* Edge */ - entry.irr = 0; - entry.polarity = 0; /* High */ - entry.delivery_status = 0; - entry.dest_mode = 0; /* Physical */ - entry.delivery_mode = dest_ExtINT; /* ExtInt */ - entry.vector = 0; - entry.dest = read_apic_id(); + entry.mask = IOAPIC_UNMASKED; + entry.trigger = IOAPIC_EDGE; + entry.polarity = IOAPIC_POL_HIGH; + entry.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; + entry.delivery_mode = dest_ExtINT; + entry.dest = read_apic_id(); /* * Add it to the IO-APIC irq-routing table: @@ -1582,7 +1456,6 @@ void native_disable_io_apic(void) if (cpu_has_apic || apic_from_smp_config()) disconnect_bsp_APIC(ioapic_i8259.pin != -1); - } /* @@ -1792,7 +1665,6 @@ static int __init timer_irq_works(void) * This is not complete - we should be able to fake * an edge even if it isn't on the 8259A... */ - static unsigned int startup_ioapic_irq(struct irq_data *data) { int was_pending = 0, irq = data->irq; @@ -1804,74 +1676,22 @@ static unsigned int startup_ioapic_irq(struct irq_data *data) if (legacy_pic->irq_pending(irq)) was_pending = 1; } - __unmask_ioapic(irqd_cfg(data)); + __unmask_ioapic(data->chip_data); raw_spin_unlock_irqrestore(&ioapic_lock, flags); return was_pending; } -/* - * Level and edge triggered IO-APIC interrupts need different handling, - * so we use two separate IRQ descriptors. Edge triggered IRQs can be - * handled with the level-triggered descriptor, but that one has slightly - * more overhead. Level-triggered interrupts cannot be handled with the - * edge-triggered handler, without risking IRQ storms and other ugly - * races. - */ - -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) -{ - int apic, pin; - struct irq_pin_list *entry; - u8 vector = cfg->vector; - - for_each_irq_pin(entry, cfg->irq_2_pin) { - unsigned int reg; - - apic = entry->apic; - pin = entry->pin; - - io_apic_write(apic, 0x11 + pin*2, dest); - reg = io_apic_read(apic, 0x10 + pin*2); - reg &= ~IO_APIC_REDIR_VECTOR_MASK; - reg |= vector; - io_apic_modify(apic, 0x10 + pin*2, reg); - } -} - -int native_ioapic_set_affinity(struct irq_data *data, - const struct cpumask *mask, - bool force) -{ - unsigned int dest, irq = data->irq; - unsigned long flags; - int ret; - - if (!config_enabled(CONFIG_SMP)) - return -EPERM; - - raw_spin_lock_irqsave(&ioapic_lock, flags); - ret = apic_set_affinity(data, mask, &dest); - if (!ret) { - /* Only the high 8 bits are valid. */ - dest = SET_APIC_LOGICAL_ID(dest); - __target_IO_APIC_irq(irq, dest, irqd_cfg(data)); - ret = IRQ_SET_MASK_OK_NOCOPY; - } - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return ret; -} - atomic_t irq_mis_count; #ifdef CONFIG_GENERIC_PENDING_IRQ -static bool io_apic_level_ack_pending(struct irq_cfg *cfg) +static bool io_apic_level_ack_pending(struct mp_chip_data *data) { struct irq_pin_list *entry; unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - for_each_irq_pin(entry, cfg->irq_2_pin) { + for_each_irq_pin(entry, data->irq_2_pin) { unsigned int reg; int pin; @@ -1888,18 +1708,17 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) return false; } -static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) +static inline bool ioapic_irqd_mask(struct irq_data *data) { /* If we are moving the irq we need to mask it */ if (unlikely(irqd_is_setaffinity_pending(data))) { - mask_ioapic(cfg); + mask_ioapic_irq(data); return true; } return false; } -static inline void ioapic_irqd_unmask(struct irq_data *data, - struct irq_cfg *cfg, bool masked) +static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) { if (unlikely(masked)) { /* Only migrate the irq if the ack has been received. @@ -1928,31 +1747,30 @@ static inline void ioapic_irqd_unmask(struct irq_data *data, * accurate and is causing problems then it is a hardware bug * and you can go talk to the chipset vendor about it. */ - if (!io_apic_level_ack_pending(cfg)) + if (!io_apic_level_ack_pending(data->chip_data)) irq_move_masked_irq(data); - unmask_ioapic(cfg); + unmask_ioapic_irq(data); } } #else -static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) +static inline bool ioapic_irqd_mask(struct irq_data *data) { return false; } -static inline void ioapic_irqd_unmask(struct irq_data *data, - struct irq_cfg *cfg, bool masked) +static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) { } #endif -static void ack_ioapic_level(struct irq_data *data) +static void ioapic_ack_level(struct irq_data *irq_data) { - struct irq_cfg *cfg = irqd_cfg(data); - int i, irq = data->irq; + struct irq_cfg *cfg = irqd_cfg(irq_data); unsigned long v; bool masked; + int i; irq_complete_move(cfg); - masked = ioapic_irqd_mask(data, cfg); + masked = ioapic_irqd_mask(irq_data); /* * It appears there is an erratum which affects at least version 0x11 @@ -2004,11 +1822,49 @@ static void ack_ioapic_level(struct irq_data *data) */ if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); + eoi_ioapic_pin(cfg->vector, irq_data->chip_data); + } + + ioapic_irqd_unmask(irq_data, masked); +} + +static void ioapic_ir_ack_level(struct irq_data *irq_data) +{ + struct mp_chip_data *data = irq_data->chip_data; + + /* + * Intr-remapping uses pin number as the virtual vector + * in the RTE. Actual vector is programmed in + * intr-remapping table entry. Hence for the io-apic + * EOI we use the pin number. + */ + ack_APIC_irq(); + eoi_ioapic_pin(data->entry.vector, data); +} - eoi_ioapic_irq(irq, cfg); +static int ioapic_set_affinity(struct irq_data *irq_data, + const struct cpumask *mask, bool force) +{ + struct irq_data *parent = irq_data->parent_data; + struct mp_chip_data *data = irq_data->chip_data; + struct irq_pin_list *entry; + struct irq_cfg *cfg; + unsigned long flags; + int ret; + + ret = parent->chip->irq_set_affinity(parent, mask, force); + raw_spin_lock_irqsave(&ioapic_lock, flags); + if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) { + cfg = irqd_cfg(irq_data); + data->entry.dest = cfg->dest_apicid; + data->entry.vector = cfg->vector; + for_each_irq_pin(entry, data->irq_2_pin) + __ioapic_write_entry(entry->apic, entry->pin, + data->entry); } + raw_spin_unlock_irqrestore(&ioapic_lock, flags); - ioapic_irqd_unmask(data, cfg, masked); + return ret; } static struct irq_chip ioapic_chip __read_mostly = { @@ -2016,10 +1872,20 @@ static struct irq_chip ioapic_chip __read_mostly = { .irq_startup = startup_ioapic_irq, .irq_mask = mask_ioapic_irq, .irq_unmask = unmask_ioapic_irq, - .irq_ack = apic_ack_edge, - .irq_eoi = ack_ioapic_level, - .irq_set_affinity = native_ioapic_set_affinity, - .irq_retrigger = apic_retrigger_irq, + .irq_ack = irq_chip_ack_parent, + .irq_eoi = ioapic_ack_level, + .irq_set_affinity = ioapic_set_affinity, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +static struct irq_chip ioapic_ir_chip __read_mostly = { + .name = "IR-IO-APIC", + .irq_startup = startup_ioapic_irq, + .irq_mask = mask_ioapic_irq, + .irq_unmask = unmask_ioapic_irq, + .irq_ack = irq_chip_ack_parent, + .irq_eoi = ioapic_ir_ack_level, + .irq_set_affinity = ioapic_set_affinity, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -2113,12 +1979,12 @@ static inline void __init unlock_ExtINT_logic(void) memset(&entry1, 0, sizeof(entry1)); - entry1.dest_mode = 0; /* physical delivery */ - entry1.mask = 0; /* unmask IRQ now */ + entry1.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; + entry1.mask = IOAPIC_UNMASKED; entry1.dest = hard_smp_processor_id(); entry1.delivery_mode = dest_ExtINT; entry1.polarity = entry0.polarity; - entry1.trigger = 0; + entry1.trigger = IOAPIC_EDGE; entry1.vector = 0; ioapic_write_entry(apic, pin, entry1); @@ -2152,6 +2018,25 @@ static int __init disable_timer_pin_setup(char *arg) } early_param("disable_timer_pin_1", disable_timer_pin_setup); +static int mp_alloc_timer_irq(int ioapic, int pin) +{ + int irq = -1; + struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); + + if (domain) { + struct irq_alloc_info info; + + ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0); + info.ioapic_id = mpc_ioapic_id(ioapic); + info.ioapic_pin = pin; + mutex_lock(&ioapic_mutex); + irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info); + mutex_unlock(&ioapic_mutex); + } + + return irq; +} + /* * This code may look a bit paranoid, but it's supposed to cooperate with * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ @@ -2162,7 +2047,9 @@ early_param("disable_timer_pin_1", disable_timer_pin_setup); */ static inline void __init check_timer(void) { - struct irq_cfg *cfg = irq_cfg(0); + struct irq_data *irq_data = irq_get_irq_data(0); + struct mp_chip_data *data = irq_data->chip_data; + struct irq_cfg *cfg = irqd_cfg(irq_data); int node = cpu_to_node(0); int apic1, pin1, apic2, pin2; unsigned long flags; @@ -2174,7 +2061,6 @@ static inline void __init check_timer(void) * get/set the timer IRQ vector: */ legacy_pic->mask(0); - assign_irq_vector(0, cfg, apic->target_cpus()); /* * As IRQ0 is to be enabled in the 8259A, the virtual @@ -2215,23 +2101,21 @@ static inline void __init check_timer(void) } if (pin1 != -1) { - /* - * Ok, does IRQ0 through the IOAPIC work? - */ + /* Ok, does IRQ0 through the IOAPIC work? */ if (no_pin1) { - add_pin_to_irq_node(cfg, node, apic1, pin1); - setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); + mp_alloc_timer_irq(apic1, pin1); } else { - /* for edge trigger, setup_ioapic_irq already - * leave it unmasked. + /* + * for edge trigger, it's already unmasked, * so only need to unmask if it is level-trigger * do we really have level trigger timer? */ int idx; idx = find_irq_entry(apic1, pin1, mp_INT); if (idx != -1 && irq_trigger(idx)) - unmask_ioapic(cfg); + unmask_ioapic_irq(irq_get_chip_data(0)); } + irq_domain_activate_irq(irq_data); if (timer_irq_works()) { if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); @@ -2251,8 +2135,8 @@ static inline void __init check_timer(void) /* * legacy devices should be connected to IO APIC #0 */ - replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); - setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); + replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2); + irq_domain_activate_irq(irq_data); legacy_pic->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); @@ -2329,36 +2213,35 @@ out: static int mp_irqdomain_create(int ioapic) { - size_t size; + struct irq_alloc_info info; + struct irq_domain *parent; int hwirqs = mp_ioapic_pin_count(ioapic); struct ioapic *ip = &ioapics[ioapic]; struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg; struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); - size = sizeof(struct mp_pin_info) * mp_ioapic_pin_count(ioapic); - ip->pin_info = kzalloc(size, GFP_KERNEL); - if (!ip->pin_info) - return -ENOMEM; - if (cfg->type == IOAPIC_DOMAIN_INVALID) return 0; + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_IOAPIC; + info.ioapic_id = mpc_ioapic_id(ioapic); + parent = irq_remapping_get_ir_irq_domain(&info); + if (!parent) + parent = x86_vector_domain; + ip->irqdomain = irq_domain_add_linear(cfg->dev, hwirqs, cfg->ops, (void *)(long)ioapic); - if(!ip->irqdomain) { - kfree(ip->pin_info); - ip->pin_info = NULL; + if (!ip->irqdomain) return -ENOMEM; - } + + ip->irqdomain->parent = parent; if (cfg->type == IOAPIC_DOMAIN_LEGACY || cfg->type == IOAPIC_DOMAIN_STRICT) ioapic_dynirq_base = max(ioapic_dynirq_base, gsi_cfg->gsi_end + 1); - if (gsi_cfg->gsi_base == 0) - irq_set_default_host(ip->irqdomain); - return 0; } @@ -2368,8 +2251,6 @@ static void ioapic_destroy_irqdomain(int idx) irq_domain_remove(ioapics[idx].irqdomain); ioapics[idx].irqdomain = NULL; } - kfree(ioapics[idx].pin_info); - ioapics[idx].pin_info = NULL; } void __init setup_IO_APIC(void) @@ -2399,20 +2280,6 @@ void __init setup_IO_APIC(void) ioapic_initialized = 1; } -/* - * Called after all the initialization is done. If we didn't find any - * APIC bugs then we can allow the modify fast path - */ - -static int __init io_apic_bug_finalize(void) -{ - if (sis_apic_bug == -1) - sis_apic_bug = 0; - return 0; -} - -late_initcall(io_apic_bug_finalize); - static void resume_ioapic_id(int ioapic_idx) { unsigned long flags; @@ -2451,20 +2318,6 @@ static int __init ioapic_init_ops(void) device_initcall(ioapic_init_ops); -static int -io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) -{ - struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node); - int ret; - - if (!cfg) - return -EINVAL; - ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin); - if (!ret) - setup_ioapic_irq(irq, cfg, attr); - return ret; -} - static int io_apic_get_redir_entries(int ioapic) { union IO_APIC_reg_01 reg_01; @@ -2692,7 +2545,7 @@ void __init setup_ioapic_dest(void) else mask = apic->target_cpus(); - x86_io_apic_ops.set_affinity(idata, mask, false); + irq_set_affinity(irq, mask); } } @@ -2737,7 +2590,7 @@ static struct resource * __init ioapic_setup_resources(void) return res; } -void __init native_io_apic_init_mappings(void) +void __init io_apic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; struct resource *ioapic_res; @@ -2962,7 +2815,6 @@ int mp_unregister_ioapic(u32 gsi_base) { int ioapic, pin; int found = 0; - struct mp_pin_info *pin_info; for_each_ioapic(ioapic) if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) { @@ -2975,11 +2827,17 @@ int mp_unregister_ioapic(u32 gsi_base) } for_each_pin(ioapic, pin) { - pin_info = mp_pin_info(ioapic, pin); - if (pin_info->count) { - pr_warn("pin%d on IOAPIC%d is still in use.\n", - pin, ioapic); - return -EBUSY; + u32 gsi = mp_pin_to_gsi(ioapic, pin); + int irq = mp_map_gsi_to_irq(gsi, 0, NULL); + struct mp_chip_data *data; + + if (irq >= 0) { + data = irq_get_chip_data(irq); + if (data && data->count) { + pr_warn("pin%d on IOAPIC%d is still in use.\n", + pin, ioapic); + return -EBUSY; + } } } @@ -3006,108 +2864,141 @@ int mp_ioapic_registered(u32 gsi_base) return 0; } -static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr, - int ioapic, int ioapic_pin, - int trigger, int polarity) +static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, + struct irq_alloc_info *info) { - irq_attr->ioapic = ioapic; - irq_attr->ioapic_pin = ioapic_pin; - irq_attr->trigger = trigger; - irq_attr->polarity = polarity; + if (info && info->ioapic_valid) { + data->trigger = info->ioapic_trigger; + data->polarity = info->ioapic_polarity; + } else if (acpi_get_override_irq(gsi, &data->trigger, + &data->polarity) < 0) { + /* PCI interrupts are always active low level triggered. */ + data->trigger = IOAPIC_LEVEL; + data->polarity = IOAPIC_POL_LOW; + } } -int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq) +static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data, + struct IO_APIC_route_entry *entry) { - int ioapic = (int)(long)domain->host_data; - struct mp_pin_info *info = mp_pin_info(ioapic, hwirq); - struct io_apic_irq_attr attr; + memset(entry, 0, sizeof(*entry)); + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->dest = cfg->dest_apicid; + entry->vector = cfg->vector; + entry->trigger = data->trigger; + entry->polarity = data->polarity; + /* + * Mask level triggered irqs. Edge triggered irqs are masked + * by the irq core code in case they fire. + */ + if (data->trigger == IOAPIC_LEVEL) + entry->mask = IOAPIC_MASKED; + else + entry->mask = IOAPIC_UNMASKED; +} - /* Get default attribute if not set by caller yet */ - if (!info->set) { - u32 gsi = mp_pin_to_gsi(ioapic, hwirq); +int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + int ret, ioapic, pin; + struct irq_cfg *cfg; + struct irq_data *irq_data; + struct mp_chip_data *data; + struct irq_alloc_info *info = arg; - if (acpi_get_override_irq(gsi, &info->trigger, - &info->polarity) < 0) { - /* - * PCI interrupts are always polarity one level - * triggered. - */ - info->trigger = 1; - info->polarity = 1; - } - info->node = NUMA_NO_NODE; + if (!info || nr_irqs > 1) + return -EINVAL; + irq_data = irq_domain_get_irq_data(domain, virq); + if (!irq_data) + return -EINVAL; - /* - * setup_IO_APIC_irqs() programs all legacy IRQs with default - * trigger and polarity attributes. Don't set the flag for that - * case so the first legacy IRQ user could reprogram the pin - * with real trigger and polarity attributes. - */ - if (virq >= nr_legacy_irqs() || info->count) - info->set = 1; - } - set_io_apic_irq_attr(&attr, ioapic, hwirq, info->trigger, - info->polarity); + ioapic = mp_irqdomain_ioapic_idx(domain); + pin = info->ioapic_pin; + if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0) + return -EEXIST; - return io_apic_setup_irq_pin(virq, info->node, &attr); -} + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; -void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq) -{ - struct irq_data *data = irq_get_irq_data(virq); - struct irq_cfg *cfg = irq_cfg(virq); - int ioapic = (int)(long)domain->host_data; - int pin = (int)data->hwirq; + info->ioapic_entry = &data->entry; + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); + if (ret < 0) { + kfree(data); + return ret; + } + + INIT_LIST_HEAD(&data->irq_2_pin); + irq_data->hwirq = info->ioapic_pin; + irq_data->chip = (domain->parent == x86_vector_domain) ? + &ioapic_chip : &ioapic_ir_chip; + irq_data->chip_data = data; + mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info); + + cfg = irqd_cfg(irq_data); + add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin); + if (info->ioapic_entry) + mp_setup_entry(cfg, data, info->ioapic_entry); + mp_register_handler(virq, data->trigger); + if (virq < nr_legacy_irqs()) + legacy_pic->mask(virq); + + apic_printk(APIC_VERBOSE, KERN_DEBUG + "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n", + ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector, + virq, data->trigger, data->polarity, cfg->dest_apicid); - ioapic_mask_entry(ioapic, pin); - __remove_pin_from_irq(cfg, ioapic, pin); - WARN_ON(!list_empty(&cfg->irq_2_pin)); - arch_teardown_hwirq(virq); + return 0; } -int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node) +void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) { - int ret = 0; - int ioapic, pin; - struct mp_pin_info *info; + struct irq_data *irq_data; + struct mp_chip_data *data; - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) - return -ENODEV; - - pin = mp_find_ioapic_pin(ioapic, gsi); - info = mp_pin_info(ioapic, pin); - trigger = trigger ? 1 : 0; - polarity = polarity ? 1 : 0; - - mutex_lock(&ioapic_mutex); - if (!info->set) { - info->trigger = trigger; - info->polarity = polarity; - info->node = node; - info->set = 1; - } else if (info->trigger != trigger || info->polarity != polarity) { - ret = -EBUSY; + BUG_ON(nr_irqs != 1); + irq_data = irq_domain_get_irq_data(domain, virq); + if (irq_data && irq_data->chip_data) { + data = irq_data->chip_data; + __remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain), + (int)irq_data->hwirq); + WARN_ON(!list_empty(&data->irq_2_pin)); + kfree(irq_data->chip_data); } - mutex_unlock(&ioapic_mutex); - - return ret; + irq_domain_free_irqs_top(domain, virq, nr_irqs); } -/* Enable IOAPIC early just for system timer */ -void __init pre_init_apic_IRQ0(void) +void mp_irqdomain_activate(struct irq_domain *domain, + struct irq_data *irq_data) { - struct io_apic_irq_attr attr = { 0, 0, 0, 0 }; + unsigned long flags; + struct irq_pin_list *entry; + struct mp_chip_data *data = irq_data->chip_data; - printk(KERN_INFO "Early APIC setup for system timer0\n"); -#ifndef CONFIG_SMP - physid_set_mask_of_physid(boot_cpu_physical_apicid, - &phys_cpu_present_map); -#endif - setup_local_APIC(); + raw_spin_lock_irqsave(&ioapic_lock, flags); + for_each_irq_pin(entry, data->irq_2_pin) + __ioapic_write_entry(entry->apic, entry->pin, data->entry); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} - io_apic_setup_irq_pin(0, 0, &attr); - irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, - "edge"); +void mp_irqdomain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + /* It won't be called for IRQ with multiple IOAPIC pins associated */ + ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain), + (int)irq_data->hwirq); +} + +int mp_irqdomain_ioapic_idx(struct irq_domain *domain) +{ + return (int)(long)domain->host_data; } + +const struct irq_domain_ops mp_ioapic_irqdomain_ops = { + .alloc = mp_irqdomain_alloc, + .free = mp_irqdomain_free, + .activate = mp_irqdomain_activate, + .deactivate = mp_irqdomain_deactivate, +}; diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index d6ba2d660dc5..1a9d735e09c6 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -3,6 +3,8 @@ * * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo * Moved from arch/x86/kernel/apic/io_apic.c. + * Jiang Liu <jiang.liu@linux.intel.com> + * Convert to hierarchical irqdomain * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -14,22 +16,23 @@ #include <linux/dmar.h> #include <linux/hpet.h> #include <linux/msi.h> +#include <asm/irqdomain.h> #include <asm/msidef.h> #include <asm/hpet.h> #include <asm/hw_irq.h> #include <asm/apic.h> #include <asm/irq_remapping.h> -void native_compose_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id) +static struct irq_domain *msi_default_domain; + +static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) { - struct irq_cfg *cfg = irq_cfg(irq); + struct irq_cfg *cfg = irqd_cfg(data); msg->address_hi = MSI_ADDR_BASE_HI; if (x2apic_enabled()) - msg->address_hi |= MSI_ADDR_EXT_DEST_ID(dest); + msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); msg->address_lo = MSI_ADDR_BASE_LO | @@ -39,7 +42,7 @@ void native_compose_msi_msg(struct pci_dev *pdev, ((apic->irq_delivery_mode != dest_LowestPrio) ? MSI_ADDR_REDIRECTION_CPU : MSI_ADDR_REDIRECTION_LOWPRI) | - MSI_ADDR_DEST_ID(dest); + MSI_ADDR_DEST_ID(cfg->dest_apicid); msg->data = MSI_DATA_TRIGGER_EDGE | @@ -50,180 +53,201 @@ void native_compose_msi_msg(struct pci_dev *pdev, MSI_DATA_VECTOR(cfg->vector); } -static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, - struct msi_msg *msg, u8 hpet_id) +/* + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, + * which implement the MSI or MSI-X Capability Structure. + */ +static struct irq_chip pci_msi_controller = { + .name = "PCI-MSI", + .irq_unmask = pci_msi_unmask_irq, + .irq_mask = pci_msi_mask_irq, + .irq_ack = irq_chip_ack_parent, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_compose_msi_msg = irq_msi_compose_msg, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - struct irq_cfg *cfg; - int err; - unsigned dest; + struct irq_domain *domain; + struct irq_alloc_info info; - if (disable_apic) - return -ENXIO; + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_MSI; + info.msi_dev = dev; - cfg = irq_cfg(irq); - err = assign_irq_vector(irq, cfg, apic->target_cpus()); - if (err) - return err; + domain = irq_remapping_get_irq_domain(&info); + if (domain == NULL) + domain = msi_default_domain; + if (domain == NULL) + return -ENOSYS; - err = apic->cpu_mask_to_apicid_and(cfg->domain, - apic->target_cpus(), &dest); - if (err) - return err; + return pci_msi_domain_alloc_irqs(domain, dev, nvec, type); +} - x86_msi.compose_msi_msg(pdev, irq, dest, msg, hpet_id); +void native_teardown_msi_irq(unsigned int irq) +{ + irq_domain_free_irqs(irq, 1); +} - return 0; +static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info, + msi_alloc_info_t *arg) +{ + return arg->msi_hwirq; } -static int -msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) +static int pci_msi_prepare(struct irq_domain *domain, struct device *dev, + int nvec, msi_alloc_info_t *arg) { - struct irq_cfg *cfg = irqd_cfg(data); - struct msi_msg msg; - unsigned int dest; - int ret; + struct pci_dev *pdev = to_pci_dev(dev); + struct msi_desc *desc = first_pci_msi_entry(pdev); + + init_irq_alloc_info(arg, NULL); + arg->msi_dev = pdev; + if (desc->msi_attrib.is_msix) { + arg->type = X86_IRQ_ALLOC_TYPE_MSIX; + } else { + arg->type = X86_IRQ_ALLOC_TYPE_MSI; + arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; + } - ret = apic_set_affinity(data, mask, &dest); - if (ret) - return ret; + return 0; +} - __get_cached_msi_msg(data->msi_desc, &msg); +static void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) +{ + arg->msi_hwirq = pci_msi_domain_calc_hwirq(arg->msi_dev, desc); +} + +static struct msi_domain_ops pci_msi_domain_ops = { + .get_hwirq = pci_msi_get_hwirq, + .msi_prepare = pci_msi_prepare, + .set_desc = pci_msi_set_desc, +}; - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); +static struct msi_domain_info pci_msi_domain_info = { + .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_PCI_MSIX, + .ops = &pci_msi_domain_ops, + .chip = &pci_msi_controller, + .handler = handle_edge_irq, + .handler_name = "edge", +}; - __pci_write_msi_msg(data->msi_desc, &msg); +void arch_init_msi_domain(struct irq_domain *parent) +{ + if (disable_apic) + return; - return IRQ_SET_MASK_OK_NOCOPY; + msi_default_domain = pci_msi_create_irq_domain(NULL, + &pci_msi_domain_info, parent); + if (!msi_default_domain) + pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n"); } -/* - * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, - * which implement the MSI or MSI-X Capability Structure. - */ -static struct irq_chip msi_chip = { - .name = "PCI-MSI", +#ifdef CONFIG_IRQ_REMAP +static struct irq_chip pci_msi_ir_controller = { + .name = "IR-PCI-MSI", .irq_unmask = pci_msi_unmask_irq, .irq_mask = pci_msi_mask_irq, - .irq_ack = apic_ack_edge, - .irq_set_affinity = msi_set_affinity, - .irq_retrigger = apic_retrigger_irq, + .irq_ack = irq_chip_ack_parent, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_set_vcpu_affinity = irq_chip_set_vcpu_affinity_parent, .flags = IRQCHIP_SKIP_SET_WAKE, }; -int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, - unsigned int irq_base, unsigned int irq_offset) -{ - struct irq_chip *chip = &msi_chip; - struct msi_msg msg; - unsigned int irq = irq_base + irq_offset; - int ret; - - ret = msi_compose_msg(dev, irq, &msg, -1); - if (ret < 0) - return ret; - - irq_set_msi_desc_off(irq_base, irq_offset, msidesc); - - /* - * MSI-X message is written per-IRQ, the offset is always 0. - * MSI message denotes a contiguous group of IRQs, written for 0th IRQ. - */ - if (!irq_offset) - pci_write_msi_msg(irq, &msg); +static struct msi_domain_info pci_msi_ir_domain_info = { + .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX, + .ops = &pci_msi_domain_ops, + .chip = &pci_msi_ir_controller, + .handler = handle_edge_irq, + .handler_name = "edge", +}; - setup_remapped_irq(irq, irq_cfg(irq), chip); +struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent) +{ + return pci_msi_create_irq_domain(NULL, &pci_msi_ir_domain_info, parent); +} +#endif - irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); +#ifdef CONFIG_DMAR_TABLE +static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg) +{ + dmar_msi_write(data->irq, msg); +} - dev_dbg(&dev->dev, "irq %d for MSI/MSI-X\n", irq); +static struct irq_chip dmar_msi_controller = { + .name = "DMAR-MSI", + .irq_unmask = dmar_msi_unmask, + .irq_mask = dmar_msi_mask, + .irq_ack = irq_chip_ack_parent, + .irq_set_affinity = msi_domain_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_compose_msi_msg = irq_msi_compose_msg, + .irq_write_msi_msg = dmar_msi_write_msg, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; - return 0; +static irq_hw_number_t dmar_msi_get_hwirq(struct msi_domain_info *info, + msi_alloc_info_t *arg) +{ + return arg->dmar_id; } -int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +static int dmar_msi_init(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq, + irq_hw_number_t hwirq, msi_alloc_info_t *arg) { - struct msi_desc *msidesc; - unsigned int irq; - int node, ret; + irq_domain_set_info(domain, virq, arg->dmar_id, info->chip, NULL, + handle_edge_irq, arg->dmar_data, "edge"); - /* Multiple MSI vectors only supported with interrupt remapping */ - if (type == PCI_CAP_ID_MSI && nvec > 1) - return 1; + return 0; +} - node = dev_to_node(&dev->dev); +static struct msi_domain_ops dmar_msi_domain_ops = { + .get_hwirq = dmar_msi_get_hwirq, + .msi_init = dmar_msi_init, +}; - list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = irq_alloc_hwirq(node); - if (!irq) - return -ENOSPC; +static struct msi_domain_info dmar_msi_domain_info = { + .ops = &dmar_msi_domain_ops, + .chip = &dmar_msi_controller, +}; - ret = setup_msi_irq(dev, msidesc, irq, 0); - if (ret < 0) { - irq_free_hwirq(irq); - return ret; - } +static struct irq_domain *dmar_get_irq_domain(void) +{ + static struct irq_domain *dmar_domain; + static DEFINE_MUTEX(dmar_lock); - } - return 0; -} + mutex_lock(&dmar_lock); + if (dmar_domain == NULL) + dmar_domain = msi_create_irq_domain(NULL, &dmar_msi_domain_info, + x86_vector_domain); + mutex_unlock(&dmar_lock); -void native_teardown_msi_irq(unsigned int irq) -{ - irq_free_hwirq(irq); + return dmar_domain; } -#ifdef CONFIG_DMAR_TABLE -static int -dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) +int dmar_alloc_hwirq(int id, int node, void *arg) { - struct irq_cfg *cfg = irqd_cfg(data); - unsigned int dest, irq = data->irq; - struct msi_msg msg; - int ret; - - ret = apic_set_affinity(data, mask, &dest); - if (ret) - return ret; + struct irq_domain *domain = dmar_get_irq_domain(); + struct irq_alloc_info info; - dmar_msi_read(irq, &msg); + if (!domain) + return -1; - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest); + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_DMAR; + info.dmar_id = id; + info.dmar_data = arg; - dmar_msi_write(irq, &msg); - - return IRQ_SET_MASK_OK_NOCOPY; + return irq_domain_alloc_irqs(domain, 1, node, &info); } -static struct irq_chip dmar_msi_type = { - .name = "DMAR_MSI", - .irq_unmask = dmar_msi_unmask, - .irq_mask = dmar_msi_mask, - .irq_ack = apic_ack_edge, - .irq_set_affinity = dmar_msi_set_affinity, - .irq_retrigger = apic_retrigger_irq, - .flags = IRQCHIP_SKIP_SET_WAKE, -}; - -int arch_setup_dmar_msi(unsigned int irq) +void dmar_free_hwirq(int irq) { - int ret; - struct msi_msg msg; - - ret = msi_compose_msg(NULL, irq, &msg, -1); - if (ret < 0) - return ret; - dmar_msi_write(irq, &msg); - irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, - "edge"); - return 0; + irq_domain_free_irqs(irq, 1); } #endif @@ -231,56 +255,103 @@ int arch_setup_dmar_msi(unsigned int irq) * MSI message composition */ #ifdef CONFIG_HPET_TIMER +static inline int hpet_dev_id(struct irq_domain *domain) +{ + struct msi_domain_info *info = msi_get_domain_info(domain); + + return (int)(long)info->data; +} -static int hpet_msi_set_affinity(struct irq_data *data, - const struct cpumask *mask, bool force) +static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) { - struct irq_cfg *cfg = irqd_cfg(data); - struct msi_msg msg; - unsigned int dest; - int ret; + hpet_msi_write(data->handler_data, msg); +} - ret = apic_set_affinity(data, mask, &dest); - if (ret) - return ret; +static struct irq_chip hpet_msi_controller = { + .name = "HPET-MSI", + .irq_unmask = hpet_msi_unmask, + .irq_mask = hpet_msi_mask, + .irq_ack = irq_chip_ack_parent, + .irq_set_affinity = msi_domain_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_compose_msi_msg = irq_msi_compose_msg, + .irq_write_msi_msg = hpet_msi_write_msg, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; - hpet_msi_read(data->handler_data, &msg); +static irq_hw_number_t hpet_msi_get_hwirq(struct msi_domain_info *info, + msi_alloc_info_t *arg) +{ + return arg->hpet_index; +} - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); +static int hpet_msi_init(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq, + irq_hw_number_t hwirq, msi_alloc_info_t *arg) +{ + irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); + irq_domain_set_info(domain, virq, arg->hpet_index, info->chip, NULL, + handle_edge_irq, arg->hpet_data, "edge"); - hpet_msi_write(data->handler_data, &msg); + return 0; +} - return IRQ_SET_MASK_OK_NOCOPY; +static void hpet_msi_free(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq) +{ + irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); } -static struct irq_chip hpet_msi_type = { - .name = "HPET_MSI", - .irq_unmask = hpet_msi_unmask, - .irq_mask = hpet_msi_mask, - .irq_ack = apic_ack_edge, - .irq_set_affinity = hpet_msi_set_affinity, - .irq_retrigger = apic_retrigger_irq, - .flags = IRQCHIP_SKIP_SET_WAKE, +static struct msi_domain_ops hpet_msi_domain_ops = { + .get_hwirq = hpet_msi_get_hwirq, + .msi_init = hpet_msi_init, + .msi_free = hpet_msi_free, +}; + +static struct msi_domain_info hpet_msi_domain_info = { + .ops = &hpet_msi_domain_ops, + .chip = &hpet_msi_controller, }; -int default_setup_hpet_msi(unsigned int irq, unsigned int id) +struct irq_domain *hpet_create_irq_domain(int hpet_id) { - struct irq_chip *chip = &hpet_msi_type; - struct msi_msg msg; - int ret; + struct irq_domain *parent; + struct irq_alloc_info info; + struct msi_domain_info *domain_info; + + if (x86_vector_domain == NULL) + return NULL; + + domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL); + if (!domain_info) + return NULL; + + *domain_info = hpet_msi_domain_info; + domain_info->data = (void *)(long)hpet_id; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_HPET; + info.hpet_id = hpet_id; + parent = irq_remapping_get_ir_irq_domain(&info); + if (parent == NULL) + parent = x86_vector_domain; + else + hpet_msi_controller.name = "IR-HPET-MSI"; + + return msi_create_irq_domain(NULL, domain_info, parent); +} - ret = msi_compose_msg(NULL, irq, &msg, id); - if (ret < 0) - return ret; +int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev, + int dev_num) +{ + struct irq_alloc_info info; - hpet_msi_write(irq_get_handler_data(irq), &msg); - irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - setup_remapped_irq(irq, irq_cfg(irq), chip); + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_HPET; + info.hpet_data = dev; + info.hpet_id = hpet_dev_id(domain); + info.hpet_index = dev_num; - irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); - return 0; + return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info); } #endif diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 6cedd7914581..28eba2d38b15 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -3,6 +3,8 @@ * * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo * Moved from arch/x86/kernel/apic/io_apic.c. + * Jiang Liu <jiang.liu@linux.intel.com> + * Enable support of hierarchical irqdomains * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -11,15 +13,28 @@ #include <linux/interrupt.h> #include <linux/init.h> #include <linux/compiler.h> -#include <linux/irqdomain.h> #include <linux/slab.h> +#include <asm/irqdomain.h> #include <asm/hw_irq.h> #include <asm/apic.h> #include <asm/i8259.h> #include <asm/desc.h> #include <asm/irq_remapping.h> +struct apic_chip_data { + struct irq_cfg cfg; + cpumask_var_t domain; + cpumask_var_t old_domain; + u8 move_in_progress : 1; +}; + +struct irq_domain *x86_vector_domain; static DEFINE_RAW_SPINLOCK(vector_lock); +static cpumask_var_t vector_cpumask; +static struct irq_chip lapic_controller; +#ifdef CONFIG_X86_IO_APIC +static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY]; +#endif void lock_vector_lock(void) { @@ -34,71 +49,59 @@ void unlock_vector_lock(void) raw_spin_unlock(&vector_lock); } -struct irq_cfg *irq_cfg(unsigned int irq) +static struct apic_chip_data *apic_chip_data(struct irq_data *irq_data) { - return irq_get_chip_data(irq); + if (!irq_data) + return NULL; + + while (irq_data->parent_data) + irq_data = irq_data->parent_data; + + return irq_data->chip_data; } struct irq_cfg *irqd_cfg(struct irq_data *irq_data) { - return irq_data->chip_data; + struct apic_chip_data *data = apic_chip_data(irq_data); + + return data ? &data->cfg : NULL; } -static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node) +struct irq_cfg *irq_cfg(unsigned int irq) { - struct irq_cfg *cfg; + return irqd_cfg(irq_get_irq_data(irq)); +} - cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node); - if (!cfg) +static struct apic_chip_data *alloc_apic_chip_data(int node) +{ + struct apic_chip_data *data; + + data = kzalloc_node(sizeof(*data), GFP_KERNEL, node); + if (!data) return NULL; - if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node)) - goto out_cfg; - if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node)) + if (!zalloc_cpumask_var_node(&data->domain, GFP_KERNEL, node)) + goto out_data; + if (!zalloc_cpumask_var_node(&data->old_domain, GFP_KERNEL, node)) goto out_domain; -#ifdef CONFIG_X86_IO_APIC - INIT_LIST_HEAD(&cfg->irq_2_pin); -#endif - return cfg; + return data; out_domain: - free_cpumask_var(cfg->domain); -out_cfg: - kfree(cfg); + free_cpumask_var(data->domain); +out_data: + kfree(data); return NULL; } -struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) +static void free_apic_chip_data(struct apic_chip_data *data) { - int res = irq_alloc_desc_at(at, node); - struct irq_cfg *cfg; - - if (res < 0) { - if (res != -EEXIST) - return NULL; - cfg = irq_cfg(at); - if (cfg) - return cfg; + if (data) { + free_cpumask_var(data->domain); + free_cpumask_var(data->old_domain); + kfree(data); } - - cfg = alloc_irq_cfg(at, node); - if (cfg) - irq_set_chip_data(at, cfg); - else - irq_free_desc(at); - return cfg; -} - -static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) -{ - if (!cfg) - return; - irq_set_chip_data(at, NULL); - free_cpumask_var(cfg->domain); - free_cpumask_var(cfg->old_domain); - kfree(cfg); } -static int -__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +static int __assign_irq_vector(int irq, struct apic_chip_data *d, + const struct cpumask *mask) { /* * NOTE! The local APIC isn't very good at handling @@ -114,36 +117,33 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; static int current_offset = VECTOR_OFFSET_START % 16; int cpu, err; - cpumask_var_t tmp_mask; - if (cfg->move_in_progress) + if (d->move_in_progress) return -EBUSY; - if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) - return -ENOMEM; - /* Only try and allocate irqs on cpus that are present */ err = -ENOSPC; - cpumask_clear(cfg->old_domain); + cpumask_clear(d->old_domain); cpu = cpumask_first_and(mask, cpu_online_mask); while (cpu < nr_cpu_ids) { int new_cpu, vector, offset; - apic->vector_allocation_domain(cpu, tmp_mask, mask); + apic->vector_allocation_domain(cpu, vector_cpumask, mask); - if (cpumask_subset(tmp_mask, cfg->domain)) { + if (cpumask_subset(vector_cpumask, d->domain)) { err = 0; - if (cpumask_equal(tmp_mask, cfg->domain)) + if (cpumask_equal(vector_cpumask, d->domain)) break; /* * New cpumask using the vector is a proper subset of * the current in use mask. So cleanup the vector * allocation for the members that are not used anymore. */ - cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask); - cfg->move_in_progress = - cpumask_intersects(cfg->old_domain, cpu_online_mask); - cpumask_and(cfg->domain, cfg->domain, tmp_mask); + cpumask_andnot(d->old_domain, d->domain, + vector_cpumask); + d->move_in_progress = + cpumask_intersects(d->old_domain, cpu_online_mask); + cpumask_and(d->domain, d->domain, vector_cpumask); break; } @@ -157,16 +157,18 @@ next: } if (unlikely(current_vector == vector)) { - cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask); - cpumask_andnot(tmp_mask, mask, cfg->old_domain); - cpu = cpumask_first_and(tmp_mask, cpu_online_mask); + cpumask_or(d->old_domain, d->old_domain, + vector_cpumask); + cpumask_andnot(vector_cpumask, mask, d->old_domain); + cpu = cpumask_first_and(vector_cpumask, + cpu_online_mask); continue; } if (test_bit(vector, used_vectors)) goto next; - for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) { + for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) { if (per_cpu(vector_irq, new_cpu)[vector] > VECTOR_UNDEFINED) goto next; @@ -174,55 +176,73 @@ next: /* Found one! */ current_vector = vector; current_offset = offset; - if (cfg->vector) { - cpumask_copy(cfg->old_domain, cfg->domain); - cfg->move_in_progress = - cpumask_intersects(cfg->old_domain, cpu_online_mask); + if (d->cfg.vector) { + cpumask_copy(d->old_domain, d->domain); + d->move_in_progress = + cpumask_intersects(d->old_domain, cpu_online_mask); } - for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) + for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) per_cpu(vector_irq, new_cpu)[vector] = irq; - cfg->vector = vector; - cpumask_copy(cfg->domain, tmp_mask); + d->cfg.vector = vector; + cpumask_copy(d->domain, vector_cpumask); err = 0; break; } - free_cpumask_var(tmp_mask); + + if (!err) { + /* cache destination APIC IDs into cfg->dest_apicid */ + err = apic->cpu_mask_to_apicid_and(mask, d->domain, + &d->cfg.dest_apicid); + } return err; } -int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +static int assign_irq_vector(int irq, struct apic_chip_data *data, + const struct cpumask *mask) { int err; unsigned long flags; raw_spin_lock_irqsave(&vector_lock, flags); - err = __assign_irq_vector(irq, cfg, mask); + err = __assign_irq_vector(irq, data, mask); raw_spin_unlock_irqrestore(&vector_lock, flags); return err; } -void clear_irq_vector(int irq, struct irq_cfg *cfg) +static int assign_irq_vector_policy(int irq, int node, + struct apic_chip_data *data, + struct irq_alloc_info *info) +{ + if (info && info->mask) + return assign_irq_vector(irq, data, info->mask); + if (node != NUMA_NO_NODE && + assign_irq_vector(irq, data, cpumask_of_node(node)) == 0) + return 0; + return assign_irq_vector(irq, data, apic->target_cpus()); +} + +static void clear_irq_vector(int irq, struct apic_chip_data *data) { int cpu, vector; unsigned long flags; raw_spin_lock_irqsave(&vector_lock, flags); - BUG_ON(!cfg->vector); + BUG_ON(!data->cfg.vector); - vector = cfg->vector; - for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) + vector = data->cfg.vector; + for_each_cpu_and(cpu, data->domain, cpu_online_mask) per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; - cfg->vector = 0; - cpumask_clear(cfg->domain); + data->cfg.vector = 0; + cpumask_clear(data->domain); - if (likely(!cfg->move_in_progress)) { + if (likely(!data->move_in_progress)) { raw_spin_unlock_irqrestore(&vector_lock, flags); return; } - for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { + for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) { for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { if (per_cpu(vector_irq, cpu)[vector] != irq) @@ -231,10 +251,95 @@ void clear_irq_vector(int irq, struct irq_cfg *cfg) break; } } - cfg->move_in_progress = 0; + data->move_in_progress = 0; raw_spin_unlock_irqrestore(&vector_lock, flags); } +void init_irq_alloc_info(struct irq_alloc_info *info, + const struct cpumask *mask) +{ + memset(info, 0, sizeof(*info)); + info->mask = mask; +} + +void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src) +{ + if (src) + *dst = *src; + else + memset(dst, 0, sizeof(*dst)); +} + +static void x86_vector_free_irqs(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + struct irq_data *irq_data; + int i; + + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i); + if (irq_data && irq_data->chip_data) { + clear_irq_vector(virq + i, irq_data->chip_data); + free_apic_chip_data(irq_data->chip_data); +#ifdef CONFIG_X86_IO_APIC + if (virq + i < nr_legacy_irqs()) + legacy_irq_data[virq + i] = NULL; +#endif + irq_domain_reset_irq_data(irq_data); + } + } +} + +static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct irq_alloc_info *info = arg; + struct apic_chip_data *data; + struct irq_data *irq_data; + int i, err; + + if (disable_apic) + return -ENXIO; + + /* Currently vector allocator can't guarantee contiguous allocations */ + if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1) + return -ENOSYS; + + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_domain_get_irq_data(domain, virq + i); + BUG_ON(!irq_data); +#ifdef CONFIG_X86_IO_APIC + if (virq + i < nr_legacy_irqs() && legacy_irq_data[virq + i]) + data = legacy_irq_data[virq + i]; + else +#endif + data = alloc_apic_chip_data(irq_data->node); + if (!data) { + err = -ENOMEM; + goto error; + } + + irq_data->chip = &lapic_controller; + irq_data->chip_data = data; + irq_data->hwirq = virq + i; + err = assign_irq_vector_policy(virq, irq_data->node, data, + info); + if (err) + goto error; + } + + return 0; + +error: + x86_vector_free_irqs(domain, virq, i + 1); + return err; +} + +static const struct irq_domain_ops x86_vector_domain_ops = { + .alloc = x86_vector_alloc_irqs, + .free = x86_vector_free_irqs, +}; + int __init arch_probe_nr_irqs(void) { int nr; @@ -258,8 +363,43 @@ int __init arch_probe_nr_irqs(void) return nr_legacy_irqs(); } +#ifdef CONFIG_X86_IO_APIC +static void init_legacy_irqs(void) +{ + int i, node = cpu_to_node(0); + struct apic_chip_data *data; + + /* + * For legacy IRQ's, start with assigning irq0 to irq15 to + * ISA_IRQ_VECTOR(i) for all cpu's. + */ + for (i = 0; i < nr_legacy_irqs(); i++) { + data = legacy_irq_data[i] = alloc_apic_chip_data(node); + BUG_ON(!data); + + data->cfg.vector = ISA_IRQ_VECTOR(i); + cpumask_setall(data->domain); + irq_set_chip_data(i, data); + } +} +#else +static void init_legacy_irqs(void) { } +#endif + int __init arch_early_irq_init(void) { + init_legacy_irqs(); + + x86_vector_domain = irq_domain_add_tree(NULL, &x86_vector_domain_ops, + NULL); + BUG_ON(x86_vector_domain == NULL); + irq_set_default_host(x86_vector_domain); + + arch_init_msi_domain(x86_vector_domain); + arch_init_htirq_domain(x86_vector_domain); + + BUG_ON(!alloc_cpumask_var(&vector_cpumask, GFP_KERNEL)); + return arch_early_ioapic_init(); } @@ -267,7 +407,7 @@ static void __setup_vector_irq(int cpu) { /* Initialize vector_irq on a new cpu */ int irq, vector; - struct irq_cfg *cfg; + struct apic_chip_data *data; /* * vector_lock will make sure that we don't run into irq vector @@ -277,13 +417,13 @@ static void __setup_vector_irq(int cpu) raw_spin_lock(&vector_lock); /* Mark the inuse vectors */ for_each_active_irq(irq) { - cfg = irq_cfg(irq); - if (!cfg) + data = apic_chip_data(irq_get_irq_data(irq)); + if (!data) continue; - if (!cpumask_test_cpu(cpu, cfg->domain)) + if (!cpumask_test_cpu(cpu, data->domain)) continue; - vector = cfg->vector; + vector = data->cfg.vector; per_cpu(vector_irq, cpu)[vector] = irq; } /* Mark the free vectors */ @@ -292,8 +432,8 @@ static void __setup_vector_irq(int cpu) if (irq <= VECTOR_UNDEFINED) continue; - cfg = irq_cfg(irq); - if (!cpumask_test_cpu(cpu, cfg->domain)) + data = apic_chip_data(irq_get_irq_data(irq)); + if (!cpumask_test_cpu(cpu, data->domain)) per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; } raw_spin_unlock(&vector_lock); @@ -314,20 +454,20 @@ void setup_vector_irq(int cpu) * legacy vector to irq mapping: */ for (irq = 0; irq < nr_legacy_irqs(); irq++) - per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq; + per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq; __setup_vector_irq(cpu); } -int apic_retrigger_irq(struct irq_data *data) +static int apic_retrigger_irq(struct irq_data *irq_data) { - struct irq_cfg *cfg = irqd_cfg(data); + struct apic_chip_data *data = apic_chip_data(irq_data); unsigned long flags; int cpu; raw_spin_lock_irqsave(&vector_lock, flags); - cpu = cpumask_first_and(cfg->domain, cpu_online_mask); - apic->send_IPI_mask(cpumask_of(cpu), cfg->vector); + cpu = cpumask_first_and(data->domain, cpu_online_mask); + apic->send_IPI_mask(cpumask_of(cpu), data->cfg.vector); raw_spin_unlock_irqrestore(&vector_lock, flags); return 1; @@ -340,73 +480,76 @@ void apic_ack_edge(struct irq_data *data) ack_APIC_irq(); } -/* - * Either sets data->affinity to a valid value, and returns - * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and - * leaves data->affinity untouched. - */ -int apic_set_affinity(struct irq_data *data, const struct cpumask *mask, - unsigned int *dest_id) +static int apic_set_affinity(struct irq_data *irq_data, + const struct cpumask *dest, bool force) { - struct irq_cfg *cfg = irqd_cfg(data); - unsigned int irq = data->irq; - int err; + struct apic_chip_data *data = irq_data->chip_data; + int err, irq = irq_data->irq; if (!config_enabled(CONFIG_SMP)) return -EPERM; - if (!cpumask_intersects(mask, cpu_online_mask)) + if (!cpumask_intersects(dest, cpu_online_mask)) return -EINVAL; - err = assign_irq_vector(irq, cfg, mask); - if (err) - return err; - - err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id); + err = assign_irq_vector(irq, data, dest); if (err) { - if (assign_irq_vector(irq, cfg, data->affinity)) + struct irq_data *top = irq_get_irq_data(irq); + + if (assign_irq_vector(irq, data, top->affinity)) pr_err("Failed to recover vector for irq %d\n", irq); return err; } - cpumask_copy(data->affinity, mask); - - return 0; + return IRQ_SET_MASK_OK; } +static struct irq_chip lapic_controller = { + .irq_ack = apic_ack_edge, + .irq_set_affinity = apic_set_affinity, + .irq_retrigger = apic_retrigger_irq, +}; + #ifdef CONFIG_SMP -void send_cleanup_vector(struct irq_cfg *cfg) +static void __send_cleanup_vector(struct apic_chip_data *data) { cpumask_var_t cleanup_mask; if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { unsigned int i; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + for_each_cpu_and(i, data->old_domain, cpu_online_mask) apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); } else { - cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); + cpumask_and(cleanup_mask, data->old_domain, cpu_online_mask); apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); free_cpumask_var(cleanup_mask); } - cfg->move_in_progress = 0; + data->move_in_progress = 0; +} + +void send_cleanup_vector(struct irq_cfg *cfg) +{ + struct apic_chip_data *data; + + data = container_of(cfg, struct apic_chip_data, cfg); + if (data->move_in_progress) + __send_cleanup_vector(data); } asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; - ack_APIC_irq(); - irq_enter(); - exit_idle(); + entering_ack_irq(); me = smp_processor_id(); for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { int irq; unsigned int irr; struct irq_desc *desc; - struct irq_cfg *cfg; + struct apic_chip_data *data; irq = __this_cpu_read(vector_irq[vector]); @@ -417,8 +560,8 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) if (!desc) continue; - cfg = irq_cfg(irq); - if (!cfg) + data = apic_chip_data(&desc->irq_data); + if (!data) continue; raw_spin_lock(&desc->lock); @@ -427,10 +570,11 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) * Check if the irq migration is in progress. If so, we * haven't received the cleanup request yet for this irq. */ - if (cfg->move_in_progress) + if (data->move_in_progress) goto unlock; - if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) + if (vector == data->cfg.vector && + cpumask_test_cpu(me, data->domain)) goto unlock; irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); @@ -450,20 +594,21 @@ unlock: raw_spin_unlock(&desc->lock); } - irq_exit(); + exiting_irq(); } static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) { unsigned me; + struct apic_chip_data *data; - if (likely(!cfg->move_in_progress)) + data = container_of(cfg, struct apic_chip_data, cfg); + if (likely(!data->move_in_progress)) return; me = smp_processor_id(); - - if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) - send_cleanup_vector(cfg); + if (vector == data->cfg.vector && cpumask_test_cpu(me, data->domain)) + __send_cleanup_vector(data); } void irq_complete_move(struct irq_cfg *cfg) @@ -475,46 +620,11 @@ void irq_force_complete_move(int irq) { struct irq_cfg *cfg = irq_cfg(irq); - if (!cfg) - return; - - __irq_complete_move(cfg, cfg->vector); + if (cfg) + __irq_complete_move(cfg, cfg->vector); } #endif -/* - * Dynamic irq allocate and deallocation. Should be replaced by irq domains! - */ -int arch_setup_hwirq(unsigned int irq, int node) -{ - struct irq_cfg *cfg; - unsigned long flags; - int ret; - - cfg = alloc_irq_cfg(irq, node); - if (!cfg) - return -ENOMEM; - - raw_spin_lock_irqsave(&vector_lock, flags); - ret = __assign_irq_vector(irq, cfg, apic->target_cpus()); - raw_spin_unlock_irqrestore(&vector_lock, flags); - - if (!ret) - irq_set_chip_data(irq, cfg); - else - free_irq_cfg(irq, cfg); - return ret; -} - -void arch_teardown_hwirq(unsigned int irq) -{ - struct irq_cfg *cfg = irq_cfg(irq); - - free_remapped_irq(irq); - clear_irq_vector(irq, cfg); - free_irq_cfg(irq, cfg); -} - static void __init print_APIC_field(int base) { int i; diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 6fae733e9194..3ffd925655e0 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -21,11 +21,13 @@ early_param("x2apic_phys", set_x2apic_phys_mode); static bool x2apic_fadt_phys(void) { +#ifdef CONFIG_ACPI if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) && (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { printk(KERN_DEBUG "System requires x2apic physical mode\n"); return true; } +#endif return false; } diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 9f6b9341950f..8e3d22a1af94 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -41,6 +41,25 @@ void common(void) { OFFSET(pbe_orig_address, pbe, orig_address); OFFSET(pbe_next, pbe, next); +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) + BLANK(); + OFFSET(IA32_SIGCONTEXT_ax, sigcontext_ia32, ax); + OFFSET(IA32_SIGCONTEXT_bx, sigcontext_ia32, bx); + OFFSET(IA32_SIGCONTEXT_cx, sigcontext_ia32, cx); + OFFSET(IA32_SIGCONTEXT_dx, sigcontext_ia32, dx); + OFFSET(IA32_SIGCONTEXT_si, sigcontext_ia32, si); + OFFSET(IA32_SIGCONTEXT_di, sigcontext_ia32, di); + OFFSET(IA32_SIGCONTEXT_bp, sigcontext_ia32, bp); + OFFSET(IA32_SIGCONTEXT_sp, sigcontext_ia32, sp); + OFFSET(IA32_SIGCONTEXT_ip, sigcontext_ia32, ip); + + BLANK(); + OFFSET(TI_sysenter_return, thread_info, sysenter_return); + + BLANK(); + OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); +#endif + #ifdef CONFIG_PARAVIRT BLANK(); OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); @@ -49,7 +68,9 @@ void common(void) { OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_CPU_iret, pv_cpu_ops, iret); +#ifdef CONFIG_X86_32 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); +#endif OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); #endif diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 47703aed74cf..6ce39025f467 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -17,17 +17,6 @@ void foo(void); void foo(void) { - OFFSET(IA32_SIGCONTEXT_ax, sigcontext, ax); - OFFSET(IA32_SIGCONTEXT_bx, sigcontext, bx); - OFFSET(IA32_SIGCONTEXT_cx, sigcontext, cx); - OFFSET(IA32_SIGCONTEXT_dx, sigcontext, dx); - OFFSET(IA32_SIGCONTEXT_si, sigcontext, si); - OFFSET(IA32_SIGCONTEXT_di, sigcontext, di); - OFFSET(IA32_SIGCONTEXT_bp, sigcontext, bp); - OFFSET(IA32_SIGCONTEXT_sp, sigcontext, sp); - OFFSET(IA32_SIGCONTEXT_ip, sigcontext, ip); - BLANK(); - OFFSET(CPUINFO_x86, cpuinfo_x86, x86); OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor); OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model); @@ -37,10 +26,6 @@ void foo(void) OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); BLANK(); - OFFSET(TI_sysenter_return, thread_info, sysenter_return); - OFFSET(TI_cpu, thread_info, cpu); - BLANK(); - OFFSET(PT_EBX, pt_regs, bx); OFFSET(PT_ECX, pt_regs, cx); OFFSET(PT_EDX, pt_regs, dx); @@ -60,9 +45,6 @@ void foo(void) OFFSET(PT_OLDSS, pt_regs, ss); BLANK(); - OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); - BLANK(); - OFFSET(saved_context_gdt_desc, saved_context, gdt_desc); BLANK(); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 5ce6f2da8763..dcaab87da629 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -29,27 +29,6 @@ int main(void) BLANK(); #endif -#ifdef CONFIG_IA32_EMULATION - OFFSET(TI_sysenter_return, thread_info, sysenter_return); - BLANK(); - -#define ENTRY(entry) OFFSET(IA32_SIGCONTEXT_ ## entry, sigcontext_ia32, entry) - ENTRY(ax); - ENTRY(bx); - ENTRY(cx); - ENTRY(dx); - ENTRY(si); - ENTRY(di); - ENTRY(bp); - ENTRY(sp); - ENTRY(ip); - BLANK(); -#undef ENTRY - - OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); - BLANK(); -#endif - #define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry) ENTRY(bx); ENTRY(cx); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e4cf63301ff4..94e7051fba1a 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -520,8 +520,16 @@ static void early_init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_K6_MTRR); #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) - /* check CPU config space for extended APIC ID */ - if (cpu_has_apic && c->x86 >= 0xf) { + /* + * ApicID can always be treated as an 8-bit value for AMD APIC versions + * >= 0x10, but even old K8s came out of reset with version 0x10. So, we + * can safely set X86_FEATURE_EXTD_APICID unconditionally for families + * after 16h. + */ + if (cpu_has_apic && c->x86 > 0x16) { + set_cpu_cap(c, X86_FEATURE_EXTD_APICID); + } else if (cpu_has_apic && c->x86 >= 0xf) { + /* check CPU config space for extended APIC ID */ unsigned int val; val = read_pci_config(0, 24, 0, 0x68); if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 03445346ee0a..bd17db15a2c1 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -12,57 +12,11 @@ #include <asm/bugs.h> #include <asm/processor.h> #include <asm/processor-flags.h> -#include <asm/i387.h> +#include <asm/fpu/internal.h> #include <asm/msr.h> #include <asm/paravirt.h> #include <asm/alternative.h> -static double __initdata x = 4195835.0; -static double __initdata y = 3145727.0; - -/* - * This used to check for exceptions.. - * However, it turns out that to support that, - * the XMM trap handlers basically had to - * be buggy. So let's have a correct XMM trap - * handler, and forget about printing out - * some status at boot. - * - * We should really only care about bugs here - * anyway. Not features. - */ -static void __init check_fpu(void) -{ - s32 fdiv_bug; - - kernel_fpu_begin(); - - /* - * trap_init() enabled FXSR and company _before_ testing for FP - * problems here. - * - * Test for the divl bug: http://en.wikipedia.org/wiki/Fdiv_bug - */ - __asm__("fninit\n\t" - "fldl %1\n\t" - "fdivl %2\n\t" - "fmull %2\n\t" - "fldl %1\n\t" - "fsubp %%st,%%st(1)\n\t" - "fistpl %0\n\t" - "fwait\n\t" - "fninit" - : "=m" (*&fdiv_bug) - : "m" (*&x), "m" (*&y)); - - kernel_fpu_end(); - - if (fdiv_bug) { - set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV); - pr_warn("Hmm, FPU with FDIV bug\n"); - } -} - void __init check_bugs(void) { identify_boot_cpu(); @@ -85,10 +39,5 @@ void __init check_bugs(void) '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); alternative_instructions(); - /* - * kernel_fpu_begin/end() in check_fpu() relies on the patched - * alternative instructions. - */ - if (cpu_has_fpu) - check_fpu(); + fpu__init_check_bugs(); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a62cf04dac8a..167f2f66bc49 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -31,8 +31,7 @@ #include <asm/setup.h> #include <asm/apic.h> #include <asm/desc.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> +#include <asm/fpu/internal.h> #include <asm/mtrr.h> #include <linux/numa.h> #include <asm/asm.h> @@ -145,33 +144,6 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); -static int __init x86_xsave_setup(char *s) -{ - if (strlen(s)) - return 0; - setup_clear_cpu_cap(X86_FEATURE_XSAVE); - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - setup_clear_cpu_cap(X86_FEATURE_XSAVES); - setup_clear_cpu_cap(X86_FEATURE_AVX); - setup_clear_cpu_cap(X86_FEATURE_AVX2); - return 1; -} -__setup("noxsave", x86_xsave_setup); - -static int __init x86_xsaveopt_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - return 1; -} -__setup("noxsaveopt", x86_xsaveopt_setup); - -static int __init x86_xsaves_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_XSAVES); - return 1; -} -__setup("noxsaves", x86_xsaves_setup); - #ifdef CONFIG_X86_32 static int cachesize_override = -1; static int disable_x86_serial_nr = 1; @@ -183,14 +155,6 @@ static int __init cachesize_setup(char *str) } __setup("cachesize=", cachesize_setup); -static int __init x86_fxsr_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_FXSR); - setup_clear_cpu_cap(X86_FEATURE_XMM); - return 1; -} -__setup("nofxsr", x86_fxsr_setup); - static int __init x86_sep_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_SEP); @@ -759,7 +723,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) cpu_detect(c); get_cpu_vendor(c); get_cpu_cap(c); - fpu_detect(c); + fpu__init_system(c); if (this_cpu->c_early_init) this_cpu->c_early_init(c); @@ -1155,10 +1119,6 @@ static __init int setup_disablecpuid(char *arg) } __setup("clearcpuid=", setup_disablecpuid); -DEFINE_PER_CPU(unsigned long, kernel_stack) = - (unsigned long)&init_thread_union + THREAD_SIZE; -EXPORT_PER_CPU_SYMBOL(kernel_stack); - #ifdef CONFIG_X86_64 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1, @@ -1183,8 +1143,6 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; EXPORT_PER_CPU_SYMBOL(__preempt_count); -DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); - /* * Special IST stacks which the CPU switches to when it calls * an IST-marked descriptor entry. Up to 7 stacks (hardware @@ -1275,7 +1233,6 @@ DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; EXPORT_PER_CPU_SYMBOL(__preempt_count); -DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); /* * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find @@ -1439,7 +1396,7 @@ void cpu_init(void) clear_all_debug_regs(); dbg_restore_debug_regs(); - fpu_init(); + fpu__init_cpu(); if (is_uv_system()) uv_cpu_init(); @@ -1495,7 +1452,7 @@ void cpu_init(void) clear_all_debug_regs(); dbg_restore_debug_regs(); - fpu_init(); + fpu__init_cpu(); } #endif diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e535533d5ab8..4d450ac74e3d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -708,6 +708,7 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, struct pt_regs *regs) { int i, ret = 0; + char *tmp; for (i = 0; i < mca_cfg.banks; i++) { m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); @@ -716,9 +717,11 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, if (quirk_no_way_out) quirk_no_way_out(i, m, regs); } - if (mce_severity(m, mca_cfg.tolerant, msg, true) >= - MCE_PANIC_SEVERITY) + + if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { + *msg = tmp; ret = 1; + } } return ret; } @@ -1637,10 +1640,16 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) mce_intel_feature_init(c); mce_adjust_timer = cmci_intel_adjust_timer; break; - case X86_VENDOR_AMD: + + case X86_VENDOR_AMD: { + u32 ebx = cpuid_ebx(0x80000007); + mce_amd_feature_init(c); - mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1; + mce_flags.overflow_recov = !!(ebx & BIT(0)); + mce_flags.succor = !!(ebx & BIT(1)); break; + } + default: break; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 55ad9b37cae8..e99b15077e94 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -1,19 +1,13 @@ /* - * (c) 2005-2012 Advanced Micro Devices, Inc. + * (c) 2005-2015 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the * GNU general public license version 2. See "COPYING" or * http://www.gnu.org/licenses/gpl.html * * Written by Jacob Shin - AMD, Inc. - * * Maintained by: Borislav Petkov <bp@alien8.de> * - * April 2006 - * - added support for AMD Family 0x10 processors - * May 2012 - * - major scrubbing - * - * All MC4_MISCi registers are shared between multi-cores + * All MC4_MISCi registers are shared between cores on a node. */ #include <linux/interrupt.h> #include <linux/notifier.h> @@ -32,6 +26,7 @@ #include <asm/idle.h> #include <asm/mce.h> #include <asm/msr.h> +#include <asm/trace/irq_vectors.h> #define NR_BLOCKS 9 #define THRESHOLD_MAX 0xFFF @@ -47,6 +42,13 @@ #define MASK_BLKPTR_LO 0xFF000000 #define MCG_XBLK_ADDR 0xC0000400 +/* Deferred error settings */ +#define MSR_CU_DEF_ERR 0xC0000410 +#define MASK_DEF_LVTOFF 0x000000F0 +#define MASK_DEF_INT_TYPE 0x00000006 +#define DEF_LVT_OFF 0x2 +#define DEF_INT_TYPE_APIC 0x2 + static const char * const th_names[] = { "load_store", "insn_fetch", @@ -60,6 +62,13 @@ static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ static void amd_threshold_interrupt(void); +static void amd_deferred_error_interrupt(void); + +static void default_deferred_error_interrupt(void) +{ + pr_err("Unexpected deferred interrupt at vector %x\n", DEFERRED_ERROR_VECTOR); +} +void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; /* * CPU Initialization @@ -196,7 +205,7 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset) threshold_restart_bank(&tr); }; -static int setup_APIC_mce(int reserved, int new) +static int setup_APIC_mce_threshold(int reserved, int new) { if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR, APIC_EILVT_MSG_FIX, 0)) @@ -205,6 +214,39 @@ static int setup_APIC_mce(int reserved, int new) return reserved; } +static int setup_APIC_deferred_error(int reserved, int new) +{ + if (reserved < 0 && !setup_APIC_eilvt(new, DEFERRED_ERROR_VECTOR, + APIC_EILVT_MSG_FIX, 0)) + return new; + + return reserved; +} + +static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) +{ + u32 low = 0, high = 0; + int def_offset = -1, def_new; + + if (rdmsr_safe(MSR_CU_DEF_ERR, &low, &high)) + return; + + def_new = (low & MASK_DEF_LVTOFF) >> 4; + if (!(low & MASK_DEF_LVTOFF)) { + pr_err(FW_BUG "Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly.\n"); + def_new = DEF_LVT_OFF; + low = (low & ~MASK_DEF_LVTOFF) | (DEF_LVT_OFF << 4); + } + + def_offset = setup_APIC_deferred_error(def_offset, def_new); + if ((def_offset == def_new) && + (deferred_error_int_vector != amd_deferred_error_interrupt)) + deferred_error_int_vector = amd_deferred_error_interrupt; + + low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC; + wrmsr(MSR_CU_DEF_ERR, low, high); +} + /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { @@ -252,7 +294,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) b.interrupt_enable = 1; new = (high & MASK_LVTOFF_HI) >> 20; - offset = setup_APIC_mce(offset, new); + offset = setup_APIC_mce_threshold(offset, new); if ((offset == new) && (mce_threshold_vector != amd_threshold_interrupt)) @@ -262,6 +304,73 @@ init: mce_threshold_block_init(&b, offset); } } + + if (mce_flags.succor) + deferred_error_interrupt_enable(c); +} + +static void __log_error(unsigned int bank, bool threshold_err, u64 misc) +{ + struct mce m; + u64 status; + + rdmsrl(MSR_IA32_MCx_STATUS(bank), status); + if (!(status & MCI_STATUS_VAL)) + return; + + mce_setup(&m); + + m.status = status; + m.bank = bank; + + if (threshold_err) + m.misc = misc; + + if (m.status & MCI_STATUS_ADDRV) + rdmsrl(MSR_IA32_MCx_ADDR(bank), m.addr); + + mce_log(&m); + wrmsrl(MSR_IA32_MCx_STATUS(bank), 0); +} + +static inline void __smp_deferred_error_interrupt(void) +{ + inc_irq_stat(irq_deferred_error_count); + deferred_error_int_vector(); +} + +asmlinkage __visible void smp_deferred_error_interrupt(void) +{ + entering_irq(); + __smp_deferred_error_interrupt(); + exiting_ack_irq(); +} + +asmlinkage __visible void smp_trace_deferred_error_interrupt(void) +{ + entering_irq(); + trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); + __smp_deferred_error_interrupt(); + trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); + exiting_ack_irq(); +} + +/* APIC interrupt handler for deferred errors */ +static void amd_deferred_error_interrupt(void) +{ + u64 status; + unsigned int bank; + + for (bank = 0; bank < mca_cfg.banks; ++bank) { + rdmsrl(MSR_IA32_MCx_STATUS(bank), status); + + if (!(status & MCI_STATUS_VAL) || + !(status & MCI_STATUS_DEFERRED)) + continue; + + __log_error(bank, false, 0); + break; + } } /* @@ -273,12 +382,12 @@ init: * the interrupt goes off when error_count reaches threshold_limit. * the handler will simply log mcelog w/ software defined bank number. */ + static void amd_threshold_interrupt(void) { u32 low = 0, high = 0, address = 0; int cpu = smp_processor_id(); unsigned int bank, block; - struct mce m; /* assume first bank caused it */ for (bank = 0; bank < mca_cfg.banks; ++bank) { @@ -321,15 +430,7 @@ static void amd_threshold_interrupt(void) return; log: - mce_setup(&m); - rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status); - if (!(m.status & MCI_STATUS_VAL)) - return; - m.misc = ((u64)high << 32) | low; - m.bank = bank; - mce_log(&m); - - wrmsrl(MSR_IA32_MCx_STATUS(bank), 0); + __log_error(bank, true, ((u64)high << 32) | low); } /* diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c index 737737edbd1e..9208a36d0f03 100644 --- a/arch/x86/kernel/cpu/microcode/amd_early.c +++ b/arch/x86/kernel/cpu/microcode/amd_early.c @@ -228,7 +228,18 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) } } -void __init load_ucode_amd_bsp(void) +static bool __init load_builtin_amd_microcode(struct cpio_data *cp, int family) +{ + char fw_name[36] = "amd-ucode/microcode_amd.bin"; + + if (family >= 0x15) + snprintf(fw_name, sizeof(fw_name), + "amd-ucode/microcode_amd_fam%.2xh.bin", family); + + return get_builtin_firmware(cp, fw_name); +} + +void __init load_ucode_amd_bsp(int family) { struct cpio_data cp; void **data; @@ -243,8 +254,10 @@ void __init load_ucode_amd_bsp(void) #endif cp = find_ucode_in_initrd(); - if (!cp.data) - return; + if (!cp.data) { + if (!load_builtin_amd_microcode(&cp, family)) + return; + } *data = cp.data; *size = cp.size; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 36a83617eb21..6236a54a63f4 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -1,74 +1,16 @@ /* - * Intel CPU Microcode Update Driver for Linux + * CPU Microcode Update Driver for Linux * - * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> - * 2006 Shaohua Li <shaohua.li@intel.com> + * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> + * 2006 Shaohua Li <shaohua.li@intel.com> + * 2013-2015 Borislav Petkov <bp@alien8.de> * - * This driver allows to upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, - * Pentium III, Xeon, Pentium 4, etc. + * This driver allows to upgrade microcode on x86 processors. * - * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture - * Software Developer's Manual - * Order Number 253668 or free download from: - * - * http://developer.intel.com/Assets/PDF/manual/253668.pdf - * - * For more information, go to http://www.urbanmyth.org/microcode - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com> - * Initial release. - * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com> - * Added read() support + cleanups. - * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com> - * Added 'device trimming' support. open(O_WRONLY) zeroes - * and frees the saved copy of applied microcode. - * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com> - * Made to use devfs (/dev/cpu/microcode) + cleanups. - * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com> - * Added misc device support (now uses both devfs and misc). - * Added MICROCODE_IOCFREE ioctl to clear memory. - * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com> - * Messages for error cases (non Intel & no suitable microcode). - * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com> - * Removed ->release(). Removed exclusive open and status bitmap. - * Added microcode_rwsem to serialize read()/write()/ioctl(). - * Removed global kernel lock usage. - * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com> - * Write 0 to 0x8B msr and then cpuid before reading revision, - * so that it works even if there were no update done by the - * BIOS. Otherwise, reading from 0x8B gives junk (which happened - * to be 0 on my machine which is why it worked even when I - * disabled update by the BIOS) - * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix. - * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and - * Tigran Aivazian <tigran@veritas.com> - * Intel Pentium 4 processor support and bugfixes. - * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com> - * Bugfix for HT (Hyper-Threading) enabled processors - * whereby processor resources are shared by all logical processors - * in a single CPU package. - * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and - * Tigran Aivazian <tigran@veritas.com>, - * Serialize updates as required on HT processors due to - * speculative nature of implementation. - * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com> - * Fix the panic when writing zero-length microcode chunk. - * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, - * Jun Nakajima <jun.nakajima@intel.com> - * Support for the microcode updates in the new format. - * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com> - * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl - * because we no longer hold a copy of applied microcode - * in kernel memory. - * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com> - * Fix sigmatch() macro to handle old CPUs with pf == 0. - * Thanks to Stuart Swales for pointing out this bug. + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c index a413a69cbd74..979ca78b1561 100644 --- a/arch/x86/kernel/cpu/microcode/core_early.c +++ b/arch/x86/kernel/cpu/microcode/core_early.c @@ -3,6 +3,7 @@ * * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> * H Peter Anvin" <hpa@zytor.com> + * (C) 2015 Borislav Petkov <bp@alien8.de> * * This driver allows to early upgrade microcode on Intel processors * belonging to IA-32 family - PentiumPro, Pentium II, @@ -17,6 +18,7 @@ * 2 of the License, or (at your option) any later version. */ #include <linux/module.h> +#include <linux/firmware.h> #include <asm/microcode.h> #include <asm/microcode_intel.h> #include <asm/microcode_amd.h> @@ -43,6 +45,25 @@ static bool __init check_loader_disabled_bsp(void) return *res; } +extern struct builtin_fw __start_builtin_fw[]; +extern struct builtin_fw __end_builtin_fw[]; + +bool get_builtin_firmware(struct cpio_data *cd, const char *name) +{ +#ifdef CONFIG_FW_LOADER + struct builtin_fw *b_fw; + + for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) { + if (!strcmp(name, b_fw->name)) { + cd->size = b_fw->size; + cd->data = b_fw->data; + return true; + } + } +#endif + return false; +} + void __init load_ucode_bsp(void) { int vendor, family; @@ -63,7 +84,7 @@ void __init load_ucode_bsp(void) break; case X86_VENDOR_AMD: if (family >= 0x10) - load_ucode_amd_bsp(); + load_ucode_amd_bsp(family); break; default: break; diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index a41beadb3db9..969dc17eb1b4 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -1,74 +1,13 @@ /* - * Intel CPU Microcode Update Driver for Linux + * Intel CPU Microcode Update Driver for Linux * - * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> - * 2006 Shaohua Li <shaohua.li@intel.com> + * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> + * 2006 Shaohua Li <shaohua.li@intel.com> * - * This driver allows to upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, - * Pentium III, Xeon, Pentium 4, etc. - * - * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture - * Software Developer's Manual - * Order Number 253668 or free download from: - * - * http://developer.intel.com/Assets/PDF/manual/253668.pdf - * - * For more information, go to http://www.urbanmyth.org/microcode - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com> - * Initial release. - * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com> - * Added read() support + cleanups. - * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com> - * Added 'device trimming' support. open(O_WRONLY) zeroes - * and frees the saved copy of applied microcode. - * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com> - * Made to use devfs (/dev/cpu/microcode) + cleanups. - * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com> - * Added misc device support (now uses both devfs and misc). - * Added MICROCODE_IOCFREE ioctl to clear memory. - * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com> - * Messages for error cases (non Intel & no suitable microcode). - * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com> - * Removed ->release(). Removed exclusive open and status bitmap. - * Added microcode_rwsem to serialize read()/write()/ioctl(). - * Removed global kernel lock usage. - * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com> - * Write 0 to 0x8B msr and then cpuid before reading revision, - * so that it works even if there were no update done by the - * BIOS. Otherwise, reading from 0x8B gives junk (which happened - * to be 0 on my machine which is why it worked even when I - * disabled update by the BIOS) - * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix. - * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and - * Tigran Aivazian <tigran@veritas.com> - * Intel Pentium 4 processor support and bugfixes. - * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com> - * Bugfix for HT (Hyper-Threading) enabled processors - * whereby processor resources are shared by all logical processors - * in a single CPU package. - * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and - * Tigran Aivazian <tigran@veritas.com>, - * Serialize updates as required on HT processors due to - * speculative nature of implementation. - * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com> - * Fix the panic when writing zero-length microcode chunk. - * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, - * Jun Nakajima <jun.nakajima@intel.com> - * Support for the microcode updates in the new format. - * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com> - * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl - * because we no longer hold a copy of applied microcode - * in kernel memory. - * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com> - * Fix sigmatch() macro to handle old CPUs with pf == 0. - * Thanks to Stuart Swales for pointing out this bug. + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -124,7 +63,7 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu) cpf = cpu_sig.pf; crev = cpu_sig.rev; - return get_matching_microcode(csig, cpf, crev, mc_intel); + return has_newer_microcode(mc_intel, csig, cpf, crev); } static int apply_microcode_intel(int cpu) @@ -226,7 +165,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, csig = uci->cpu_sig.sig; cpf = uci->cpu_sig.pf; - if (get_matching_microcode(csig, cpf, new_rev, mc)) { + if (has_newer_microcode(mc, csig, cpf, new_rev)) { vfree(new_mc); new_rev = mc_header.rev; new_mc = mc; diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c index 2f49ab4ac0ae..10dff3f3f686 100644 --- a/arch/x86/kernel/cpu/microcode/intel_early.c +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -59,10 +59,10 @@ load_microcode_early(struct microcode_intel **saved, ucode_ptr = saved[i]; mc_hdr = (struct microcode_header_intel *)ucode_ptr; - ret = get_matching_microcode(uci->cpu_sig.sig, - uci->cpu_sig.pf, - new_rev, - ucode_ptr); + ret = has_newer_microcode(ucode_ptr, + uci->cpu_sig.sig, + uci->cpu_sig.pf, + new_rev); if (!ret) continue; @@ -246,7 +246,7 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr, unsigned int num_saved) { struct microcode_header_intel *mc_hdr, *mc_saved_hdr; - unsigned int sig, pf, new_rev; + unsigned int sig, pf; int found = 0, i; mc_hdr = (struct microcode_header_intel *)ucode_ptr; @@ -255,14 +255,13 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved, mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i]; sig = mc_saved_hdr->sig; pf = mc_saved_hdr->pf; - new_rev = mc_hdr->rev; - if (!get_matching_sig(sig, pf, new_rev, ucode_ptr)) + if (!find_matching_signature(ucode_ptr, sig, pf)) continue; found = 1; - if (!revision_is_newer(mc_hdr, new_rev)) + if (mc_hdr->rev <= mc_saved_hdr->rev) continue; /* @@ -522,6 +521,23 @@ out: EXPORT_SYMBOL_GPL(save_mc_for_early); #endif +static bool __init load_builtin_intel_microcode(struct cpio_data *cp) +{ + u32 eax = 0x00000001, ebx, ecx = 0, edx; + int family, model, stepping; + char name[30]; + + native_cpuid(&eax, &ebx, &ecx, &edx); + + family = __x86_family(eax); + model = x86_model(eax); + stepping = eax & 0xf; + + sprintf(name, "intel-ucode/%02x-%02x-%02x", family, model, stepping); + + return get_builtin_firmware(cp, name); +} + static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; static __init enum ucode_state scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, @@ -540,8 +556,10 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, cd.size = 0; cd = find_cpio_data(p, (void *)start, size, &offset); - if (!cd.data) - return UCODE_ERROR; + if (!cd.data) { + if (!load_builtin_intel_microcode(&cd)) + return UCODE_ERROR; + } return get_matching_model_microcode(0, start, cd.data, cd.size, mc_saved_data, initrd, uci); diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index cd47a510a3f1..1883d252ff7d 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -31,11 +31,18 @@ #include <asm/processor.h> #include <asm/msr.h> -static inline int -update_match_cpu(unsigned int csig, unsigned int cpf, - unsigned int sig, unsigned int pf) +static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, + unsigned int s2, unsigned int p2) { - return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1; + if (s1 != s2) + return false; + + /* Processor flags are either both 0 ... */ + if (!p1 && !p2) + return true; + + /* ... or they intersect. */ + return p1 & p2; } int microcode_sanity_check(void *mc, int print_err) @@ -124,27 +131,25 @@ EXPORT_SYMBOL_GPL(microcode_sanity_check); /* * Returns 1 if update has been found, 0 otherwise. */ -int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc) +int find_matching_signature(void *mc, unsigned int csig, int cpf) { - struct microcode_header_intel *mc_header = mc; - struct extended_sigtable *ext_header; - unsigned long total_size = get_totalsize(mc_header); - int ext_sigcount, i; + struct microcode_header_intel *mc_hdr = mc; + struct extended_sigtable *ext_hdr; struct extended_signature *ext_sig; + int i; - if (update_match_cpu(csig, cpf, mc_header->sig, mc_header->pf)) + if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) return 1; /* Look for ext. headers: */ - if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) + if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE) return 0; - ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE; - ext_sigcount = ext_header->count; - ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE; + ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; - for (i = 0; i < ext_sigcount; i++) { - if (update_match_cpu(csig, cpf, ext_sig->sig, ext_sig->pf)) + for (i = 0; i < ext_hdr->count; i++) { + if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) return 1; ext_sig++; } @@ -154,13 +159,13 @@ int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc) /* * Returns 1 if update has been found, 0 otherwise. */ -int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc) +int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev) { struct microcode_header_intel *mc_hdr = mc; - if (!revision_is_newer(mc_hdr, rev)) + if (mc_hdr->rev <= new_rev) return 0; - return get_matching_sig(csig, cpf, rev, mc); + return find_matching_signature(mc, csig, cpf); } -EXPORT_SYMBOL_GPL(get_matching_microcode); +EXPORT_SYMBOL_GPL(has_newer_microcode); diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 939155ffdece..aad4bd84b475 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -39,14 +39,12 @@ void hyperv_vector_handler(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); - irq_enter(); - exit_idle(); - + entering_irq(); inc_irq_stat(irq_hv_callback_count); if (vmbus_handler) vmbus_handler(); - irq_exit(); + exiting_irq(); set_irq_regs(old_regs); } diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 7d74f7b3c6ba..5b239679cfc9 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -137,7 +137,7 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) idx = 1 * 8; idx += ((start - 0x80000) >> 14); return mtrr_state.fixed_ranges[idx]; - } else if (start < 0x1000000) { + } else { idx = 3 * 8; idx += ((start - 0xC0000) >> 12); return mtrr_state.fixed_ranges[idx]; diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index 358c54ad20d4..5cbd4e64feb5 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -204,9 +204,8 @@ again: static void rapl_start_hrtimer(struct rapl_pmu *pmu) { - __hrtimer_start_range_ns(&pmu->hrtimer, - pmu->timer_interval, 0, - HRTIMER_MODE_REL_PINNED, 0); + hrtimer_start(&pmu->hrtimer, pmu->timer_interval, + HRTIMER_MODE_REL_PINNED); } static void rapl_stop_hrtimer(struct rapl_pmu *pmu) diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index c635b8b49e93..bf568e1ce12b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -233,9 +233,8 @@ static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) void uncore_pmu_start_hrtimer(struct intel_uncore_box *box) { - __hrtimer_start_range_ns(&box->hrtimer, - ns_to_ktime(box->hrtimer_duration), 0, - HRTIMER_MODE_REL_PINNED, 0); + hrtimer_start(&box->hrtimer, ns_to_ktime(box->hrtimer_duration), + HRTIMER_MODE_REL_PINNED); } void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) @@ -922,6 +921,9 @@ static int __init uncore_pci_init(void) case 69: /* Haswell Celeron */ ret = hsw_uncore_pci_init(); break; + case 61: /* Broadwell */ + ret = bdw_uncore_pci_init(); + break; default: return 0; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 6c8c1e7e69d8..06b07930e48b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -326,6 +326,7 @@ extern struct event_constraint uncore_constraint_empty; int snb_uncore_pci_init(void); int ivb_uncore_pci_init(void); int hsw_uncore_pci_init(void); +int bdw_uncore_pci_init(void); void snb_uncore_cpu_init(void); void nhm_uncore_cpu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c index 4562e9e22c60..b005a78c7012 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c @@ -7,6 +7,7 @@ #define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150 #define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00 #define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04 +#define PCI_DEVICE_ID_INTEL_BDW_IMC 0x1604 /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff @@ -486,6 +487,14 @@ static const struct pci_device_id hsw_uncore_pci_ids[] = { { /* end: all zeroes */ }, }; +static const struct pci_device_id bdw_uncore_pci_ids[] = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + static struct pci_driver snb_uncore_pci_driver = { .name = "snb_uncore", .id_table = snb_uncore_pci_ids, @@ -501,6 +510,11 @@ static struct pci_driver hsw_uncore_pci_driver = { .id_table = hsw_uncore_pci_ids, }; +static struct pci_driver bdw_uncore_pci_driver = { + .name = "bdw_uncore", + .id_table = bdw_uncore_pci_ids, +}; + struct imc_uncore_pci_dev { __u32 pci_id; struct pci_driver *driver; @@ -514,6 +528,7 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */ IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */ IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core ULT Mobile Processor */ + IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver), /* 5th Gen Core U */ { /* end marker */ } }; @@ -561,6 +576,11 @@ int hsw_uncore_pci_init(void) return imc_uncore_pci_init(); } +int bdw_uncore_pci_init(void) +{ + return imc_uncore_pci_init(); +} + /* end of Sandy Bridge uncore support */ /* Nehalem uncore support */ diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 6367a780cc8c..5ee771859b6f 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -4,7 +4,6 @@ #include <linux/bootmem.h> #include <linux/export.h> #include <linux/io.h> -#include <linux/irqdomain.h> #include <linux/interrupt.h> #include <linux/list.h> #include <linux/of.h> @@ -17,6 +16,7 @@ #include <linux/of_pci.h> #include <linux/initrd.h> +#include <asm/irqdomain.h> #include <asm/hpet.h> #include <asm/apic.h> #include <asm/pci_x86.h> @@ -196,38 +196,31 @@ static struct of_ioapic_type of_ioapic_type[] = }, }; -static int ioapic_xlate(struct irq_domain *domain, - struct device_node *controller, - const u32 *intspec, u32 intsize, - irq_hw_number_t *out_hwirq, u32 *out_type) +static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) { + struct of_phandle_args *irq_data = (void *)arg; struct of_ioapic_type *it; - u32 line, idx, gsi; + struct irq_alloc_info tmp; - if (WARN_ON(intsize < 2)) + if (WARN_ON(irq_data->args_count < 2)) return -EINVAL; - - line = intspec[0]; - - if (intspec[1] >= ARRAY_SIZE(of_ioapic_type)) + if (irq_data->args[1] >= ARRAY_SIZE(of_ioapic_type)) return -EINVAL; - it = &of_ioapic_type[intspec[1]]; + it = &of_ioapic_type[irq_data->args[1]]; + ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->trigger, it->polarity); + tmp.ioapic_id = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain)); + tmp.ioapic_pin = irq_data->args[0]; - idx = (u32)(long)domain->host_data; - gsi = mp_pin_to_gsi(idx, line); - if (mp_set_gsi_attr(gsi, it->trigger, it->polarity, cpu_to_node(0))) - return -EBUSY; - - *out_hwirq = line; - *out_type = it->out_type; - return 0; + return mp_irqdomain_alloc(domain, virq, nr_irqs, &tmp); } -const struct irq_domain_ops ioapic_irq_domain_ops = { - .map = mp_irqdomain_map, - .unmap = mp_irqdomain_unmap, - .xlate = ioapic_xlate, +static const struct irq_domain_ops ioapic_irq_domain_ops = { + .alloc = dt_irqdomain_alloc, + .free = mp_irqdomain_free, + .activate = mp_irqdomain_activate, + .deactivate = mp_irqdomain_deactivate, }; static void __init dtb_add_ioapic(struct device_node *dn) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 02c2eff7478d..0395a59f67c4 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -216,7 +216,7 @@ ENTRY(system_call) GLOBAL(system_call_after_swapgs) movq %rsp,PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(kernel_stack),%rsp + movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp /* Construct struct pt_regs on stack */ pushq_cfi $__USER_DS /* pt_regs->ss */ @@ -419,26 +419,27 @@ syscall_return: * a completely clean 64-bit userspace context. */ movq RCX(%rsp),%rcx - cmpq %rcx,RIP(%rsp) /* RCX == RIP */ + movq RIP(%rsp),%r11 + cmpq %rcx,%r11 /* RCX == RIP */ jne opportunistic_sysret_failed /* * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP * in kernel space. This essentially lets the user take over - * the kernel, since userspace controls RSP. It's not worth - * testing for canonicalness exactly -- this check detects any - * of the 17 high bits set, which is true for non-canonical - * or kernel addresses. (This will pessimize vsyscall=native. - * Big deal.) + * the kernel, since userspace controls RSP. * - * If virtual addresses ever become wider, this will need + * If width of "canonical tail" ever becomes variable, this will need * to be updated to remain correct on both old and new CPUs. */ .ifne __VIRTUAL_MASK_SHIFT - 47 .error "virtual address width changed -- SYSRET checks need update" .endif - shr $__VIRTUAL_MASK_SHIFT, %rcx - jnz opportunistic_sysret_failed + /* Change top 16 bits to be the sign-extension of 47th bit */ + shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx + sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx + /* If this changed %rcx, it was not canonical */ + cmpq %rcx, %r11 + jne opportunistic_sysret_failed cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */ jne opportunistic_sysret_failed @@ -475,8 +476,8 @@ syscall_return: */ syscall_return_via_sysret: CFI_REMEMBER_STATE - /* r11 is already restored (see code above) */ - RESTORE_C_REGS_EXCEPT_R11 + /* rcx and r11 are already restored (see code above) */ + RESTORE_C_REGS_EXCEPT_RCX_R11 movq RSP(%rsp),%rsp USERGS_SYSRET64 CFI_RESTORE_STATE @@ -533,40 +534,27 @@ GLOBAL(stub_execveat) CFI_ENDPROC END(stub_execveat) -#ifdef CONFIG_X86_X32_ABI +#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) .align 8 GLOBAL(stub_x32_execve) +GLOBAL(stub32_execve) CFI_STARTPROC DEFAULT_FRAME 0, 8 call compat_sys_execve jmp return_from_execve CFI_ENDPROC +END(stub32_execve) END(stub_x32_execve) .align 8 GLOBAL(stub_x32_execveat) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 - call compat_sys_execveat - jmp return_from_execve - CFI_ENDPROC -END(stub_x32_execveat) -#endif - -#ifdef CONFIG_IA32_EMULATION - .align 8 -GLOBAL(stub32_execve) - CFI_STARTPROC - call compat_sys_execve - jmp return_from_execve - CFI_ENDPROC -END(stub32_execve) - .align 8 GLOBAL(stub32_execveat) CFI_STARTPROC + DEFAULT_FRAME 0, 8 call compat_sys_execveat jmp return_from_execve CFI_ENDPROC END(stub32_execveat) +END(stub_x32_execveat) #endif /* @@ -622,7 +610,7 @@ ENTRY(ret_from_fork) RESTORE_EXTRA_REGS - testl $3,CS(%rsp) # from kernel_thread? + testb $3, CS(%rsp) # from kernel_thread? /* * By the time we get here, we have no idea whether our pt_regs, @@ -686,8 +674,8 @@ END(irq_entries_start) leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */ - testl $3, CS-RBP(%rsp) - je 1f + testb $3, CS-RBP(%rsp) + jz 1f SWAPGS 1: /* @@ -741,8 +729,8 @@ ret_from_intr: CFI_DEF_CFA_REGISTER rsp CFI_ADJUST_CFA_OFFSET RBP - testl $3,CS(%rsp) - je retint_kernel + testb $3, CS(%rsp) + jz retint_kernel /* Interrupt came from user space */ GET_THREAD_INFO(%rcx) @@ -928,6 +916,8 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \ #ifdef CONFIG_HAVE_KVM apicinterrupt3 POSTED_INTR_VECTOR \ kvm_posted_intr_ipi smp_kvm_posted_intr_ipi +apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR \ + kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi #endif #ifdef CONFIG_X86_MCE_THRESHOLD @@ -935,6 +925,11 @@ apicinterrupt THRESHOLD_APIC_VECTOR \ threshold_interrupt smp_threshold_interrupt #endif +#ifdef CONFIG_X86_MCE_AMD +apicinterrupt DEFERRED_ERROR_VECTOR \ + deferred_error_interrupt smp_deferred_error_interrupt +#endif + #ifdef CONFIG_X86_THERMAL_VECTOR apicinterrupt THERMAL_APIC_VECTOR \ thermal_interrupt smp_thermal_interrupt @@ -989,7 +984,7 @@ ENTRY(\sym) .if \paranoid .if \paranoid == 1 CFI_REMEMBER_STATE - testl $3, CS(%rsp) /* If coming from userspace, switch */ + testb $3, CS(%rsp) /* If coming from userspace, switch */ jnz 1f /* stacks. */ .endif call paranoid_entry @@ -1202,17 +1197,17 @@ ENTRY(xen_failsafe_callback) /*CFI_REL_OFFSET ds,DS*/ CFI_REL_OFFSET r11,8 CFI_REL_OFFSET rcx,0 - movw %ds,%cx + movl %ds,%ecx cmpw %cx,0x10(%rsp) CFI_REMEMBER_STATE jne 1f - movw %es,%cx + movl %es,%ecx cmpw %cx,0x18(%rsp) jne 1f - movw %fs,%cx + movl %fs,%ecx cmpw %cx,0x20(%rsp) jne 1f - movw %gs,%cx + movl %gs,%ecx cmpw %cx,0x28(%rsp) jne 1f /* All segments match their saved values => Category 2 (Bad IRET). */ @@ -1330,8 +1325,8 @@ ENTRY(error_entry) SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 xorl %ebx,%ebx - testl $3,CS+8(%rsp) - je error_kernelspace + testb $3, CS+8(%rsp) + jz error_kernelspace error_swapgs: SWAPGS error_sti: @@ -1382,7 +1377,7 @@ ENTRY(error_exit) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) testl %eax,%eax - jne retint_kernel + jnz retint_kernel LOCKDEP_SYS_EXIT_IRQ movl TI_flags(%rcx),%edx movl $_TIF_WORK_MASK,%edi @@ -1627,7 +1622,6 @@ end_repeat_nmi: je 1f movq %r12, %cr2 1: - testl %ebx,%ebx /* swapgs needed? */ jnz nmi_restore nmi_swapgs: diff --git a/arch/x86/kernel/fpu/Makefile b/arch/x86/kernel/fpu/Makefile new file mode 100644 index 000000000000..68279efb811a --- /dev/null +++ b/arch/x86/kernel/fpu/Makefile @@ -0,0 +1,5 @@ +# +# Build rules for the FPU support code: +# + +obj-y += init.o bugs.o core.o regset.o signal.o xstate.o diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c new file mode 100644 index 000000000000..dd9ca9b60ff3 --- /dev/null +++ b/arch/x86/kernel/fpu/bugs.c @@ -0,0 +1,71 @@ +/* + * x86 FPU bug checks: + */ +#include <asm/fpu/internal.h> + +/* + * Boot time CPU/FPU FDIV bug detection code: + */ + +static double __initdata x = 4195835.0; +static double __initdata y = 3145727.0; + +/* + * This used to check for exceptions.. + * However, it turns out that to support that, + * the XMM trap handlers basically had to + * be buggy. So let's have a correct XMM trap + * handler, and forget about printing out + * some status at boot. + * + * We should really only care about bugs here + * anyway. Not features. + */ +static void __init check_fpu(void) +{ + u32 cr0_saved; + s32 fdiv_bug; + + /* We might have CR0::TS set already, clear it: */ + cr0_saved = read_cr0(); + write_cr0(cr0_saved & ~X86_CR0_TS); + + kernel_fpu_begin(); + + /* + * trap_init() enabled FXSR and company _before_ testing for FP + * problems here. + * + * Test for the divl bug: http://en.wikipedia.org/wiki/Fdiv_bug + */ + __asm__("fninit\n\t" + "fldl %1\n\t" + "fdivl %2\n\t" + "fmull %2\n\t" + "fldl %1\n\t" + "fsubp %%st,%%st(1)\n\t" + "fistpl %0\n\t" + "fwait\n\t" + "fninit" + : "=m" (*&fdiv_bug) + : "m" (*&x), "m" (*&y)); + + kernel_fpu_end(); + + write_cr0(cr0_saved); + + if (fdiv_bug) { + set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV); + pr_warn("Hmm, FPU with FDIV bug\n"); + } +} + +void __init fpu__init_check_bugs(void) +{ + /* + * kernel_fpu_begin/end() in check_fpu() relies on the patched + * alternative instructions. + */ + if (cpu_has_fpu) + check_fpu(); +} diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c new file mode 100644 index 000000000000..01a15503c3be --- /dev/null +++ b/arch/x86/kernel/fpu/core.c @@ -0,0 +1,509 @@ +/* + * Copyright (C) 1994 Linus Torvalds + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes <gareth@valinux.com>, May 2000 + */ +#include <asm/fpu/internal.h> +#include <asm/fpu/regset.h> +#include <asm/fpu/signal.h> +#include <asm/traps.h> + +#include <linux/hardirq.h> + +/* + * Represents the initial FPU state. It's mostly (but not completely) zeroes, + * depending on the FPU hardware format: + */ +union fpregs_state init_fpstate __read_mostly; + +/* + * Track whether the kernel is using the FPU state + * currently. + * + * This flag is used: + * + * - by IRQ context code to potentially use the FPU + * if it's unused. + * + * - to debug kernel_fpu_begin()/end() correctness + */ +static DEFINE_PER_CPU(bool, in_kernel_fpu); + +/* + * Track which context is using the FPU on the CPU: + */ +DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); + +static void kernel_fpu_disable(void) +{ + WARN_ON_FPU(this_cpu_read(in_kernel_fpu)); + this_cpu_write(in_kernel_fpu, true); +} + +static void kernel_fpu_enable(void) +{ + WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); + this_cpu_write(in_kernel_fpu, false); +} + +static bool kernel_fpu_disabled(void) +{ + return this_cpu_read(in_kernel_fpu); +} + +/* + * Were we in an interrupt that interrupted kernel mode? + * + * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that + * pair does nothing at all: the thread must not have fpu (so + * that we don't try to save the FPU state), and TS must + * be set (so that the clts/stts pair does nothing that is + * visible in the interrupted kernel thread). + * + * Except for the eagerfpu case when we return true; in the likely case + * the thread has FPU but we are not going to set/clear TS. + */ +static bool interrupted_kernel_fpu_idle(void) +{ + if (kernel_fpu_disabled()) + return false; + + if (use_eager_fpu()) + return true; + + return !current->thread.fpu.fpregs_active && (read_cr0() & X86_CR0_TS); +} + +/* + * Were we in user mode (or vm86 mode) when we were + * interrupted? + * + * Doing kernel_fpu_begin/end() is ok if we are running + * in an interrupt context from user mode - we'll just + * save the FPU state as required. + */ +static bool interrupted_user_mode(void) +{ + struct pt_regs *regs = get_irq_regs(); + return regs && user_mode(regs); +} + +/* + * Can we use the FPU in kernel mode with the + * whole "kernel_fpu_begin/end()" sequence? + * + * It's always ok in process context (ie "not interrupt") + * but it is sometimes ok even from an irq. + */ +bool irq_fpu_usable(void) +{ + return !in_interrupt() || + interrupted_user_mode() || + interrupted_kernel_fpu_idle(); +} +EXPORT_SYMBOL(irq_fpu_usable); + +void __kernel_fpu_begin(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + + WARN_ON_FPU(!irq_fpu_usable()); + + kernel_fpu_disable(); + + if (fpu->fpregs_active) { + copy_fpregs_to_fpstate(fpu); + } else { + this_cpu_write(fpu_fpregs_owner_ctx, NULL); + __fpregs_activate_hw(); + } +} +EXPORT_SYMBOL(__kernel_fpu_begin); + +void __kernel_fpu_end(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + + if (fpu->fpregs_active) { + if (WARN_ON_FPU(copy_fpstate_to_fpregs(fpu))) + fpu__clear(fpu); + } else { + __fpregs_deactivate_hw(); + } + + kernel_fpu_enable(); +} +EXPORT_SYMBOL(__kernel_fpu_end); + +void kernel_fpu_begin(void) +{ + preempt_disable(); + __kernel_fpu_begin(); +} +EXPORT_SYMBOL_GPL(kernel_fpu_begin); + +void kernel_fpu_end(void) +{ + __kernel_fpu_end(); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(kernel_fpu_end); + +/* + * CR0::TS save/restore functions: + */ +int irq_ts_save(void) +{ + /* + * If in process context and not atomic, we can take a spurious DNA fault. + * Otherwise, doing clts() in process context requires disabling preemption + * or some heavy lifting like kernel_fpu_begin() + */ + if (!in_atomic()) + return 0; + + if (read_cr0() & X86_CR0_TS) { + clts(); + return 1; + } + + return 0; +} +EXPORT_SYMBOL_GPL(irq_ts_save); + +void irq_ts_restore(int TS_state) +{ + if (TS_state) + stts(); +} +EXPORT_SYMBOL_GPL(irq_ts_restore); + +/* + * Save the FPU state (mark it for reload if necessary): + * + * This only ever gets called for the current task. + */ +void fpu__save(struct fpu *fpu) +{ + WARN_ON_FPU(fpu != ¤t->thread.fpu); + + preempt_disable(); + if (fpu->fpregs_active) { + if (!copy_fpregs_to_fpstate(fpu)) + fpregs_deactivate(fpu); + } + preempt_enable(); +} +EXPORT_SYMBOL_GPL(fpu__save); + +/* + * Legacy x87 fpstate state init: + */ +static inline void fpstate_init_fstate(struct fregs_state *fp) +{ + fp->cwd = 0xffff037fu; + fp->swd = 0xffff0000u; + fp->twd = 0xffffffffu; + fp->fos = 0xffff0000u; +} + +void fpstate_init(union fpregs_state *state) +{ + if (!cpu_has_fpu) { + fpstate_init_soft(&state->soft); + return; + } + + memset(state, 0, xstate_size); + + if (cpu_has_fxsr) + fpstate_init_fxstate(&state->fxsave); + else + fpstate_init_fstate(&state->fsave); +} +EXPORT_SYMBOL_GPL(fpstate_init); + +/* + * Copy the current task's FPU state to a new task's FPU context. + * + * In both the 'eager' and the 'lazy' case we save hardware registers + * directly to the destination buffer. + */ +static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) +{ + WARN_ON_FPU(src_fpu != ¤t->thread.fpu); + + /* + * Don't let 'init optimized' areas of the XSAVE area + * leak into the child task: + */ + if (use_eager_fpu()) + memset(&dst_fpu->state.xsave, 0, xstate_size); + + /* + * Save current FPU registers directly into the child + * FPU context, without any memory-to-memory copying. + * + * If the FPU context got destroyed in the process (FNSAVE + * done on old CPUs) then copy it back into the source + * context and mark the current task for lazy restore. + * + * We have to do all this with preemption disabled, + * mostly because of the FNSAVE case, because in that + * case we must not allow preemption in the window + * between the FNSAVE and us marking the context lazy. + * + * It shouldn't be an issue as even FNSAVE is plenty + * fast in terms of critical section length. + */ + preempt_disable(); + if (!copy_fpregs_to_fpstate(dst_fpu)) { + memcpy(&src_fpu->state, &dst_fpu->state, xstate_size); + fpregs_deactivate(src_fpu); + } + preempt_enable(); +} + +int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) +{ + dst_fpu->counter = 0; + dst_fpu->fpregs_active = 0; + dst_fpu->last_cpu = -1; + + if (src_fpu->fpstate_active) + fpu_copy(dst_fpu, src_fpu); + + return 0; +} + +/* + * Activate the current task's in-memory FPU context, + * if it has not been used before: + */ +void fpu__activate_curr(struct fpu *fpu) +{ + WARN_ON_FPU(fpu != ¤t->thread.fpu); + + if (!fpu->fpstate_active) { + fpstate_init(&fpu->state); + + /* Safe to do for the current task: */ + fpu->fpstate_active = 1; + } +} +EXPORT_SYMBOL_GPL(fpu__activate_curr); + +/* + * This function must be called before we modify a stopped child's + * fpstate. + * + * If the child has not used the FPU before then initialize its + * fpstate. + * + * If the child has used the FPU before then unlazy it. + * + * [ After this function call, after registers in the fpstate are + * modified and the child task has woken up, the child task will + * restore the modified FPU state from the modified context. If we + * didn't clear its lazy status here then the lazy in-registers + * state pending on its former CPU could be restored, corrupting + * the modifications. ] + * + * This function is also called before we read a stopped child's + * FPU state - to make sure it's initialized if the child has + * no active FPU state. + * + * TODO: A future optimization would be to skip the unlazying in + * the read-only case, it's not strictly necessary for + * read-only access to the context. + */ +void fpu__activate_stopped(struct fpu *child_fpu) +{ + WARN_ON_FPU(child_fpu == ¤t->thread.fpu); + + if (child_fpu->fpstate_active) { + child_fpu->last_cpu = -1; + } else { + fpstate_init(&child_fpu->state); + + /* Safe to do for stopped child tasks: */ + child_fpu->fpstate_active = 1; + } +} + +/* + * 'fpu__restore()' is called to copy FPU registers from + * the FPU fpstate to the live hw registers and to activate + * access to the hardware registers, so that FPU instructions + * can be used afterwards. + * + * Must be called with kernel preemption disabled (for example + * with local interrupts disabled, as it is in the case of + * do_device_not_available()). + */ +void fpu__restore(struct fpu *fpu) +{ + fpu__activate_curr(fpu); + + /* Avoid __kernel_fpu_begin() right after fpregs_activate() */ + kernel_fpu_disable(); + fpregs_activate(fpu); + if (unlikely(copy_fpstate_to_fpregs(fpu))) { + fpu__clear(fpu); + force_sig_info(SIGSEGV, SEND_SIG_PRIV, current); + } else { + fpu->counter++; + } + kernel_fpu_enable(); +} +EXPORT_SYMBOL_GPL(fpu__restore); + +/* + * Drops current FPU state: deactivates the fpregs and + * the fpstate. NOTE: it still leaves previous contents + * in the fpregs in the eager-FPU case. + * + * This function can be used in cases where we know that + * a state-restore is coming: either an explicit one, + * or a reschedule. + */ +void fpu__drop(struct fpu *fpu) +{ + preempt_disable(); + fpu->counter = 0; + + if (fpu->fpregs_active) { + /* Ignore delayed exceptions from user space */ + asm volatile("1: fwait\n" + "2:\n" + _ASM_EXTABLE(1b, 2b)); + fpregs_deactivate(fpu); + } + + fpu->fpstate_active = 0; + + preempt_enable(); +} + +/* + * Clear FPU registers by setting them up from + * the init fpstate: + */ +static inline void copy_init_fpstate_to_fpregs(void) +{ + if (use_xsave()) + copy_kernel_to_xregs(&init_fpstate.xsave, -1); + else + copy_kernel_to_fxregs(&init_fpstate.fxsave); +} + +/* + * Clear the FPU state back to init state. + * + * Called by sys_execve(), by the signal handler code and by various + * error paths. + */ +void fpu__clear(struct fpu *fpu) +{ + WARN_ON_FPU(fpu != ¤t->thread.fpu); /* Almost certainly an anomaly */ + + if (!use_eager_fpu()) { + /* FPU state will be reallocated lazily at the first use. */ + fpu__drop(fpu); + } else { + if (!fpu->fpstate_active) { + fpu__activate_curr(fpu); + user_fpu_begin(); + } + copy_init_fpstate_to_fpregs(); + } +} + +/* + * x87 math exception handling: + */ + +static inline unsigned short get_fpu_cwd(struct fpu *fpu) +{ + if (cpu_has_fxsr) { + return fpu->state.fxsave.cwd; + } else { + return (unsigned short)fpu->state.fsave.cwd; + } +} + +static inline unsigned short get_fpu_swd(struct fpu *fpu) +{ + if (cpu_has_fxsr) { + return fpu->state.fxsave.swd; + } else { + return (unsigned short)fpu->state.fsave.swd; + } +} + +static inline unsigned short get_fpu_mxcsr(struct fpu *fpu) +{ + if (cpu_has_xmm) { + return fpu->state.fxsave.mxcsr; + } else { + return MXCSR_DEFAULT; + } +} + +int fpu__exception_code(struct fpu *fpu, int trap_nr) +{ + int err; + + if (trap_nr == X86_TRAP_MF) { + unsigned short cwd, swd; + /* + * (~cwd & swd) will mask out exceptions that are not set to unmasked + * status. 0x3f is the exception bits in these regs, 0x200 is the + * C1 reg you need in case of a stack fault, 0x040 is the stack + * fault bit. We should only be taking one exception at a time, + * so if this combination doesn't produce any single exception, + * then we have a bad program that isn't synchronizing its FPU usage + * and it will suffer the consequences since we won't be able to + * fully reproduce the context of the exception + */ + cwd = get_fpu_cwd(fpu); + swd = get_fpu_swd(fpu); + + err = swd & ~cwd; + } else { + /* + * The SIMD FPU exceptions are handled a little differently, as there + * is only a single status/control register. Thus, to determine which + * unmasked exception was caught we must mask the exception mask bits + * at 0x1f80, and then use these to mask the exception bits at 0x3f. + */ + unsigned short mxcsr = get_fpu_mxcsr(fpu); + err = ~(mxcsr >> 7) & mxcsr; + } + + if (err & 0x001) { /* Invalid op */ + /* + * swd & 0x240 == 0x040: Stack Underflow + * swd & 0x240 == 0x240: Stack Overflow + * User must clear the SF bit (0x40) if set + */ + return FPE_FLTINV; + } else if (err & 0x004) { /* Divide by Zero */ + return FPE_FLTDIV; + } else if (err & 0x008) { /* Overflow */ + return FPE_FLTOVF; + } else if (err & 0x012) { /* Denormal, Underflow */ + return FPE_FLTUND; + } else if (err & 0x020) { /* Precision */ + return FPE_FLTRES; + } + + /* + * If we're using IRQ 13, or supposedly even some trap + * X86_TRAP_MF implementations, it's possible + * we get a spurious trap, which is not an error. + */ + return 0; +} diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c new file mode 100644 index 000000000000..fc878fee6a51 --- /dev/null +++ b/arch/x86/kernel/fpu/init.c @@ -0,0 +1,354 @@ +/* + * x86 FPU boot time init code: + */ +#include <asm/fpu/internal.h> +#include <asm/tlbflush.h> + +/* + * Initialize the TS bit in CR0 according to the style of context-switches + * we are using: + */ +static void fpu__init_cpu_ctx_switch(void) +{ + if (!cpu_has_eager_fpu) + stts(); + else + clts(); +} + +/* + * Initialize the registers found in all CPUs, CR0 and CR4: + */ +static void fpu__init_cpu_generic(void) +{ + unsigned long cr0; + unsigned long cr4_mask = 0; + + if (cpu_has_fxsr) + cr4_mask |= X86_CR4_OSFXSR; + if (cpu_has_xmm) + cr4_mask |= X86_CR4_OSXMMEXCPT; + if (cr4_mask) + cr4_set_bits(cr4_mask); + + cr0 = read_cr0(); + cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ + if (!cpu_has_fpu) + cr0 |= X86_CR0_EM; + write_cr0(cr0); + + /* Flush out any pending x87 state: */ + asm volatile ("fninit"); +} + +/* + * Enable all supported FPU features. Called when a CPU is brought online: + */ +void fpu__init_cpu(void) +{ + fpu__init_cpu_generic(); + fpu__init_cpu_xstate(); + fpu__init_cpu_ctx_switch(); +} + +/* + * The earliest FPU detection code. + * + * Set the X86_FEATURE_FPU CPU-capability bit based on + * trying to execute an actual sequence of FPU instructions: + */ +static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) +{ + unsigned long cr0; + u16 fsw, fcw; + + fsw = fcw = 0xffff; + + cr0 = read_cr0(); + cr0 &= ~(X86_CR0_TS | X86_CR0_EM); + write_cr0(cr0); + + asm volatile("fninit ; fnstsw %0 ; fnstcw %1" + : "+m" (fsw), "+m" (fcw)); + + if (fsw == 0 && (fcw & 0x103f) == 0x003f) + set_cpu_cap(c, X86_FEATURE_FPU); + else + clear_cpu_cap(c, X86_FEATURE_FPU); + +#ifndef CONFIG_MATH_EMULATION + if (!cpu_has_fpu) { + pr_emerg("x86/fpu: Giving up, no FPU found and no math emulation present\n"); + for (;;) + asm volatile("hlt"); + } +#endif +} + +/* + * Boot time FPU feature detection code: + */ +unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; + +static void __init fpu__init_system_mxcsr(void) +{ + unsigned int mask = 0; + + if (cpu_has_fxsr) { + struct fxregs_state fx_tmp __aligned(32) = { }; + + asm volatile("fxsave %0" : "+m" (fx_tmp)); + + mask = fx_tmp.mxcsr_mask; + + /* + * If zero then use the default features mask, + * which has all features set, except the + * denormals-are-zero feature bit: + */ + if (mask == 0) + mask = 0x0000ffbf; + } + mxcsr_feature_mask &= mask; +} + +/* + * Once per bootup FPU initialization sequences that will run on most x86 CPUs: + */ +static void __init fpu__init_system_generic(void) +{ + /* + * Set up the legacy init FPU context. (xstate init might overwrite this + * with a more modern format, if the CPU supports it.) + */ + fpstate_init_fxstate(&init_fpstate.fxsave); + + fpu__init_system_mxcsr(); +} + +/* + * Size of the FPU context state. All tasks in the system use the + * same context size, regardless of what portion they use. + * This is inherent to the XSAVE architecture which puts all state + * components into a single, continuous memory block: + */ +unsigned int xstate_size; +EXPORT_SYMBOL_GPL(xstate_size); + +/* + * Set up the xstate_size based on the legacy FPU context size. + * + * We set this up first, and later it will be overwritten by + * fpu__init_system_xstate() if the CPU knows about xstates. + */ +static void __init fpu__init_system_xstate_size_legacy(void) +{ + static int on_boot_cpu = 1; + + WARN_ON_FPU(!on_boot_cpu); + on_boot_cpu = 0; + + /* + * Note that xstate_size might be overwriten later during + * fpu__init_system_xstate(). + */ + + if (!cpu_has_fpu) { + /* + * Disable xsave as we do not support it if i387 + * emulation is enabled. + */ + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + xstate_size = sizeof(struct swregs_state); + } else { + if (cpu_has_fxsr) + xstate_size = sizeof(struct fxregs_state); + else + xstate_size = sizeof(struct fregs_state); + } + /* + * Quirk: we don't yet handle the XSAVES* instructions + * correctly, as we don't correctly convert between + * standard and compacted format when interfacing + * with user-space - so disable it for now. + * + * The difference is small: with recent CPUs the + * compacted format is only marginally smaller than + * the standard FPU state format. + * + * ( This is easy to backport while we are fixing + * XSAVES* support. ) + */ + setup_clear_cpu_cap(X86_FEATURE_XSAVES); +} + +/* + * FPU context switching strategies: + * + * Against popular belief, we don't do lazy FPU saves, due to the + * task migration complications it brings on SMP - we only do + * lazy FPU restores. + * + * 'lazy' is the traditional strategy, which is based on setting + * CR0::TS to 1 during context-switch (instead of doing a full + * restore of the FPU state), which causes the first FPU instruction + * after the context switch (whenever it is executed) to fault - at + * which point we lazily restore the FPU state into FPU registers. + * + * Tasks are of course under no obligation to execute FPU instructions, + * so it can easily happen that another context-switch occurs without + * a single FPU instruction being executed. If we eventually switch + * back to the original task (that still owns the FPU) then we have + * not only saved the restores along the way, but we also have the + * FPU ready to be used for the original task. + * + * 'eager' switching is used on modern CPUs, there we switch the FPU + * state during every context switch, regardless of whether the task + * has used FPU instructions in that time slice or not. This is done + * because modern FPU context saving instructions are able to optimize + * state saving and restoration in hardware: they can detect both + * unused and untouched FPU state and optimize accordingly. + * + * [ Note that even in 'lazy' mode we might optimize context switches + * to use 'eager' restores, if we detect that a task is using the FPU + * frequently. See the fpu->counter logic in fpu/internal.h for that. ] + */ +static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; + +static int __init eager_fpu_setup(char *s) +{ + if (!strcmp(s, "on")) + eagerfpu = ENABLE; + else if (!strcmp(s, "off")) + eagerfpu = DISABLE; + else if (!strcmp(s, "auto")) + eagerfpu = AUTO; + return 1; +} +__setup("eagerfpu=", eager_fpu_setup); + +/* + * Pick the FPU context switching strategy: + */ +static void __init fpu__init_system_ctx_switch(void) +{ + static bool on_boot_cpu = 1; + + WARN_ON_FPU(!on_boot_cpu); + on_boot_cpu = 0; + + WARN_ON_FPU(current->thread.fpu.fpstate_active); + current_thread_info()->status = 0; + + /* Auto enable eagerfpu for xsaveopt */ + if (cpu_has_xsaveopt && eagerfpu != DISABLE) + eagerfpu = ENABLE; + + if (xfeatures_mask & XSTATE_EAGER) { + if (eagerfpu == DISABLE) { + pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n", + xfeatures_mask & XSTATE_EAGER); + xfeatures_mask &= ~XSTATE_EAGER; + } else { + eagerfpu = ENABLE; + } + } + + if (eagerfpu == ENABLE) + setup_force_cpu_cap(X86_FEATURE_EAGER_FPU); + + printk(KERN_INFO "x86/fpu: Using '%s' FPU context switches.\n", eagerfpu == ENABLE ? "eager" : "lazy"); +} + +/* + * Called on the boot CPU once per system bootup, to set up the initial + * FPU state that is later cloned into all processes: + */ +void __init fpu__init_system(struct cpuinfo_x86 *c) +{ + fpu__init_system_early_generic(c); + + /* + * The FPU has to be operational for some of the + * later FPU init activities: + */ + fpu__init_cpu(); + + /* + * But don't leave CR0::TS set yet, as some of the FPU setup + * methods depend on being able to execute FPU instructions + * that will fault on a set TS, such as the FXSAVE in + * fpu__init_system_mxcsr(). + */ + clts(); + + fpu__init_system_generic(); + fpu__init_system_xstate_size_legacy(); + fpu__init_system_xstate(); + + fpu__init_system_ctx_switch(); +} + +/* + * Boot parameter to turn off FPU support and fall back to math-emu: + */ +static int __init no_387(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_FPU); + return 1; +} +__setup("no387", no_387); + +/* + * Disable all xstate CPU features: + */ +static int __init x86_noxsave_setup(char *s) +{ + if (strlen(s)) + return 0; + + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + setup_clear_cpu_cap(X86_FEATURE_XSAVES); + setup_clear_cpu_cap(X86_FEATURE_AVX); + setup_clear_cpu_cap(X86_FEATURE_AVX2); + + return 1; +} +__setup("noxsave", x86_noxsave_setup); + +/* + * Disable the XSAVEOPT instruction specifically: + */ +static int __init x86_noxsaveopt_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + + return 1; +} +__setup("noxsaveopt", x86_noxsaveopt_setup); + +/* + * Disable the XSAVES instruction: + */ +static int __init x86_noxsaves_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_XSAVES); + + return 1; +} +__setup("noxsaves", x86_noxsaves_setup); + +/* + * Disable FX save/restore and SSE support: + */ +static int __init x86_nofxsr_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_FXSR); + setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT); + setup_clear_cpu_cap(X86_FEATURE_XMM); + + return 1; +} +__setup("nofxsr", x86_nofxsr_setup); diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c new file mode 100644 index 000000000000..297b3da8e4c4 --- /dev/null +++ b/arch/x86/kernel/fpu/regset.c @@ -0,0 +1,356 @@ +/* + * FPU register's regset abstraction, for ptrace, core dumps, etc. + */ +#include <asm/fpu/internal.h> +#include <asm/fpu/signal.h> +#include <asm/fpu/regset.h> + +/* + * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, + * as the "regset->n" for the xstate regset will be updated based on the feature + * capabilites supported by the xsave. + */ +int regset_fpregs_active(struct task_struct *target, const struct user_regset *regset) +{ + struct fpu *target_fpu = &target->thread.fpu; + + return target_fpu->fpstate_active ? regset->n : 0; +} + +int regset_xregset_fpregs_active(struct task_struct *target, const struct user_regset *regset) +{ + struct fpu *target_fpu = &target->thread.fpu; + + return (cpu_has_fxsr && target_fpu->fpstate_active) ? regset->n : 0; +} + +int xfpregs_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + + if (!cpu_has_fxsr) + return -ENODEV; + + fpu__activate_stopped(fpu); + fpstate_sanitize_xstate(fpu); + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &fpu->state.fxsave, 0, -1); +} + +int xfpregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + int ret; + + if (!cpu_has_fxsr) + return -ENODEV; + + fpu__activate_stopped(fpu); + fpstate_sanitize_xstate(fpu); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &fpu->state.fxsave, 0, -1); + + /* + * mxcsr reserved bits must be masked to zero for security reasons. + */ + fpu->state.fxsave.mxcsr &= mxcsr_feature_mask; + + /* + * update the header bits in the xsave header, indicating the + * presence of FP and SSE state. + */ + if (cpu_has_xsave) + fpu->state.xsave.header.xfeatures |= XSTATE_FPSSE; + + return ret; +} + +int xstateregs_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + struct xregs_state *xsave; + int ret; + + if (!cpu_has_xsave) + return -ENODEV; + + fpu__activate_stopped(fpu); + + xsave = &fpu->state.xsave; + + /* + * Copy the 48bytes defined by the software first into the xstate + * memory layout in the thread struct, so that we can copy the entire + * xstateregs to the user using one user_regset_copyout(). + */ + memcpy(&xsave->i387.sw_reserved, + xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); + /* + * Copy the xstate memory layout. + */ + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + return ret; +} + +int xstateregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + struct xregs_state *xsave; + int ret; + + if (!cpu_has_xsave) + return -ENODEV; + + fpu__activate_stopped(fpu); + + xsave = &fpu->state.xsave; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + /* + * mxcsr reserved bits must be masked to zero for security reasons. + */ + xsave->i387.mxcsr &= mxcsr_feature_mask; + xsave->header.xfeatures &= xfeatures_mask; + /* + * These bits must be zero. + */ + memset(&xsave->header.reserved, 0, 48); + + return ret; +} + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + +/* + * FPU tag word conversions. + */ + +static inline unsigned short twd_i387_to_fxsr(unsigned short twd) +{ + unsigned int tmp; /* to avoid 16 bit prefixes in the code */ + + /* Transform each pair of bits into 01 (valid) or 00 (empty) */ + tmp = ~twd; + tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ + /* and move the valid bits to the lower byte. */ + tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ + tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ + tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ + + return tmp; +} + +#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16) +#define FP_EXP_TAG_VALID 0 +#define FP_EXP_TAG_ZERO 1 +#define FP_EXP_TAG_SPECIAL 2 +#define FP_EXP_TAG_EMPTY 3 + +static inline u32 twd_fxsr_to_i387(struct fxregs_state *fxsave) +{ + struct _fpxreg *st; + u32 tos = (fxsave->swd >> 11) & 7; + u32 twd = (unsigned long) fxsave->twd; + u32 tag; + u32 ret = 0xffff0000u; + int i; + + for (i = 0; i < 8; i++, twd >>= 1) { + if (twd & 0x1) { + st = FPREG_ADDR(fxsave, (i - tos) & 7); + + switch (st->exponent & 0x7fff) { + case 0x7fff: + tag = FP_EXP_TAG_SPECIAL; + break; + case 0x0000: + if (!st->significand[0] && + !st->significand[1] && + !st->significand[2] && + !st->significand[3]) + tag = FP_EXP_TAG_ZERO; + else + tag = FP_EXP_TAG_SPECIAL; + break; + default: + if (st->significand[3] & 0x8000) + tag = FP_EXP_TAG_VALID; + else + tag = FP_EXP_TAG_SPECIAL; + break; + } + } else { + tag = FP_EXP_TAG_EMPTY; + } + ret |= tag << (2 * i); + } + return ret; +} + +/* + * FXSR floating point environment conversions. + */ + +void +convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) +{ + struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave; + struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; + struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; + int i; + + env->cwd = fxsave->cwd | 0xffff0000u; + env->swd = fxsave->swd | 0xffff0000u; + env->twd = twd_fxsr_to_i387(fxsave); + +#ifdef CONFIG_X86_64 + env->fip = fxsave->rip; + env->foo = fxsave->rdp; + /* + * should be actually ds/cs at fpu exception time, but + * that information is not available in 64bit mode. + */ + env->fcs = task_pt_regs(tsk)->cs; + if (tsk == current) { + savesegment(ds, env->fos); + } else { + env->fos = tsk->thread.ds; + } + env->fos |= 0xffff0000; +#else + env->fip = fxsave->fip; + env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16); + env->foo = fxsave->foo; + env->fos = fxsave->fos; +#endif + + for (i = 0; i < 8; ++i) + memcpy(&to[i], &from[i], sizeof(to[0])); +} + +void convert_to_fxsr(struct task_struct *tsk, + const struct user_i387_ia32_struct *env) + +{ + struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave; + struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; + struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; + int i; + + fxsave->cwd = env->cwd; + fxsave->swd = env->swd; + fxsave->twd = twd_i387_to_fxsr(env->twd); + fxsave->fop = (u16) ((u32) env->fcs >> 16); +#ifdef CONFIG_X86_64 + fxsave->rip = env->fip; + fxsave->rdp = env->foo; + /* cs and ds ignored */ +#else + fxsave->fip = env->fip; + fxsave->fcs = (env->fcs & 0xffff); + fxsave->foo = env->foo; + fxsave->fos = env->fos; +#endif + + for (i = 0; i < 8; ++i) + memcpy(&to[i], &from[i], sizeof(from[0])); +} + +int fpregs_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + struct user_i387_ia32_struct env; + + fpu__activate_stopped(fpu); + + if (!static_cpu_has(X86_FEATURE_FPU)) + return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); + + if (!cpu_has_fxsr) + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &fpu->state.fsave, 0, + -1); + + fpstate_sanitize_xstate(fpu); + + if (kbuf && pos == 0 && count == sizeof(env)) { + convert_from_fxsr(kbuf, target); + return 0; + } + + convert_from_fxsr(&env, target); + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1); +} + +int fpregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + struct user_i387_ia32_struct env; + int ret; + + fpu__activate_stopped(fpu); + fpstate_sanitize_xstate(fpu); + + if (!static_cpu_has(X86_FEATURE_FPU)) + return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); + + if (!cpu_has_fxsr) + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &fpu->state.fsave, 0, + -1); + + if (pos > 0 || count < sizeof(env)) + convert_from_fxsr(&env, target); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1); + if (!ret) + convert_to_fxsr(target, &env); + + /* + * update the header bit in the xsave header, indicating the + * presence of FP. + */ + if (cpu_has_xsave) + fpu->state.xsave.header.xfeatures |= XSTATE_FP; + return ret; +} + +/* + * FPU state for core dumps. + * This is only used for a.out dumps now. + * It is declared generically using elf_fpregset_t (which is + * struct user_i387_struct) but is in fact only used for 32-bit + * dumps, so on 64-bit it is really struct user_i387_ia32_struct. + */ +int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu) +{ + struct task_struct *tsk = current; + struct fpu *fpu = &tsk->thread.fpu; + int fpvalid; + + fpvalid = fpu->fpstate_active; + if (fpvalid) + fpvalid = !fpregs_get(tsk, NULL, + 0, sizeof(struct user_i387_ia32_struct), + ufpu, NULL); + + return fpvalid; +} +EXPORT_SYMBOL(dump_fpu); + +#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c new file mode 100644 index 000000000000..50ec9af1bd51 --- /dev/null +++ b/arch/x86/kernel/fpu/signal.c @@ -0,0 +1,404 @@ +/* + * FPU signal frame handling routines. + */ + +#include <linux/compat.h> +#include <linux/cpu.h> + +#include <asm/fpu/internal.h> +#include <asm/fpu/signal.h> +#include <asm/fpu/regset.h> + +#include <asm/sigframe.h> + +static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; + +/* + * Check for the presence of extended state information in the + * user fpstate pointer in the sigcontext. + */ +static inline int check_for_xstate(struct fxregs_state __user *buf, + void __user *fpstate, + struct _fpx_sw_bytes *fx_sw) +{ + int min_xstate_size = sizeof(struct fxregs_state) + + sizeof(struct xstate_header); + unsigned int magic2; + + if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw))) + return -1; + + /* Check for the first magic field and other error scenarios. */ + if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || + fx_sw->xstate_size < min_xstate_size || + fx_sw->xstate_size > xstate_size || + fx_sw->xstate_size > fx_sw->extended_size) + return -1; + + /* + * Check for the presence of second magic word at the end of memory + * layout. This detects the case where the user just copied the legacy + * fpstate layout with out copying the extended state information + * in the memory layout. + */ + if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size)) + || magic2 != FP_XSTATE_MAGIC2) + return -1; + + return 0; +} + +/* + * Signal frame handlers. + */ +static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) +{ + if (use_fxsr()) { + struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; + struct user_i387_ia32_struct env; + struct _fpstate_ia32 __user *fp = buf; + + convert_from_fxsr(&env, tsk); + + if (__copy_to_user(buf, &env, sizeof(env)) || + __put_user(xsave->i387.swd, &fp->status) || + __put_user(X86_FXSR_MAGIC, &fp->magic)) + return -1; + } else { + struct fregs_state __user *fp = buf; + u32 swd; + if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status)) + return -1; + } + + return 0; +} + +static inline int save_xstate_epilog(void __user *buf, int ia32_frame) +{ + struct xregs_state __user *x = buf; + struct _fpx_sw_bytes *sw_bytes; + u32 xfeatures; + int err; + + /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ + sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved; + err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes)); + + if (!use_xsave()) + return err; + + err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size)); + + /* + * Read the xfeatures which we copied (directly from the cpu or + * from the state in task struct) to the user buffers. + */ + err |= __get_user(xfeatures, (__u32 *)&x->header.xfeatures); + + /* + * For legacy compatible, we always set FP/SSE bits in the bit + * vector while saving the state to the user context. This will + * enable us capturing any changes(during sigreturn) to + * the FP/SSE bits by the legacy applications which don't touch + * xfeatures in the xsave header. + * + * xsave aware apps can change the xfeatures in the xsave + * header as well as change any contents in the memory layout. + * xrestore as part of sigreturn will capture all the changes. + */ + xfeatures |= XSTATE_FPSSE; + + err |= __put_user(xfeatures, (__u32 *)&x->header.xfeatures); + + return err; +} + +static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) +{ + int err; + + if (use_xsave()) + err = copy_xregs_to_user(buf); + else if (use_fxsr()) + err = copy_fxregs_to_user((struct fxregs_state __user *) buf); + else + err = copy_fregs_to_user((struct fregs_state __user *) buf); + + if (unlikely(err) && __clear_user(buf, xstate_size)) + err = -EFAULT; + return err; +} + +/* + * Save the fpu, extended register state to the user signal frame. + * + * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save + * state is copied. + * 'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'. + * + * buf == buf_fx for 64-bit frames and 32-bit fsave frame. + * buf != buf_fx for 32-bit frames with fxstate. + * + * If the fpu, extended register state is live, save the state directly + * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise, + * copy the thread's fpu state to the user frame starting at 'buf_fx'. + * + * If this is a 32-bit frame with fxstate, put a fsave header before + * the aligned state at 'buf_fx'. + * + * For [f]xsave state, update the SW reserved fields in the [f]xsave frame + * indicating the absence/presence of the extended state to the user. + */ +int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) +{ + struct xregs_state *xsave = ¤t->thread.fpu.state.xsave; + struct task_struct *tsk = current; + int ia32_fxstate = (buf != buf_fx); + + ia32_fxstate &= (config_enabled(CONFIG_X86_32) || + config_enabled(CONFIG_IA32_EMULATION)); + + if (!access_ok(VERIFY_WRITE, buf, size)) + return -EACCES; + + if (!static_cpu_has(X86_FEATURE_FPU)) + return fpregs_soft_get(current, NULL, 0, + sizeof(struct user_i387_ia32_struct), NULL, + (struct _fpstate_ia32 __user *) buf) ? -1 : 1; + + if (fpregs_active()) { + /* Save the live register state to the user directly. */ + if (copy_fpregs_to_sigframe(buf_fx)) + return -1; + /* Update the thread's fxstate to save the fsave header. */ + if (ia32_fxstate) + copy_fxregs_to_kernel(&tsk->thread.fpu); + } else { + fpstate_sanitize_xstate(&tsk->thread.fpu); + if (__copy_to_user(buf_fx, xsave, xstate_size)) + return -1; + } + + /* Save the fsave header for the 32-bit frames. */ + if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) + return -1; + + if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) + return -1; + + return 0; +} + +static inline void +sanitize_restored_xstate(struct task_struct *tsk, + struct user_i387_ia32_struct *ia32_env, + u64 xfeatures, int fx_only) +{ + struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; + struct xstate_header *header = &xsave->header; + + if (use_xsave()) { + /* These bits must be zero. */ + memset(header->reserved, 0, 48); + + /* + * Init the state that is not present in the memory + * layout and not enabled by the OS. + */ + if (fx_only) + header->xfeatures = XSTATE_FPSSE; + else + header->xfeatures &= (xfeatures_mask & xfeatures); + } + + if (use_fxsr()) { + /* + * mscsr reserved bits must be masked to zero for security + * reasons. + */ + xsave->i387.mxcsr &= mxcsr_feature_mask; + + convert_to_fxsr(tsk, ia32_env); + } +} + +/* + * Restore the extended state if present. Otherwise, restore the FP/SSE state. + */ +static inline int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) +{ + if (use_xsave()) { + if ((unsigned long)buf % 64 || fx_only) { + u64 init_bv = xfeatures_mask & ~XSTATE_FPSSE; + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + return copy_user_to_fxregs(buf); + } else { + u64 init_bv = xfeatures_mask & ~xbv; + if (unlikely(init_bv)) + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + return copy_user_to_xregs(buf, xbv); + } + } else if (use_fxsr()) { + return copy_user_to_fxregs(buf); + } else + return copy_user_to_fregs(buf); +} + +static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) +{ + int ia32_fxstate = (buf != buf_fx); + struct task_struct *tsk = current; + struct fpu *fpu = &tsk->thread.fpu; + int state_size = xstate_size; + u64 xfeatures = 0; + int fx_only = 0; + + ia32_fxstate &= (config_enabled(CONFIG_X86_32) || + config_enabled(CONFIG_IA32_EMULATION)); + + if (!buf) { + fpu__clear(fpu); + return 0; + } + + if (!access_ok(VERIFY_READ, buf, size)) + return -EACCES; + + fpu__activate_curr(fpu); + + if (!static_cpu_has(X86_FEATURE_FPU)) + return fpregs_soft_set(current, NULL, + 0, sizeof(struct user_i387_ia32_struct), + NULL, buf) != 0; + + if (use_xsave()) { + struct _fpx_sw_bytes fx_sw_user; + if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) { + /* + * Couldn't find the extended state information in the + * memory layout. Restore just the FP/SSE and init all + * the other extended state. + */ + state_size = sizeof(struct fxregs_state); + fx_only = 1; + } else { + state_size = fx_sw_user.xstate_size; + xfeatures = fx_sw_user.xfeatures; + } + } + + if (ia32_fxstate) { + /* + * For 32-bit frames with fxstate, copy the user state to the + * thread's fpu state, reconstruct fxstate from the fsave + * header. Sanitize the copied state etc. + */ + struct fpu *fpu = &tsk->thread.fpu; + struct user_i387_ia32_struct env; + int err = 0; + + /* + * Drop the current fpu which clears fpu->fpstate_active. This ensures + * that any context-switch during the copy of the new state, + * avoids the intermediate state from getting restored/saved. + * Thus avoiding the new restored state from getting corrupted. + * We will be ready to restore/save the state only after + * fpu->fpstate_active is again set. + */ + fpu__drop(fpu); + + if (__copy_from_user(&fpu->state.xsave, buf_fx, state_size) || + __copy_from_user(&env, buf, sizeof(env))) { + fpstate_init(&fpu->state); + err = -1; + } else { + sanitize_restored_xstate(tsk, &env, xfeatures, fx_only); + } + + fpu->fpstate_active = 1; + if (use_eager_fpu()) { + preempt_disable(); + fpu__restore(fpu); + preempt_enable(); + } + + return err; + } else { + /* + * For 64-bit frames and 32-bit fsave frames, restore the user + * state to the registers directly (with exceptions handled). + */ + user_fpu_begin(); + if (copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only)) { + fpu__clear(fpu); + return -1; + } + } + + return 0; +} + +static inline int xstate_sigframe_size(void) +{ + return use_xsave() ? xstate_size + FP_XSTATE_MAGIC2_SIZE : xstate_size; +} + +/* + * Restore FPU state from a sigframe: + */ +int fpu__restore_sig(void __user *buf, int ia32_frame) +{ + void __user *buf_fx = buf; + int size = xstate_sigframe_size(); + + if (ia32_frame && use_fxsr()) { + buf_fx = buf + sizeof(struct fregs_state); + size += sizeof(struct fregs_state); + } + + return __fpu__restore_sig(buf, buf_fx, size); +} + +unsigned long +fpu__alloc_mathframe(unsigned long sp, int ia32_frame, + unsigned long *buf_fx, unsigned long *size) +{ + unsigned long frame_size = xstate_sigframe_size(); + + *buf_fx = sp = round_down(sp - frame_size, 64); + if (ia32_frame && use_fxsr()) { + frame_size += sizeof(struct fregs_state); + sp -= sizeof(struct fregs_state); + } + + *size = frame_size; + + return sp; +} +/* + * Prepare the SW reserved portion of the fxsave memory layout, indicating + * the presence of the extended state information in the memory layout + * pointed by the fpstate pointer in the sigcontext. + * This will be saved when ever the FP and extended state context is + * saved on the user stack during the signal handler delivery to the user. + */ +void fpu__init_prepare_fx_sw_frame(void) +{ + int fsave_header_size = sizeof(struct fregs_state); + int size = xstate_size + FP_XSTATE_MAGIC2_SIZE; + + if (config_enabled(CONFIG_X86_32)) + size += fsave_header_size; + + fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; + fx_sw_reserved.extended_size = size; + fx_sw_reserved.xfeatures = xfeatures_mask; + fx_sw_reserved.xstate_size = xstate_size; + + if (config_enabled(CONFIG_IA32_EMULATION)) { + fx_sw_reserved_ia32 = fx_sw_reserved; + fx_sw_reserved_ia32.extended_size += fsave_header_size; + } +} + diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c new file mode 100644 index 000000000000..ad4dd26ebd59 --- /dev/null +++ b/arch/x86/kernel/fpu/xstate.c @@ -0,0 +1,400 @@ +/* + * xsave/xrstor support. + * + * Author: Suresh Siddha <suresh.b.siddha@intel.com> + */ +#include <linux/compat.h> +#include <linux/cpu.h> + +#include <asm/fpu/api.h> +#include <asm/fpu/internal.h> +#include <asm/fpu/signal.h> +#include <asm/fpu/regset.h> + +#include <asm/tlbflush.h> + +static const char *xfeature_names[] = +{ + "x87 floating point registers" , + "SSE registers" , + "AVX registers" , + "MPX bounds registers" , + "MPX CSR" , + "AVX-512 opmask" , + "AVX-512 Hi256" , + "AVX-512 ZMM_Hi256" , + "unknown xstate feature" , +}; + +/* + * Mask of xstate features supported by the CPU and the kernel: + */ +u64 xfeatures_mask __read_mostly; + +static unsigned int xstate_offsets[XFEATURES_NR_MAX], xstate_sizes[XFEATURES_NR_MAX]; +static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; + +/* The number of supported xfeatures in xfeatures_mask: */ +static unsigned int xfeatures_nr; + +/* + * Return whether the system supports a given xfeature. + * + * Also return the name of the (most advanced) feature that the caller requested: + */ +int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) +{ + u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask; + + if (unlikely(feature_name)) { + long xfeature_idx, max_idx; + u64 xfeatures_print; + /* + * So we use FLS here to be able to print the most advanced + * feature that was requested but is missing. So if a driver + * asks about "XSTATE_SSE | XSTATE_YMM" we'll print the + * missing AVX feature - this is the most informative message + * to users: + */ + if (xfeatures_missing) + xfeatures_print = xfeatures_missing; + else + xfeatures_print = xfeatures_needed; + + xfeature_idx = fls64(xfeatures_print)-1; + max_idx = ARRAY_SIZE(xfeature_names)-1; + xfeature_idx = min(xfeature_idx, max_idx); + + *feature_name = xfeature_names[xfeature_idx]; + } + + if (xfeatures_missing) + return 0; + + return 1; +} +EXPORT_SYMBOL_GPL(cpu_has_xfeatures); + +/* + * When executing XSAVEOPT (or other optimized XSAVE instructions), if + * a processor implementation detects that an FPU state component is still + * (or is again) in its initialized state, it may clear the corresponding + * bit in the header.xfeatures field, and can skip the writeout of registers + * to the corresponding memory layout. + * + * This means that when the bit is zero, the state component might still contain + * some previous - non-initialized register state. + * + * Before writing xstate information to user-space we sanitize those components, + * to always ensure that the memory layout of a feature will be in the init state + * if the corresponding header bit is zero. This is to ensure that user-space doesn't + * see some stale state in the memory layout during signal handling, debugging etc. + */ +void fpstate_sanitize_xstate(struct fpu *fpu) +{ + struct fxregs_state *fx = &fpu->state.fxsave; + int feature_bit; + u64 xfeatures; + + if (!use_xsaveopt()) + return; + + xfeatures = fpu->state.xsave.header.xfeatures; + + /* + * None of the feature bits are in init state. So nothing else + * to do for us, as the memory layout is up to date. + */ + if ((xfeatures & xfeatures_mask) == xfeatures_mask) + return; + + /* + * FP is in init state + */ + if (!(xfeatures & XSTATE_FP)) { + fx->cwd = 0x37f; + fx->swd = 0; + fx->twd = 0; + fx->fop = 0; + fx->rip = 0; + fx->rdp = 0; + memset(&fx->st_space[0], 0, 128); + } + + /* + * SSE is in init state + */ + if (!(xfeatures & XSTATE_SSE)) + memset(&fx->xmm_space[0], 0, 256); + + /* + * First two features are FPU and SSE, which above we handled + * in a special way already: + */ + feature_bit = 0x2; + xfeatures = (xfeatures_mask & ~xfeatures) >> 2; + + /* + * Update all the remaining memory layouts according to their + * standard xstate layout, if their header bit is in the init + * state: + */ + while (xfeatures) { + if (xfeatures & 0x1) { + int offset = xstate_offsets[feature_bit]; + int size = xstate_sizes[feature_bit]; + + memcpy((void *)fx + offset, + (void *)&init_fpstate.xsave + offset, + size); + } + + xfeatures >>= 1; + feature_bit++; + } +} + +/* + * Enable the extended processor state save/restore feature. + * Called once per CPU onlining. + */ +void fpu__init_cpu_xstate(void) +{ + if (!cpu_has_xsave || !xfeatures_mask) + return; + + cr4_set_bits(X86_CR4_OSXSAVE); + xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); +} + +/* + * Record the offsets and sizes of various xstates contained + * in the XSAVE state memory layout. + * + * ( Note that certain features might be non-present, for them + * we'll have 0 offset and 0 size. ) + */ +static void __init setup_xstate_features(void) +{ + u32 eax, ebx, ecx, edx, leaf; + + xfeatures_nr = fls64(xfeatures_mask); + + for (leaf = 2; leaf < xfeatures_nr; leaf++) { + cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx); + + xstate_offsets[leaf] = ebx; + xstate_sizes[leaf] = eax; + + printk(KERN_INFO "x86/fpu: xstate_offset[%d]: %04x, xstate_sizes[%d]: %04x\n", leaf, ebx, leaf, eax); + leaf++; + } +} + +static void __init print_xstate_feature(u64 xstate_mask) +{ + const char *feature_name; + + if (cpu_has_xfeatures(xstate_mask, &feature_name)) + pr_info("x86/fpu: Supporting XSAVE feature 0x%02Lx: '%s'\n", xstate_mask, feature_name); +} + +/* + * Print out all the supported xstate features: + */ +static void __init print_xstate_features(void) +{ + print_xstate_feature(XSTATE_FP); + print_xstate_feature(XSTATE_SSE); + print_xstate_feature(XSTATE_YMM); + print_xstate_feature(XSTATE_BNDREGS); + print_xstate_feature(XSTATE_BNDCSR); + print_xstate_feature(XSTATE_OPMASK); + print_xstate_feature(XSTATE_ZMM_Hi256); + print_xstate_feature(XSTATE_Hi16_ZMM); +} + +/* + * This function sets up offsets and sizes of all extended states in + * xsave area. This supports both standard format and compacted format + * of the xsave aread. + */ +static void __init setup_xstate_comp(void) +{ + unsigned int xstate_comp_sizes[sizeof(xfeatures_mask)*8]; + int i; + + /* + * The FP xstates and SSE xstates are legacy states. They are always + * in the fixed offsets in the xsave area in either compacted form + * or standard form. + */ + xstate_comp_offsets[0] = 0; + xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space); + + if (!cpu_has_xsaves) { + for (i = 2; i < xfeatures_nr; i++) { + if (test_bit(i, (unsigned long *)&xfeatures_mask)) { + xstate_comp_offsets[i] = xstate_offsets[i]; + xstate_comp_sizes[i] = xstate_sizes[i]; + } + } + return; + } + + xstate_comp_offsets[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE; + + for (i = 2; i < xfeatures_nr; i++) { + if (test_bit(i, (unsigned long *)&xfeatures_mask)) + xstate_comp_sizes[i] = xstate_sizes[i]; + else + xstate_comp_sizes[i] = 0; + + if (i > 2) + xstate_comp_offsets[i] = xstate_comp_offsets[i-1] + + xstate_comp_sizes[i-1]; + + } +} + +/* + * setup the xstate image representing the init state + */ +static void __init setup_init_fpu_buf(void) +{ + static int on_boot_cpu = 1; + + WARN_ON_FPU(!on_boot_cpu); + on_boot_cpu = 0; + + if (!cpu_has_xsave) + return; + + setup_xstate_features(); + print_xstate_features(); + + if (cpu_has_xsaves) { + init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask; + init_fpstate.xsave.header.xfeatures = xfeatures_mask; + } + + /* + * Init all the features state with header_bv being 0x0 + */ + copy_kernel_to_xregs_booting(&init_fpstate.xsave, -1); + + /* + * Dump the init state again. This is to identify the init state + * of any feature which is not represented by all zero's. + */ + copy_xregs_to_kernel_booting(&init_fpstate.xsave); +} + +/* + * Calculate total size of enabled xstates in XCR0/xfeatures_mask. + */ +static void __init init_xstate_size(void) +{ + unsigned int eax, ebx, ecx, edx; + int i; + + if (!cpu_has_xsaves) { + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + xstate_size = ebx; + return; + } + + xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; + for (i = 2; i < 64; i++) { + if (test_bit(i, (unsigned long *)&xfeatures_mask)) { + cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + xstate_size += eax; + } + } +} + +/* + * Enable and initialize the xsave feature. + * Called once per system bootup. + */ +void __init fpu__init_system_xstate(void) +{ + unsigned int eax, ebx, ecx, edx; + static int on_boot_cpu = 1; + + WARN_ON_FPU(!on_boot_cpu); + on_boot_cpu = 0; + + if (!cpu_has_xsave) { + pr_info("x86/fpu: Legacy x87 FPU detected.\n"); + return; + } + + if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { + WARN_ON_FPU(1); + return; + } + + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + xfeatures_mask = eax + ((u64)edx << 32); + + if ((xfeatures_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { + pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask); + BUG(); + } + + /* Support only the state known to the OS: */ + xfeatures_mask = xfeatures_mask & XCNTXT_MASK; + + /* Enable xstate instructions to be able to continue with initialization: */ + fpu__init_cpu_xstate(); + + /* Recompute the context size for enabled features: */ + init_xstate_size(); + + update_regset_xstate_info(xstate_size, xfeatures_mask); + fpu__init_prepare_fx_sw_frame(); + setup_init_fpu_buf(); + setup_xstate_comp(); + + pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is 0x%x bytes, using '%s' format.\n", + xfeatures_mask, + xstate_size, + cpu_has_xsaves ? "compacted" : "standard"); +} + +/* + * Restore minimal FPU state after suspend: + */ +void fpu__resume_cpu(void) +{ + /* + * Restore XCR0 on xsave capable CPUs: + */ + if (cpu_has_xsave) + xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); +} + +/* + * Given the xsave area and a state inside, this function returns the + * address of the state. + * + * This is the API that is called to get xstate address in either + * standard format or compacted format of xsave area. + * + * Inputs: + * xsave: base address of the xsave area; + * xstate: state which is defined in xsave.h (e.g. XSTATE_FP, XSTATE_SSE, + * etc.) + * Output: + * address of the state in the xsave area. + */ +void *get_xsave_addr(struct xregs_state *xsave, int xstate) +{ + int feature = fls64(xstate) - 1; + if (!test_bit(feature, (unsigned long *)&xfeatures_mask)) + return NULL; + + return (void *)xsave + xstate_comp_offsets[feature]; +} +EXPORT_SYMBOL_GPL(get_xsave_addr); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 2b55ee6db053..5a4668136e98 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -167,7 +167,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) clear_bss(); for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) - set_intr_gate(i, early_idt_handlers[i]); + set_intr_gate(i, early_idt_handler_array[i]); load_idt((const struct desc_ptr *)&idt_descr); copy_bootdata(__va(real_mode_data)); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index d031bad9e07e..544dec4cc605 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -478,21 +478,22 @@ is486: __INIT setup_once: /* - * Set up a idt with 256 entries pointing to ignore_int, - * interrupt gates. It doesn't actually load idt - that needs - * to be done on each CPU. Interrupts are enabled elsewhere, - * when we can be relatively sure everything is ok. + * Set up a idt with 256 interrupt gates that push zero if there + * is no error code and then jump to early_idt_handler_common. + * It doesn't actually load the idt - that needs to be done on + * each CPU. Interrupts are enabled elsewhere, when we can be + * relatively sure everything is ok. */ movl $idt_table,%edi - movl $early_idt_handlers,%eax + movl $early_idt_handler_array,%eax movl $NUM_EXCEPTION_VECTORS,%ecx 1: movl %eax,(%edi) movl %eax,4(%edi) /* interrupt gate, dpl=0, present */ movl $(0x8E000000 + __KERNEL_CS),2(%edi) - addl $9,%eax + addl $EARLY_IDT_HANDLER_SIZE,%eax addl $8,%edi loop 1b @@ -524,30 +525,32 @@ setup_once: andl $0,setup_once_ref /* Once is enough, thanks */ ret -ENTRY(early_idt_handlers) +ENTRY(early_idt_handler_array) # 36(%esp) %eflags # 32(%esp) %cs # 28(%esp) %eip # 24(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .if (EXCEPTION_ERRCODE_MASK >> i) & 1 - ASM_NOP2 - .else + .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 pushl $0 # Dummy error code, to make stack frame uniform .endif pushl $i # 20(%esp) Vector number - jmp early_idt_handler + jmp early_idt_handler_common i = i + 1 + .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr -ENDPROC(early_idt_handlers) +ENDPROC(early_idt_handler_array) - /* This is global to keep gas from relaxing the jumps */ -ENTRY(early_idt_handler) +early_idt_handler_common: + /* + * The stack is the hardware frame, an error code or zero, and the + * vector number. + */ cld cmpl $2,(%esp) # X86_TRAP_NMI - je is_nmi # Ignore NMI + je .Lis_nmi # Ignore NMI cmpl $2,%ss:early_recursion_flag je hlt_loop @@ -600,10 +603,10 @@ ex_entry: pop %ecx pop %eax decl %ss:early_recursion_flag -is_nmi: +.Lis_nmi: addl $8,%esp /* drop vector number and error code */ iret -ENDPROC(early_idt_handler) +ENDPROC(early_idt_handler_common) /* This is the default interrupt "handler" :-) */ ALIGN diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ae6588b301c2..e5c27f729a38 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -321,30 +321,32 @@ bad_address: jmp bad_address __INIT - .globl early_idt_handlers -early_idt_handlers: +ENTRY(early_idt_handler_array) # 104(%rsp) %rflags # 96(%rsp) %cs # 88(%rsp) %rip # 80(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .if (EXCEPTION_ERRCODE_MASK >> i) & 1 - ASM_NOP2 - .else + .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 pushq $0 # Dummy error code, to make stack frame uniform .endif pushq $i # 72(%rsp) Vector number - jmp early_idt_handler + jmp early_idt_handler_common i = i + 1 + .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr +ENDPROC(early_idt_handler_array) -/* This is global to keep gas from relaxing the jumps */ -ENTRY(early_idt_handler) +early_idt_handler_common: + /* + * The stack is the hardware frame, an error code or zero, and the + * vector number. + */ cld cmpl $2,(%rsp) # X86_TRAP_NMI - je is_nmi # Ignore NMI + je .Lis_nmi # Ignore NMI cmpl $2,early_recursion_flag(%rip) jz 1f @@ -409,10 +411,10 @@ ENTRY(early_idt_handler) popq %rcx popq %rax decl early_recursion_flag(%rip) -is_nmi: +.Lis_nmi: addq $16,%rsp # drop vector number and error code INTERRUPT_RETURN -ENDPROC(early_idt_handler) +ENDPROC(early_idt_handler_common) __INITDATA diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 3acbff4716b0..e2449cf38b06 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -12,6 +12,7 @@ #include <linux/pm.h> #include <linux/io.h> +#include <asm/irqdomain.h> #include <asm/fixmap.h> #include <asm/hpet.h> #include <asm/time.h> @@ -305,8 +306,6 @@ static void hpet_legacy_clockevent_register(void) printk(KERN_DEBUG "hpet clockevent registered\n"); } -static int hpet_setup_msi_irq(unsigned int irq); - static void hpet_set_mode(enum clock_event_mode mode, struct clock_event_device *evt, int timer) { @@ -357,7 +356,7 @@ static void hpet_set_mode(enum clock_event_mode mode, hpet_enable_legacy_int(); } else { struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); - hpet_setup_msi_irq(hdev->irq); + irq_domain_activate_irq(irq_get_irq_data(hdev->irq)); disable_irq(hdev->irq); irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); enable_irq(hdev->irq); @@ -423,6 +422,7 @@ static int hpet_legacy_next_event(unsigned long delta, static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev); static struct hpet_dev *hpet_devs; +static struct irq_domain *hpet_domain; void hpet_msi_unmask(struct irq_data *data) { @@ -473,31 +473,6 @@ static int hpet_msi_next_event(unsigned long delta, return hpet_next_event(delta, evt, hdev->num); } -static int hpet_setup_msi_irq(unsigned int irq) -{ - if (x86_msi.setup_hpet_msi(irq, hpet_blockid)) { - irq_free_hwirq(irq); - return -EINVAL; - } - return 0; -} - -static int hpet_assign_irq(struct hpet_dev *dev) -{ - unsigned int irq = irq_alloc_hwirq(-1); - - if (!irq) - return -EINVAL; - - irq_set_handler_data(irq, dev); - - if (hpet_setup_msi_irq(irq)) - return -EINVAL; - - dev->irq = irq; - return 0; -} - static irqreturn_t hpet_interrupt_handler(int irq, void *data) { struct hpet_dev *dev = (struct hpet_dev *)data; @@ -540,9 +515,6 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) if (!(hdev->flags & HPET_DEV_VALID)) return; - if (hpet_setup_msi_irq(hdev->irq)) - return; - hdev->cpu = cpu; per_cpu(cpu_hpet_dev, cpu) = hdev; evt->name = hdev->name; @@ -574,7 +546,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) unsigned int id; unsigned int num_timers; unsigned int num_timers_used = 0; - int i; + int i, irq; if (hpet_msi_disable) return; @@ -587,6 +559,10 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) num_timers++; /* Value read out starts from 0 */ hpet_print_config(); + hpet_domain = hpet_create_irq_domain(hpet_blockid); + if (!hpet_domain) + return; + hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL); if (!hpet_devs) return; @@ -601,15 +577,16 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) if (!(cfg & HPET_TN_FSB_CAP)) continue; + irq = hpet_assign_irq(hpet_domain, hdev, hdev->num); + if (irq < 0) + continue; + + sprintf(hdev->name, "hpet%d", i); + hdev->num = i; + hdev->irq = irq; hdev->flags = 0; if (cfg & HPET_TN_PERIODIC_CAP) hdev->flags |= HPET_DEV_PERI_CAP; - hdev->num = i; - - sprintf(hdev->name, "hpet%d", i); - if (hpet_assign_irq(hdev)) - continue; - hdev->flags |= HPET_DEV_FSB_CAP; hdev->flags |= HPET_DEV_VALID; num_timers_used++; @@ -709,10 +686,6 @@ static int hpet_cpuhp_notify(struct notifier_block *n, } #else -static int hpet_setup_msi_irq(unsigned int irq) -{ - return 0; -} static void hpet_msi_capability_lookup(unsigned int start_timer) { return; diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c deleted file mode 100644 index 009183276bb7..000000000000 --- a/arch/x86/kernel/i387.c +++ /dev/null @@ -1,656 +0,0 @@ -/* - * Copyright (C) 1994 Linus Torvalds - * - * Pentium III FXSR, SSE support - * General FPU state handling cleanups - * Gareth Hughes <gareth@valinux.com>, May 2000 - */ -#include <linux/module.h> -#include <linux/regset.h> -#include <linux/sched.h> -#include <linux/slab.h> - -#include <asm/sigcontext.h> -#include <asm/processor.h> -#include <asm/math_emu.h> -#include <asm/tlbflush.h> -#include <asm/uaccess.h> -#include <asm/ptrace.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> -#include <asm/user.h> - -static DEFINE_PER_CPU(bool, in_kernel_fpu); - -void kernel_fpu_disable(void) -{ - WARN_ON(this_cpu_read(in_kernel_fpu)); - this_cpu_write(in_kernel_fpu, true); -} - -void kernel_fpu_enable(void) -{ - this_cpu_write(in_kernel_fpu, false); -} - -/* - * Were we in an interrupt that interrupted kernel mode? - * - * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that - * pair does nothing at all: the thread must not have fpu (so - * that we don't try to save the FPU state), and TS must - * be set (so that the clts/stts pair does nothing that is - * visible in the interrupted kernel thread). - * - * Except for the eagerfpu case when we return true; in the likely case - * the thread has FPU but we are not going to set/clear TS. - */ -static inline bool interrupted_kernel_fpu_idle(void) -{ - if (this_cpu_read(in_kernel_fpu)) - return false; - - if (use_eager_fpu()) - return true; - - return !__thread_has_fpu(current) && - (read_cr0() & X86_CR0_TS); -} - -/* - * Were we in user mode (or vm86 mode) when we were - * interrupted? - * - * Doing kernel_fpu_begin/end() is ok if we are running - * in an interrupt context from user mode - we'll just - * save the FPU state as required. - */ -static inline bool interrupted_user_mode(void) -{ - struct pt_regs *regs = get_irq_regs(); - return regs && user_mode(regs); -} - -/* - * Can we use the FPU in kernel mode with the - * whole "kernel_fpu_begin/end()" sequence? - * - * It's always ok in process context (ie "not interrupt") - * but it is sometimes ok even from an irq. - */ -bool irq_fpu_usable(void) -{ - return !in_interrupt() || - interrupted_user_mode() || - interrupted_kernel_fpu_idle(); -} -EXPORT_SYMBOL(irq_fpu_usable); - -void __kernel_fpu_begin(void) -{ - struct task_struct *me = current; - - this_cpu_write(in_kernel_fpu, true); - - if (__thread_has_fpu(me)) { - __save_init_fpu(me); - } else { - this_cpu_write(fpu_owner_task, NULL); - if (!use_eager_fpu()) - clts(); - } -} -EXPORT_SYMBOL(__kernel_fpu_begin); - -void __kernel_fpu_end(void) -{ - struct task_struct *me = current; - - if (__thread_has_fpu(me)) { - if (WARN_ON(restore_fpu_checking(me))) - fpu_reset_state(me); - } else if (!use_eager_fpu()) { - stts(); - } - - this_cpu_write(in_kernel_fpu, false); -} -EXPORT_SYMBOL(__kernel_fpu_end); - -void unlazy_fpu(struct task_struct *tsk) -{ - preempt_disable(); - if (__thread_has_fpu(tsk)) { - if (use_eager_fpu()) { - __save_fpu(tsk); - } else { - __save_init_fpu(tsk); - __thread_fpu_end(tsk); - } - } - preempt_enable(); -} -EXPORT_SYMBOL(unlazy_fpu); - -unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; -unsigned int xstate_size; -EXPORT_SYMBOL_GPL(xstate_size); -static struct i387_fxsave_struct fx_scratch; - -static void mxcsr_feature_mask_init(void) -{ - unsigned long mask = 0; - - if (cpu_has_fxsr) { - memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct)); - asm volatile("fxsave %0" : "+m" (fx_scratch)); - mask = fx_scratch.mxcsr_mask; - if (mask == 0) - mask = 0x0000ffbf; - } - mxcsr_feature_mask &= mask; -} - -static void init_thread_xstate(void) -{ - /* - * Note that xstate_size might be overwriten later during - * xsave_init(). - */ - - if (!cpu_has_fpu) { - /* - * Disable xsave as we do not support it if i387 - * emulation is enabled. - */ - setup_clear_cpu_cap(X86_FEATURE_XSAVE); - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - xstate_size = sizeof(struct i387_soft_struct); - return; - } - - if (cpu_has_fxsr) - xstate_size = sizeof(struct i387_fxsave_struct); - else - xstate_size = sizeof(struct i387_fsave_struct); -} - -/* - * Called at bootup to set up the initial FPU state that is later cloned - * into all processes. - */ - -void fpu_init(void) -{ - unsigned long cr0; - unsigned long cr4_mask = 0; - -#ifndef CONFIG_MATH_EMULATION - if (!cpu_has_fpu) { - pr_emerg("No FPU found and no math emulation present\n"); - pr_emerg("Giving up\n"); - for (;;) - asm volatile("hlt"); - } -#endif - if (cpu_has_fxsr) - cr4_mask |= X86_CR4_OSFXSR; - if (cpu_has_xmm) - cr4_mask |= X86_CR4_OSXMMEXCPT; - if (cr4_mask) - cr4_set_bits(cr4_mask); - - cr0 = read_cr0(); - cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ - if (!cpu_has_fpu) - cr0 |= X86_CR0_EM; - write_cr0(cr0); - - /* - * init_thread_xstate is only called once to avoid overriding - * xstate_size during boot time or during CPU hotplug. - */ - if (xstate_size == 0) - init_thread_xstate(); - - mxcsr_feature_mask_init(); - xsave_init(); - eager_fpu_init(); -} - -void fpu_finit(struct fpu *fpu) -{ - if (!cpu_has_fpu) { - finit_soft_fpu(&fpu->state->soft); - return; - } - - memset(fpu->state, 0, xstate_size); - - if (cpu_has_fxsr) { - fx_finit(&fpu->state->fxsave); - } else { - struct i387_fsave_struct *fp = &fpu->state->fsave; - fp->cwd = 0xffff037fu; - fp->swd = 0xffff0000u; - fp->twd = 0xffffffffu; - fp->fos = 0xffff0000u; - } -} -EXPORT_SYMBOL_GPL(fpu_finit); - -/* - * The _current_ task is using the FPU for the first time - * so initialize it and set the mxcsr to its default - * value at reset if we support XMM instructions and then - * remember the current task has used the FPU. - */ -int init_fpu(struct task_struct *tsk) -{ - int ret; - - if (tsk_used_math(tsk)) { - if (cpu_has_fpu && tsk == current) - unlazy_fpu(tsk); - task_disable_lazy_fpu_restore(tsk); - return 0; - } - - /* - * Memory allocation at the first usage of the FPU and other state. - */ - ret = fpu_alloc(&tsk->thread.fpu); - if (ret) - return ret; - - fpu_finit(&tsk->thread.fpu); - - set_stopped_child_used_math(tsk); - return 0; -} -EXPORT_SYMBOL_GPL(init_fpu); - -/* - * The xstateregs_active() routine is the same as the fpregs_active() routine, - * as the "regset->n" for the xstate regset will be updated based on the feature - * capabilites supported by the xsave. - */ -int fpregs_active(struct task_struct *target, const struct user_regset *regset) -{ - return tsk_used_math(target) ? regset->n : 0; -} - -int xfpregs_active(struct task_struct *target, const struct user_regset *regset) -{ - return (cpu_has_fxsr && tsk_used_math(target)) ? regset->n : 0; -} - -int xfpregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - int ret; - - if (!cpu_has_fxsr) - return -ENODEV; - - ret = init_fpu(target); - if (ret) - return ret; - - sanitize_i387_state(target); - - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu.state->fxsave, 0, -1); -} - -int xfpregs_set(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - int ret; - - if (!cpu_has_fxsr) - return -ENODEV; - - ret = init_fpu(target); - if (ret) - return ret; - - sanitize_i387_state(target); - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu.state->fxsave, 0, -1); - - /* - * mxcsr reserved bits must be masked to zero for security reasons. - */ - target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask; - - /* - * update the header bits in the xsave header, indicating the - * presence of FP and SSE state. - */ - if (cpu_has_xsave) - target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; - - return ret; -} - -int xstateregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - struct xsave_struct *xsave; - int ret; - - if (!cpu_has_xsave) - return -ENODEV; - - ret = init_fpu(target); - if (ret) - return ret; - - xsave = &target->thread.fpu.state->xsave; - - /* - * Copy the 48bytes defined by the software first into the xstate - * memory layout in the thread struct, so that we can copy the entire - * xstateregs to the user using one user_regset_copyout(). - */ - memcpy(&xsave->i387.sw_reserved, - xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); - /* - * Copy the xstate memory layout. - */ - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); - return ret; -} - -int xstateregs_set(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - struct xsave_struct *xsave; - int ret; - - if (!cpu_has_xsave) - return -ENODEV; - - ret = init_fpu(target); - if (ret) - return ret; - - xsave = &target->thread.fpu.state->xsave; - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); - /* - * mxcsr reserved bits must be masked to zero for security reasons. - */ - xsave->i387.mxcsr &= mxcsr_feature_mask; - xsave->xsave_hdr.xstate_bv &= pcntxt_mask; - /* - * These bits must be zero. - */ - memset(&xsave->xsave_hdr.reserved, 0, 48); - return ret; -} - -#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION - -/* - * FPU tag word conversions. - */ - -static inline unsigned short twd_i387_to_fxsr(unsigned short twd) -{ - unsigned int tmp; /* to avoid 16 bit prefixes in the code */ - - /* Transform each pair of bits into 01 (valid) or 00 (empty) */ - tmp = ~twd; - tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ - /* and move the valid bits to the lower byte. */ - tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ - tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ - tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ - - return tmp; -} - -#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16) -#define FP_EXP_TAG_VALID 0 -#define FP_EXP_TAG_ZERO 1 -#define FP_EXP_TAG_SPECIAL 2 -#define FP_EXP_TAG_EMPTY 3 - -static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) -{ - struct _fpxreg *st; - u32 tos = (fxsave->swd >> 11) & 7; - u32 twd = (unsigned long) fxsave->twd; - u32 tag; - u32 ret = 0xffff0000u; - int i; - - for (i = 0; i < 8; i++, twd >>= 1) { - if (twd & 0x1) { - st = FPREG_ADDR(fxsave, (i - tos) & 7); - - switch (st->exponent & 0x7fff) { - case 0x7fff: - tag = FP_EXP_TAG_SPECIAL; - break; - case 0x0000: - if (!st->significand[0] && - !st->significand[1] && - !st->significand[2] && - !st->significand[3]) - tag = FP_EXP_TAG_ZERO; - else - tag = FP_EXP_TAG_SPECIAL; - break; - default: - if (st->significand[3] & 0x8000) - tag = FP_EXP_TAG_VALID; - else - tag = FP_EXP_TAG_SPECIAL; - break; - } - } else { - tag = FP_EXP_TAG_EMPTY; - } - ret |= tag << (2 * i); - } - return ret; -} - -/* - * FXSR floating point environment conversions. - */ - -void -convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) -{ - struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave; - struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; - struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; - int i; - - env->cwd = fxsave->cwd | 0xffff0000u; - env->swd = fxsave->swd | 0xffff0000u; - env->twd = twd_fxsr_to_i387(fxsave); - -#ifdef CONFIG_X86_64 - env->fip = fxsave->rip; - env->foo = fxsave->rdp; - /* - * should be actually ds/cs at fpu exception time, but - * that information is not available in 64bit mode. - */ - env->fcs = task_pt_regs(tsk)->cs; - if (tsk == current) { - savesegment(ds, env->fos); - } else { - env->fos = tsk->thread.ds; - } - env->fos |= 0xffff0000; -#else - env->fip = fxsave->fip; - env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16); - env->foo = fxsave->foo; - env->fos = fxsave->fos; -#endif - - for (i = 0; i < 8; ++i) - memcpy(&to[i], &from[i], sizeof(to[0])); -} - -void convert_to_fxsr(struct task_struct *tsk, - const struct user_i387_ia32_struct *env) - -{ - struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave; - struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; - struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; - int i; - - fxsave->cwd = env->cwd; - fxsave->swd = env->swd; - fxsave->twd = twd_i387_to_fxsr(env->twd); - fxsave->fop = (u16) ((u32) env->fcs >> 16); -#ifdef CONFIG_X86_64 - fxsave->rip = env->fip; - fxsave->rdp = env->foo; - /* cs and ds ignored */ -#else - fxsave->fip = env->fip; - fxsave->fcs = (env->fcs & 0xffff); - fxsave->foo = env->foo; - fxsave->fos = env->fos; -#endif - - for (i = 0; i < 8; ++i) - memcpy(&to[i], &from[i], sizeof(from[0])); -} - -int fpregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - struct user_i387_ia32_struct env; - int ret; - - ret = init_fpu(target); - if (ret) - return ret; - - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); - - if (!cpu_has_fxsr) - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu.state->fsave, 0, - -1); - - sanitize_i387_state(target); - - if (kbuf && pos == 0 && count == sizeof(env)) { - convert_from_fxsr(kbuf, target); - return 0; - } - - convert_from_fxsr(&env, target); - - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1); -} - -int fpregs_set(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - struct user_i387_ia32_struct env; - int ret; - - ret = init_fpu(target); - if (ret) - return ret; - - sanitize_i387_state(target); - - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); - - if (!cpu_has_fxsr) - return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu.state->fsave, 0, - -1); - - if (pos > 0 || count < sizeof(env)) - convert_from_fxsr(&env, target); - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1); - if (!ret) - convert_to_fxsr(target, &env); - - /* - * update the header bit in the xsave header, indicating the - * presence of FP. - */ - if (cpu_has_xsave) - target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FP; - return ret; -} - -/* - * FPU state for core dumps. - * This is only used for a.out dumps now. - * It is declared generically using elf_fpregset_t (which is - * struct user_i387_struct) but is in fact only used for 32-bit - * dumps, so on 64-bit it is really struct user_i387_ia32_struct. - */ -int dump_fpu(struct pt_regs *regs, struct user_i387_struct *fpu) -{ - struct task_struct *tsk = current; - int fpvalid; - - fpvalid = !!used_math(); - if (fpvalid) - fpvalid = !fpregs_get(tsk, NULL, - 0, sizeof(struct user_i387_ia32_struct), - fpu, NULL); - - return fpvalid; -} -EXPORT_SYMBOL(dump_fpu); - -#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ - -static int __init no_387(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_FPU); - return 1; -} - -__setup("no387", no_387); - -void fpu_detect(struct cpuinfo_x86 *c) -{ - unsigned long cr0; - u16 fsw, fcw; - - fsw = fcw = 0xffff; - - cr0 = read_cr0(); - cr0 &= ~(X86_CR0_TS | X86_CR0_EM); - write_cr0(cr0); - - asm volatile("fninit ; fnstsw %0 ; fnstcw %1" - : "+m" (fsw), "+m" (fcw)); - - if (fsw == 0 && (fcw & 0x103f) == 0x003f) - set_cpu_cap(c, X86_FEATURE_FPU); - else - clear_cpu_cap(c, X86_FEATURE_FPU); - - /* The final cr0 value is set in fpu_init() */ -} diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index e7cc5370cd2f..16cb827a5b27 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -329,8 +329,8 @@ static void init_8259A(int auto_eoi) */ outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ - /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ - outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); + /* ICW2: 8259A-1 IR0-7 mapped to ISA_IRQ_VECTOR(0) */ + outb_pic(ISA_IRQ_VECTOR(0), PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); @@ -342,8 +342,8 @@ static void init_8259A(int auto_eoi) outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ - /* ICW2: 8259A-2 IR0-7 mapped to IRQ8_VECTOR */ - outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR); + /* ICW2: 8259A-2 IR0-7 mapped to ISA_IRQ_VECTOR(8) */ + outb_pic(ISA_IRQ_VECTOR(8), PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index e5952c225532..88b366487b0e 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -22,6 +22,12 @@ #define CREATE_TRACE_POINTS #include <asm/trace/irq_vectors.h> +DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); +EXPORT_PER_CPU_SYMBOL(irq_stat); + +DEFINE_PER_CPU(struct pt_regs *, irq_regs); +EXPORT_PER_CPU_SYMBOL(irq_regs); + atomic_t irq_err_count; /* Function pointer for generic interrupt vector handling */ @@ -116,6 +122,12 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); seq_puts(p, " Threshold APIC interrupts\n"); #endif +#ifdef CONFIG_X86_MCE_AMD + seq_printf(p, "%*s: ", prec, "DFR"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_deferred_error_count); + seq_puts(p, " Deferred Error APIC interrupts\n"); +#endif #ifdef CONFIG_X86_MCE seq_printf(p, "%*s: ", prec, "MCE"); for_each_online_cpu(j) @@ -136,6 +148,18 @@ int arch_show_interrupts(struct seq_file *p, int prec) #if defined(CONFIG_X86_IO_APIC) seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); #endif +#ifdef CONFIG_HAVE_KVM + seq_printf(p, "%*s: ", prec, "PIN"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis); + seq_puts(p, " Posted-interrupt notification event\n"); + + seq_printf(p, "%*s: ", prec, "PIW"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->kvm_posted_intr_wakeup_ipis); + seq_puts(p, " Posted-interrupt wakeup event\n"); +#endif return 0; } @@ -192,8 +216,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) unsigned vector = ~regs->orig_ax; unsigned irq; - irq_enter(); - exit_idle(); + entering_irq(); irq = __this_cpu_read(vector_irq[vector]); @@ -209,7 +232,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) } } - irq_exit(); + exiting_irq(); set_irq_regs(old_regs); return 1; @@ -237,6 +260,18 @@ __visible void smp_x86_platform_ipi(struct pt_regs *regs) } #ifdef CONFIG_HAVE_KVM +static void dummy_handler(void) {} +static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler; + +void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)) +{ + if (handler) + kvm_posted_intr_wakeup_handler = handler; + else + kvm_posted_intr_wakeup_handler = dummy_handler; +} +EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler); + /* * Handler for POSTED_INTERRUPT_VECTOR. */ @@ -244,16 +279,23 @@ __visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); - ack_APIC_irq(); - - irq_enter(); - - exit_idle(); - + entering_ack_irq(); inc_irq_stat(kvm_posted_intr_ipis); + exiting_irq(); + set_irq_regs(old_regs); +} - irq_exit(); +/* + * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. + */ +__visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + entering_ack_irq(); + inc_irq_stat(kvm_posted_intr_wakeup_ipis); + kvm_posted_intr_wakeup_handler(); + exiting_irq(); set_irq_regs(old_regs); } #endif diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index f9fd86a7fcc7..cd74f5978ab9 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -21,12 +21,6 @@ #include <asm/apic.h> -DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); -EXPORT_PER_CPU_SYMBOL(irq_stat); - -DEFINE_PER_CPU(struct pt_regs *, irq_regs); -EXPORT_PER_CPU_SYMBOL(irq_regs); - #ifdef CONFIG_DEBUG_STACKOVERFLOW int sysctl_panic_on_stackoverflow __read_mostly; diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 394e643d7830..bc4604e500a3 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -20,12 +20,6 @@ #include <asm/idle.h> #include <asm/apic.h> -DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); -EXPORT_PER_CPU_SYMBOL(irq_stat); - -DEFINE_PER_CPU(struct pt_regs *, irq_regs); -EXPORT_PER_CPU_SYMBOL(irq_regs); - int sysctl_panic_on_stackoverflow; /* diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 15d741ddfeeb..dc5fa6a1e8d6 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -10,12 +10,6 @@ #include <asm/apic.h> #include <asm/trace/irq_vectors.h> -static inline void irq_work_entering_irq(void) -{ - irq_enter(); - ack_APIC_irq(); -} - static inline void __smp_irq_work_interrupt(void) { inc_irq_stat(apic_irq_work_irqs); @@ -24,14 +18,14 @@ static inline void __smp_irq_work_interrupt(void) __visible void smp_irq_work_interrupt(struct pt_regs *regs) { - irq_work_entering_irq(); + ipi_entering_ack_irq(); __smp_irq_work_interrupt(); exiting_irq(); } __visible void smp_trace_irq_work_interrupt(struct pt_regs *regs) { - irq_work_entering_irq(); + ipi_entering_ack_irq(); trace_irq_work_entry(IRQ_WORK_VECTOR); __smp_irq_work_interrupt(); trace_irq_work_exit(IRQ_WORK_VECTOR); diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index cd10a6437264..a3a5e158ed69 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -86,7 +86,7 @@ void __init init_IRQ(void) int i; /* - * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15. + * On cpu 0, Assign ISA_IRQ_VECTOR(irq) to IRQ 0..15. * If these IRQ's are handled by legacy interrupt-controllers like PIC, * then this configuration will likely be static after the boot. If * these IRQ's are handled by more mordern controllers like IO-APIC, @@ -94,7 +94,7 @@ void __init init_IRQ(void) * irq's migrate etc. */ for (i = 0; i < nr_legacy_irqs(); i++) - per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i; + per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = i; x86_init.irqs.intr_init(); } @@ -135,6 +135,10 @@ static void __init apic_intr_init(void) alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); #endif +#ifdef CONFIG_X86_MCE_AMD + alloc_intr_gate(DEFERRED_ERROR_VECTOR, deferred_error_interrupt); +#endif + #ifdef CONFIG_X86_LOCAL_APIC /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); @@ -144,6 +148,8 @@ static void __init apic_intr_init(void) #ifdef CONFIG_HAVE_KVM /* IPI for KVM to deliver posted interrupt */ alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi); + /* IPI for KVM to deliver interrupt to wake up tasks */ + alloc_intr_gate(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi); #endif /* IPI vectors for APIC spurious and error interrupts */ diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 9435620062df..1681504e44a4 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -584,6 +584,39 @@ static void kvm_kick_cpu(int cpu) kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid); } + +#ifdef CONFIG_QUEUED_SPINLOCKS + +#include <asm/qspinlock.h> + +static void kvm_wait(u8 *ptr, u8 val) +{ + unsigned long flags; + + if (in_nmi()) + return; + + local_irq_save(flags); + + if (READ_ONCE(*ptr) != val) + goto out; + + /* + * halt until it's our turn and kicked. Note that we do safe halt + * for irq enabled case to avoid hang when lock info is overwritten + * in irq spinlock slowpath and no spurious interrupt occur to save us. + */ + if (arch_irqs_disabled_flags(flags)) + halt(); + else + safe_halt(); + +out: + local_irq_restore(flags); +} + +#else /* !CONFIG_QUEUED_SPINLOCKS */ + enum kvm_contention_stat { TAKEN_SLOW, TAKEN_SLOW_PICKUP, @@ -817,6 +850,8 @@ static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) } } +#endif /* !CONFIG_QUEUED_SPINLOCKS */ + /* * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. */ @@ -828,8 +863,16 @@ void __init kvm_spinlock_init(void) if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) return; +#ifdef CONFIG_QUEUED_SPINLOCKS + __pv_init_lock_hash(); + pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; + pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); + pv_lock_ops.wait = kvm_wait; + pv_lock_ops.kick = kvm_kick_cpu; +#else /* !CONFIG_QUEUED_SPINLOCKS */ pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning); pv_lock_ops.unlock_kick = kvm_unlock_kick; +#endif } static __init int kvm_spinlock_init_jump(void) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 2d2a237f2c73..30ca7607cbbb 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -19,8 +19,8 @@ #include <linux/module.h> #include <linux/smp.h> #include <linux/pci.h> -#include <linux/irqdomain.h> +#include <asm/irqdomain.h> #include <asm/mtrr.h> #include <asm/mpspec.h> #include <asm/pgalloc.h> @@ -113,11 +113,6 @@ static void __init MP_bus_info(struct mpc_bus *m) pr_warn("Unknown bustype %s - ignoring\n", str); } -static struct irq_domain_ops mp_ioapic_irqdomain_ops = { - .map = mp_irqdomain_map, - .unmap = mp_irqdomain_unmap, -}; - static void __init MP_ioapic_info(struct mpc_ioapic *m) { struct ioapic_domain_cfg cfg = { diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index bbb6c7316341..33ee3e0efd65 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -8,11 +8,33 @@ #include <asm/paravirt.h> +#ifdef CONFIG_QUEUED_SPINLOCKS +__visible void __native_queued_spin_unlock(struct qspinlock *lock) +{ + native_queued_spin_unlock(lock); +} + +PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock); + +bool pv_is_native_spin_unlock(void) +{ + return pv_lock_ops.queued_spin_unlock.func == + __raw_callee_save___native_queued_spin_unlock; +} +#endif + struct pv_lock_ops pv_lock_ops = { #ifdef CONFIG_SMP +#ifdef CONFIG_QUEUED_SPINLOCKS + .queued_spin_lock_slowpath = native_queued_spin_lock_slowpath, + .queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock), + .wait = paravirt_nop, + .kick = paravirt_nop, +#else /* !CONFIG_QUEUED_SPINLOCKS */ .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop), .unlock_kick = paravirt_nop, -#endif +#endif /* !CONFIG_QUEUED_SPINLOCKS */ +#endif /* SMP */ }; EXPORT_SYMBOL(pv_lock_ops); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c614dd492f5f..58bcfb67c01f 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -154,7 +154,9 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, ret = paravirt_patch_ident_64(insnbuf, len); else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || +#ifdef CONFIG_X86_32 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) || +#endif type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) || type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64)) /* If operation requires a jmp, then jmp */ @@ -371,7 +373,7 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .load_sp0 = native_load_sp0, -#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) +#if defined(CONFIG_X86_32) .irq_enable_sysexit = native_irq_enable_sysexit, #endif #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index d9f32e6d6ab6..e1b013696dde 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -12,6 +12,10 @@ DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); DEF_NATIVE(pv_cpu_ops, clts, "clts"); DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) +DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)"); +#endif + unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) { /* arg in %eax, return in %eax */ @@ -24,6 +28,8 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) return 0; } +extern bool pv_is_native_spin_unlock(void); + unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) { @@ -47,14 +53,22 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, write_cr3); PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_cpu_ops, read_tsc); - - patch_site: - ret = paravirt_patch_insns(ibuf, len, start, end); - break; +#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) + case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): + if (pv_is_native_spin_unlock()) { + start = start_pv_lock_ops_queued_spin_unlock; + end = end_pv_lock_ops_queued_spin_unlock; + goto patch_site; + } +#endif default: ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); break; + +patch_site: + ret = paravirt_patch_insns(ibuf, len, start, end); + break; } #undef PATCH_SITE return ret; diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index a1da6737ba5b..8aa05583bc42 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -21,6 +21,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); DEF_NATIVE(, mov32, "mov %edi, %eax"); DEF_NATIVE(, mov64, "mov %rdi, %rax"); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) +DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)"); +#endif + unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) { return paravirt_patch_insns(insnbuf, len, @@ -33,6 +37,8 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) start__mov64, end__mov64); } +extern bool pv_is_native_spin_unlock(void); + unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) { @@ -49,7 +55,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_irq_ops, save_fl); PATCH_SITE(pv_irq_ops, irq_enable); PATCH_SITE(pv_irq_ops, irq_disable); - PATCH_SITE(pv_cpu_ops, irq_enable_sysexit); PATCH_SITE(pv_cpu_ops, usergs_sysret32); PATCH_SITE(pv_cpu_ops, usergs_sysret64); PATCH_SITE(pv_cpu_ops, swapgs); @@ -59,14 +64,22 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); - - patch_site: - ret = paravirt_patch_insns(ibuf, len, start, end); - break; +#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) + case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): + if (pv_is_native_spin_unlock()) { + start = start_pv_lock_ops_queued_spin_unlock; + end = end_pv_lock_ops_queued_spin_unlock; + goto patch_site; + } +#endif default: ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); break; + +patch_site: + ret = paravirt_patch_insns(ibuf, len, start, end); + break; } #undef PATCH_SITE return ret; diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index a25e202bb319..353972c1946c 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -140,6 +140,51 @@ void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr, free_pages((unsigned long)vaddr, get_order(size)); } +void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t gfp, struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + void *memory; + + gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + + if (dma_alloc_from_coherent(dev, size, dma_handle, &memory)) + return memory; + + if (!dev) + dev = &x86_dma_fallback_dev; + + if (!is_device_dma_capable(dev)) + return NULL; + + if (!ops->alloc) + return NULL; + + memory = ops->alloc(dev, size, dma_handle, + dma_alloc_coherent_gfp_flags(dev, gfp), attrs); + debug_dma_alloc_coherent(dev, size, *dma_handle, memory); + + return memory; +} +EXPORT_SYMBOL(dma_alloc_attrs); + +void dma_free_attrs(struct device *dev, size_t size, + void *vaddr, dma_addr_t bus, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + WARN_ON(irqs_disabled()); /* for portability */ + + if (dma_release_from_coherent(dev, get_order(size), vaddr)) + return; + + debug_dma_free_coherent(dev, size, vaddr, bus); + if (ops->free) + ops->free(dev, size, vaddr, bus, attrs); +} +EXPORT_SYMBOL(dma_free_attrs); + /* * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel * parameter documentation. diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 6e338e3b1dc0..b53f6c939bc2 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -25,8 +25,7 @@ #include <asm/idle.h> #include <asm/uaccess.h> #include <asm/mwait.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> +#include <asm/fpu/internal.h> #include <asm/debugreg.h> #include <asm/nmi.h> #include <asm/tlbflush.h> @@ -76,9 +75,6 @@ void idle_notifier_unregister(struct notifier_block *n) EXPORT_SYMBOL_GPL(idle_notifier_unregister); #endif -struct kmem_cache *task_xstate_cachep; -EXPORT_SYMBOL_GPL(task_xstate_cachep); - /* * this gets called so that we can store lazy state into memory and copy the * current task into the new thread. @@ -87,36 +83,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { *dst = *src; - dst->thread.fpu_counter = 0; - dst->thread.fpu.has_fpu = 0; - dst->thread.fpu.state = NULL; - task_disable_lazy_fpu_restore(dst); - if (tsk_used_math(src)) { - int err = fpu_alloc(&dst->thread.fpu); - if (err) - return err; - fpu_copy(dst, src); - } - return 0; -} - -void free_thread_xstate(struct task_struct *tsk) -{ - fpu_free(&tsk->thread.fpu); -} - -void arch_release_task_struct(struct task_struct *tsk) -{ - free_thread_xstate(tsk); -} - -void arch_task_cache_init(void) -{ - task_xstate_cachep = - kmem_cache_create("task_xstate", xstate_size, - __alignof__(union thread_xstate), - SLAB_PANIC | SLAB_NOTRACK, NULL); - setup_xstate_comp(); + return fpu__copy(&dst->thread.fpu, &src->thread.fpu); } /* @@ -127,6 +94,7 @@ void exit_thread(void) struct task_struct *me = current; struct thread_struct *t = &me->thread; unsigned long *bp = t->io_bitmap_ptr; + struct fpu *fpu = &t->fpu; if (bp) { struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu()); @@ -142,7 +110,7 @@ void exit_thread(void) kfree(bp); } - drop_fpu(me); + fpu__drop(fpu); } void flush_thread(void) @@ -152,19 +120,7 @@ void flush_thread(void) flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - if (!use_eager_fpu()) { - /* FPU state will be reallocated lazily at the first use. */ - drop_fpu(tsk); - free_thread_xstate(tsk); - } else { - if (!tsk_used_math(tsk)) { - /* kthread execs. TODO: cleanup this horror. */ - if (WARN_ON(init_fpu(tsk))) - force_sig(SIGKILL, tsk); - user_fpu_begin(); - } - restore_init_xstate(); - } + fpu__clear(&tsk->thread.fpu); } static void hard_disable_TSC(void) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 8ed2106b06da..c09c99ccf3e3 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -39,8 +39,7 @@ #include <asm/pgtable.h> #include <asm/ldt.h> #include <asm/processor.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> +#include <asm/fpu/internal.h> #include <asm/desc.h> #ifdef CONFIG_MATH_EMULATION #include <asm/math_emu.h> @@ -242,14 +241,16 @@ __visible __notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, - *next = &next_p->thread; + *next = &next_p->thread; + struct fpu *prev_fpu = &prev->fpu; + struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); - fpu_switch_t fpu; + fpu_switch_t fpu_switch; /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - fpu = switch_fpu_prepare(prev_p, next_p, cpu); + fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu); /* * Save away %gs. No need to save %fs, as it was saved on the @@ -296,19 +297,16 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so * the GDT and LDT are properly updated, and must be - * done before math_state_restore, so the TS bit is up + * done before fpu__restore(), so the TS bit is up * to date. */ arch_end_context_switch(next_p); /* - * Reload esp0, kernel_stack, and current_top_of_stack. This changes + * Reload esp0 and cpu_current_top_of_stack. This changes * current_thread_info(). */ load_sp0(tss, next); - this_cpu_write(kernel_stack, - (unsigned long)task_stack_page(next_p) + - THREAD_SIZE); this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE); @@ -319,7 +317,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (prev->gs | next->gs) lazy_load_gs(next->gs); - switch_fpu_finish(next_p, fpu); + switch_fpu_finish(next_fpu, fpu_switch); this_cpu_write(current_task, next_p); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ddfdbf74f174..843f92e4c711 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -38,8 +38,7 @@ #include <asm/pgtable.h> #include <asm/processor.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> +#include <asm/fpu/internal.h> #include <asm/mmu_context.h> #include <asm/prctl.h> #include <asm/desc.h> @@ -274,12 +273,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread; struct thread_struct *next = &next_p->thread; + struct fpu *prev_fpu = &prev->fpu; + struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); unsigned fsindex, gsindex; - fpu_switch_t fpu; + fpu_switch_t fpu_switch; - fpu = switch_fpu_prepare(prev_p, next_p, cpu); + fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu); /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). @@ -299,7 +300,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Leave lazy mode, flushing any hypercalls made here. This * must be done after loading TLS entries in the GDT but before * loading segments that might reference them, and and it must - * be done before math_state_restore, so the TS bit is up to + * be done before fpu__restore(), so the TS bit is up to * date. */ arch_end_context_switch(next_p); @@ -391,7 +392,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) wrmsrl(MSR_KERNEL_GS_BASE, next->gs); prev->gsindex = gsindex; - switch_fpu_finish(next_p, fpu); + switch_fpu_finish(next_fpu, fpu_switch); /* * Switch the PDA and FPU contexts. @@ -409,9 +410,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* Reload esp0 and ss1. This changes current_thread_info(). */ load_sp0(tss, next); - this_cpu_write(kernel_stack, - (unsigned long)task_stack_page(next_p) + THREAD_SIZE); - /* * Now maybe reload the debug registers and handle I/O bitmaps */ diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index a7bc79480719..9be72bc3613f 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -11,7 +11,6 @@ #include <linux/errno.h> #include <linux/slab.h> #include <linux/ptrace.h> -#include <linux/regset.h> #include <linux/tracehook.h> #include <linux/user.h> #include <linux/elf.h> @@ -28,8 +27,9 @@ #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/processor.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> +#include <asm/fpu/internal.h> +#include <asm/fpu/signal.h> +#include <asm/fpu/regset.h> #include <asm/debugreg.h> #include <asm/ldt.h> #include <asm/desc.h> @@ -1297,7 +1297,7 @@ static struct user_regset x86_64_regsets[] __read_mostly = { .core_note_type = NT_PRFPREG, .n = sizeof(struct user_i387_struct) / sizeof(long), .size = sizeof(long), .align = sizeof(long), - .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set + .active = regset_xregset_fpregs_active, .get = xfpregs_get, .set = xfpregs_set }, [REGSET_XSTATE] = { .core_note_type = NT_X86_XSTATE, @@ -1338,13 +1338,13 @@ static struct user_regset x86_32_regsets[] __read_mostly = { .core_note_type = NT_PRFPREG, .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .active = fpregs_active, .get = fpregs_get, .set = fpregs_set + .active = regset_fpregs_active, .get = fpregs_get, .set = fpregs_set }, [REGSET_XFP] = { .core_note_type = NT_PRXFPREG, .n = sizeof(struct user32_fxsr_struct) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set + .active = regset_xregset_fpregs_active, .get = xfpregs_get, .set = xfpregs_set }, [REGSET_XSTATE] = { .core_note_type = NT_X86_XSTATE, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d74ac33290ae..8d04a7594a03 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1222,8 +1222,7 @@ void __init setup_arch(char **cmdline_p) init_cpu_to_node(); init_apic_mappings(); - if (x86_io_apic_ops.init) - x86_io_apic_ops.init(); + io_apic_init_mappings(); kvm_guest_init(); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 1ea14fd53933..206996c1669d 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -26,8 +26,8 @@ #include <asm/processor.h> #include <asm/ucontext.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> +#include <asm/fpu/internal.h> +#include <asm/fpu/signal.h> #include <asm/vdso.h> #include <asm/mce.h> #include <asm/sighandling.h> @@ -103,7 +103,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) get_user_ex(buf, &sc->fpstate); } get_user_catch(err); - err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32)); + err |= fpu__restore_sig(buf, config_enabled(CONFIG_X86_32)); force_iret(); @@ -199,6 +199,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, unsigned long sp = regs->sp; unsigned long buf_fx = 0; int onsigstack = on_sig_stack(sp); + struct fpu *fpu = ¤t->thread.fpu; /* redzone */ if (config_enabled(CONFIG_X86_64)) @@ -218,9 +219,9 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, } } - if (used_math()) { - sp = alloc_mathframe(sp, config_enabled(CONFIG_X86_32), - &buf_fx, &math_size); + if (fpu->fpstate_active) { + sp = fpu__alloc_mathframe(sp, config_enabled(CONFIG_X86_32), + &buf_fx, &math_size); *fpstate = (void __user *)sp; } @@ -234,8 +235,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, return (void __user *)-1L; /* save i387 and extended state */ - if (used_math() && - save_xstate_sig(*fpstate, (void __user *)buf_fx, math_size) < 0) + if (fpu->fpstate_active && + copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size) < 0) return (void __user *)-1L; return (void __user *)sp; @@ -593,6 +594,22 @@ badframe: return 0; } +static inline int is_ia32_compat_frame(void) +{ + return config_enabled(CONFIG_IA32_EMULATION) && + test_thread_flag(TIF_IA32); +} + +static inline int is_ia32_frame(void) +{ + return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame(); +} + +static inline int is_x32_frame(void) +{ + return config_enabled(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32); +} + static int setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) { @@ -617,6 +634,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) { bool stepping, failed; + struct fpu *fpu = ¤t->thread.fpu; /* Are we from a system call? */ if (syscall_get_nr(current, regs) >= 0) { @@ -665,8 +683,8 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) /* * Ensure the signal handler starts with the new fpu state. */ - if (used_math()) - fpu_reset_state(current); + if (fpu->fpstate_active) + fpu__clear(fpu); } signal_setup_done(failed, ksig, stepping); } diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index be8e1bde07aa..15aaa69bbb5e 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -170,8 +170,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) asmlinkage __visible void smp_reboot_interrupt(void) { - ack_APIC_irq(); - irq_enter(); + ipi_entering_ack_irq(); stop_this_cpu(NULL); irq_exit(); } @@ -265,12 +264,6 @@ __visible void smp_reschedule_interrupt(struct pt_regs *regs) */ } -static inline void smp_entering_irq(void) -{ - ack_APIC_irq(); - irq_enter(); -} - __visible void smp_trace_reschedule_interrupt(struct pt_regs *regs) { /* @@ -279,7 +272,7 @@ __visible void smp_trace_reschedule_interrupt(struct pt_regs *regs) * scheduler_ipi(). This is OK, since those functions are allowed * to nest. */ - smp_entering_irq(); + ipi_entering_ack_irq(); trace_reschedule_entry(RESCHEDULE_VECTOR); __smp_reschedule_interrupt(); trace_reschedule_exit(RESCHEDULE_VECTOR); @@ -297,14 +290,14 @@ static inline void __smp_call_function_interrupt(void) __visible void smp_call_function_interrupt(struct pt_regs *regs) { - smp_entering_irq(); + ipi_entering_ack_irq(); __smp_call_function_interrupt(); exiting_irq(); } __visible void smp_trace_call_function_interrupt(struct pt_regs *regs) { - smp_entering_irq(); + ipi_entering_ack_irq(); trace_call_function_entry(CALL_FUNCTION_VECTOR); __smp_call_function_interrupt(); trace_call_function_exit(CALL_FUNCTION_VECTOR); @@ -319,14 +312,14 @@ static inline void __smp_call_function_single_interrupt(void) __visible void smp_call_function_single_interrupt(struct pt_regs *regs) { - smp_entering_irq(); + ipi_entering_ack_irq(); __smp_call_function_single_interrupt(); exiting_irq(); } __visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs) { - smp_entering_irq(); + ipi_entering_ack_irq(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); __smp_call_function_single_interrupt(); trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 50e547eac8cd..d6e804d3f5e3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -68,8 +68,7 @@ #include <asm/mwait.h> #include <asm/apic.h> #include <asm/io_apic.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> +#include <asm/fpu/internal.h> #include <asm/setup.h> #include <asm/uv/uv.h> #include <linux/mc146818rtc.h> @@ -514,6 +513,40 @@ void __inquire_remote_apic(int apicid) } /* + * The Multiprocessor Specification 1.4 (1997) example code suggests + * that there should be a 10ms delay between the BSP asserting INIT + * and de-asserting INIT, when starting a remote processor. + * But that slows boot and resume on modern processors, which include + * many cores and don't require that delay. + * + * Cmdline "init_cpu_udelay=" is available to over-ride this delay. + * Modern processor families are quirked to remove the delay entirely. + */ +#define UDELAY_10MS_DEFAULT 10000 + +static unsigned int init_udelay = UDELAY_10MS_DEFAULT; + +static int __init cpu_init_udelay(char *str) +{ + get_option(&str, &init_udelay); + + return 0; +} +early_param("cpu_init_udelay", cpu_init_udelay); + +static void __init smp_quirk_init_udelay(void) +{ + /* if cmdline changed it from default, leave it alone */ + if (init_udelay != UDELAY_10MS_DEFAULT) + return; + + /* if modern processor, use no delay */ + if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) || + ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) + init_udelay = 0; +} + +/* * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this * won't ... remember to clear down the APIC, etc later. @@ -555,7 +588,7 @@ wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) static int wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) { - unsigned long send_status, accept_status = 0; + unsigned long send_status = 0, accept_status = 0; int maxlvt, num_starts, j; maxlvt = lapic_get_maxlvt(); @@ -583,7 +616,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); - mdelay(10); + udelay(init_udelay); pr_debug("Deasserting INIT\n"); @@ -651,6 +684,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) * Give the other CPU some time to accept the IPI. */ udelay(200); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); accept_status = (apic_read(APIC_ESR) & 0xEF); @@ -792,8 +826,6 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle) clear_tsk_thread_flag(idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); #endif - per_cpu(kernel_stack, cpu) = - (unsigned long)task_stack_page(idle) + THREAD_SIZE; } /* @@ -1176,6 +1208,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) uv_system_init(); set_mtrr_aps_delayed_init(); + + smp_quirk_init_udelay(); } void arch_enable_nonboot_cpus_begin(void) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 324ab5247687..b5e7687668c5 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -54,8 +54,7 @@ #include <asm/ftrace.h> #include <asm/traps.h> #include <asm/desc.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> +#include <asm/fpu/internal.h> #include <asm/mce.h> #include <asm/fixmap.h> #include <asm/mach_traps.h> @@ -372,7 +371,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; - struct xsave_struct *xsave_buf; + struct xregs_state *xsave_buf; enum ctx_state prev_state; struct bndcsr *bndcsr; siginfo_t *info; @@ -396,8 +395,8 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) * It is not directly accessible, though, so we need to * do an xsave and then pull it out of the xsave buffer. */ - fpu_save_init(&tsk->thread.fpu); - xsave_buf = &(tsk->thread.fpu.state->xsave); + copy_fpregs_to_fpstate(&tsk->thread.fpu); + xsave_buf = &(tsk->thread.fpu.state.xsave); bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR); if (!bndcsr) goto exit_trap; @@ -709,8 +708,8 @@ NOKPROBE_SYMBOL(do_debug); static void math_error(struct pt_regs *regs, int error_code, int trapnr) { struct task_struct *task = current; + struct fpu *fpu = &task->thread.fpu; siginfo_t info; - unsigned short err; char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" : "simd exception"; @@ -718,8 +717,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) return; conditional_sti(regs); - if (!user_mode(regs)) - { + if (!user_mode(regs)) { if (!fixup_exception(regs)) { task->thread.error_code = error_code; task->thread.trap_nr = trapnr; @@ -731,62 +729,20 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) /* * Save the info for the exception handler and clear the error. */ - unlazy_fpu(task); - task->thread.trap_nr = trapnr; + fpu__save(fpu); + + task->thread.trap_nr = trapnr; task->thread.error_code = error_code; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_addr = (void __user *)uprobe_get_trap_addr(regs); - if (trapnr == X86_TRAP_MF) { - unsigned short cwd, swd; - /* - * (~cwd & swd) will mask out exceptions that are not set to unmasked - * status. 0x3f is the exception bits in these regs, 0x200 is the - * C1 reg you need in case of a stack fault, 0x040 is the stack - * fault bit. We should only be taking one exception at a time, - * so if this combination doesn't produce any single exception, - * then we have a bad program that isn't synchronizing its FPU usage - * and it will suffer the consequences since we won't be able to - * fully reproduce the context of the exception - */ - cwd = get_fpu_cwd(task); - swd = get_fpu_swd(task); + info.si_signo = SIGFPE; + info.si_errno = 0; + info.si_addr = (void __user *)uprobe_get_trap_addr(regs); - err = swd & ~cwd; - } else { - /* - * The SIMD FPU exceptions are handled a little differently, as there - * is only a single status/control register. Thus, to determine which - * unmasked exception was caught we must mask the exception mask bits - * at 0x1f80, and then use these to mask the exception bits at 0x3f. - */ - unsigned short mxcsr = get_fpu_mxcsr(task); - err = ~(mxcsr >> 7) & mxcsr; - } + info.si_code = fpu__exception_code(fpu, trapnr); - if (err & 0x001) { /* Invalid op */ - /* - * swd & 0x240 == 0x040: Stack Underflow - * swd & 0x240 == 0x240: Stack Overflow - * User must clear the SF bit (0x40) if set - */ - info.si_code = FPE_FLTINV; - } else if (err & 0x004) { /* Divide by Zero */ - info.si_code = FPE_FLTDIV; - } else if (err & 0x008) { /* Overflow */ - info.si_code = FPE_FLTOVF; - } else if (err & 0x012) { /* Denormal, Underflow */ - info.si_code = FPE_FLTUND; - } else if (err & 0x020) { /* Precision */ - info.si_code = FPE_FLTRES; - } else { - /* - * If we're using IRQ 13, or supposedly even some trap - * X86_TRAP_MF implementations, it's possible - * we get a spurious trap, which is not an error. - */ + /* Retry when we get spurious exceptions: */ + if (!info.si_code) return; - } + force_sig_info(SIGFPE, &info, task); } @@ -813,62 +769,8 @@ dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { conditional_sti(regs); -#if 0 - /* No need to warn about this any longer. */ - pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); -#endif -} - -asmlinkage __visible void __attribute__((weak)) smp_thermal_interrupt(void) -{ -} - -asmlinkage __visible void __attribute__((weak)) smp_threshold_interrupt(void) -{ } -/* - * 'math_state_restore()' saves the current math information in the - * old math state array, and gets the new ones from the current task - * - * Careful.. There are problems with IBM-designed IRQ13 behaviour. - * Don't touch unless you *really* know how it works. - * - * Must be called with kernel preemption disabled (eg with local - * local interrupts as in the case of do_device_not_available). - */ -void math_state_restore(void) -{ - struct task_struct *tsk = current; - - if (!tsk_used_math(tsk)) { - local_irq_enable(); - /* - * does a slab alloc which can sleep - */ - if (init_fpu(tsk)) { - /* - * ran out of memory! - */ - do_group_exit(SIGKILL); - return; - } - local_irq_disable(); - } - - /* Avoid __kernel_fpu_begin() right after __thread_fpu_begin() */ - kernel_fpu_disable(); - __thread_fpu_begin(tsk); - if (unlikely(restore_fpu_checking(tsk))) { - fpu_reset_state(tsk); - force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); - } else { - tsk->thread.fpu_counter++; - } - kernel_fpu_enable(); -} -EXPORT_SYMBOL_GPL(math_state_restore); - dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code) { @@ -889,7 +791,7 @@ do_device_not_available(struct pt_regs *regs, long error_code) return; } #endif - math_state_restore(); /* interrupts still off */ + fpu__restore(¤t->thread.fpu); /* interrupts still off */ #ifdef CONFIG_X86_32 conditional_sti(regs); #endif @@ -997,8 +899,8 @@ void __init trap_init(void) #endif #ifdef CONFIG_X86_32 - set_system_trap_gate(SYSCALL_VECTOR, &system_call); - set_bit(SYSCALL_VECTOR, used_vectors); + set_system_trap_gate(IA32_SYSCALL_VECTOR, &system_call); + set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif /* diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 234b0722de53..3cee10abf01d 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -111,11 +111,9 @@ EXPORT_SYMBOL_GPL(x86_platform); #if defined(CONFIG_PCI_MSI) struct x86_msi_ops x86_msi = { .setup_msi_irqs = native_setup_msi_irqs, - .compose_msi_msg = native_compose_msi_msg, .teardown_msi_irq = native_teardown_msi_irq, .teardown_msi_irqs = default_teardown_msi_irqs, .restore_msi_irqs = default_restore_msi_irqs, - .setup_hpet_msi = default_setup_hpet_msi, }; /* MSI arch specific hooks */ @@ -141,13 +139,6 @@ void arch_restore_msi_irqs(struct pci_dev *dev) #endif struct x86_io_apic_ops x86_io_apic_ops = { - .init = native_io_apic_init_mappings, .read = native_io_apic_read, - .write = native_io_apic_write, - .modify = native_io_apic_modify, .disable = native_disable_io_apic, - .print_entries = native_io_apic_print_entries, - .set_affinity = native_ioapic_set_affinity, - .setup_entry = native_setup_ioapic_entry, - .eoi_ioapic_pin = native_eoi_ioapic_pin, }; diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c deleted file mode 100644 index 87a815b85f3e..000000000000 --- a/arch/x86/kernel/xsave.c +++ /dev/null @@ -1,724 +0,0 @@ -/* - * xsave/xrstor support. - * - * Author: Suresh Siddha <suresh.b.siddha@intel.com> - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/bootmem.h> -#include <linux/compat.h> -#include <linux/cpu.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> -#include <asm/sigframe.h> -#include <asm/tlbflush.h> -#include <asm/xcr.h> - -/* - * Supported feature mask by the CPU and the kernel. - */ -u64 pcntxt_mask; - -/* - * Represents init state for the supported extended state. - */ -struct xsave_struct *init_xstate_buf; - -static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; -static unsigned int *xstate_offsets, *xstate_sizes; -static unsigned int xstate_comp_offsets[sizeof(pcntxt_mask)*8]; -static unsigned int xstate_features; - -/* - * If a processor implementation discern that a processor state component is - * in its initialized state it may modify the corresponding bit in the - * xsave_hdr.xstate_bv as '0', with out modifying the corresponding memory - * layout in the case of xsaveopt. While presenting the xstate information to - * the user, we always ensure that the memory layout of a feature will be in - * the init state if the corresponding header bit is zero. This is to ensure - * that the user doesn't see some stale state in the memory layout during - * signal handling, debugging etc. - */ -void __sanitize_i387_state(struct task_struct *tsk) -{ - struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave; - int feature_bit = 0x2; - u64 xstate_bv; - - if (!fx) - return; - - xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; - - /* - * None of the feature bits are in init state. So nothing else - * to do for us, as the memory layout is up to date. - */ - if ((xstate_bv & pcntxt_mask) == pcntxt_mask) - return; - - /* - * FP is in init state - */ - if (!(xstate_bv & XSTATE_FP)) { - fx->cwd = 0x37f; - fx->swd = 0; - fx->twd = 0; - fx->fop = 0; - fx->rip = 0; - fx->rdp = 0; - memset(&fx->st_space[0], 0, 128); - } - - /* - * SSE is in init state - */ - if (!(xstate_bv & XSTATE_SSE)) - memset(&fx->xmm_space[0], 0, 256); - - xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2; - - /* - * Update all the other memory layouts for which the corresponding - * header bit is in the init state. - */ - while (xstate_bv) { - if (xstate_bv & 0x1) { - int offset = xstate_offsets[feature_bit]; - int size = xstate_sizes[feature_bit]; - - memcpy(((void *) fx) + offset, - ((void *) init_xstate_buf) + offset, - size); - } - - xstate_bv >>= 1; - feature_bit++; - } -} - -/* - * Check for the presence of extended state information in the - * user fpstate pointer in the sigcontext. - */ -static inline int check_for_xstate(struct i387_fxsave_struct __user *buf, - void __user *fpstate, - struct _fpx_sw_bytes *fx_sw) -{ - int min_xstate_size = sizeof(struct i387_fxsave_struct) + - sizeof(struct xsave_hdr_struct); - unsigned int magic2; - - if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw))) - return -1; - - /* Check for the first magic field and other error scenarios. */ - if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || - fx_sw->xstate_size < min_xstate_size || - fx_sw->xstate_size > xstate_size || - fx_sw->xstate_size > fx_sw->extended_size) - return -1; - - /* - * Check for the presence of second magic word at the end of memory - * layout. This detects the case where the user just copied the legacy - * fpstate layout with out copying the extended state information - * in the memory layout. - */ - if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size)) - || magic2 != FP_XSTATE_MAGIC2) - return -1; - - return 0; -} - -/* - * Signal frame handlers. - */ -static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) -{ - if (use_fxsr()) { - struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; - struct user_i387_ia32_struct env; - struct _fpstate_ia32 __user *fp = buf; - - convert_from_fxsr(&env, tsk); - - if (__copy_to_user(buf, &env, sizeof(env)) || - __put_user(xsave->i387.swd, &fp->status) || - __put_user(X86_FXSR_MAGIC, &fp->magic)) - return -1; - } else { - struct i387_fsave_struct __user *fp = buf; - u32 swd; - if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status)) - return -1; - } - - return 0; -} - -static inline int save_xstate_epilog(void __user *buf, int ia32_frame) -{ - struct xsave_struct __user *x = buf; - struct _fpx_sw_bytes *sw_bytes; - u32 xstate_bv; - int err; - - /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ - sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved; - err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes)); - - if (!use_xsave()) - return err; - - err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size)); - - /* - * Read the xstate_bv which we copied (directly from the cpu or - * from the state in task struct) to the user buffers. - */ - err |= __get_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv); - - /* - * For legacy compatible, we always set FP/SSE bits in the bit - * vector while saving the state to the user context. This will - * enable us capturing any changes(during sigreturn) to - * the FP/SSE bits by the legacy applications which don't touch - * xstate_bv in the xsave header. - * - * xsave aware apps can change the xstate_bv in the xsave - * header as well as change any contents in the memory layout. - * xrestore as part of sigreturn will capture all the changes. - */ - xstate_bv |= XSTATE_FPSSE; - - err |= __put_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv); - - return err; -} - -static inline int save_user_xstate(struct xsave_struct __user *buf) -{ - int err; - - if (use_xsave()) - err = xsave_user(buf); - else if (use_fxsr()) - err = fxsave_user((struct i387_fxsave_struct __user *) buf); - else - err = fsave_user((struct i387_fsave_struct __user *) buf); - - if (unlikely(err) && __clear_user(buf, xstate_size)) - err = -EFAULT; - return err; -} - -/* - * Save the fpu, extended register state to the user signal frame. - * - * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save - * state is copied. - * 'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'. - * - * buf == buf_fx for 64-bit frames and 32-bit fsave frame. - * buf != buf_fx for 32-bit frames with fxstate. - * - * If the fpu, extended register state is live, save the state directly - * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise, - * copy the thread's fpu state to the user frame starting at 'buf_fx'. - * - * If this is a 32-bit frame with fxstate, put a fsave header before - * the aligned state at 'buf_fx'. - * - * For [f]xsave state, update the SW reserved fields in the [f]xsave frame - * indicating the absence/presence of the extended state to the user. - */ -int save_xstate_sig(void __user *buf, void __user *buf_fx, int size) -{ - struct xsave_struct *xsave = ¤t->thread.fpu.state->xsave; - struct task_struct *tsk = current; - int ia32_fxstate = (buf != buf_fx); - - ia32_fxstate &= (config_enabled(CONFIG_X86_32) || - config_enabled(CONFIG_IA32_EMULATION)); - - if (!access_ok(VERIFY_WRITE, buf, size)) - return -EACCES; - - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_get(current, NULL, 0, - sizeof(struct user_i387_ia32_struct), NULL, - (struct _fpstate_ia32 __user *) buf) ? -1 : 1; - - if (user_has_fpu()) { - /* Save the live register state to the user directly. */ - if (save_user_xstate(buf_fx)) - return -1; - /* Update the thread's fxstate to save the fsave header. */ - if (ia32_fxstate) - fpu_fxsave(&tsk->thread.fpu); - } else { - sanitize_i387_state(tsk); - if (__copy_to_user(buf_fx, xsave, xstate_size)) - return -1; - } - - /* Save the fsave header for the 32-bit frames. */ - if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) - return -1; - - if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) - return -1; - - return 0; -} - -static inline void -sanitize_restored_xstate(struct task_struct *tsk, - struct user_i387_ia32_struct *ia32_env, - u64 xstate_bv, int fx_only) -{ - struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; - struct xsave_hdr_struct *xsave_hdr = &xsave->xsave_hdr; - - if (use_xsave()) { - /* These bits must be zero. */ - memset(xsave_hdr->reserved, 0, 48); - - /* - * Init the state that is not present in the memory - * layout and not enabled by the OS. - */ - if (fx_only) - xsave_hdr->xstate_bv = XSTATE_FPSSE; - else - xsave_hdr->xstate_bv &= (pcntxt_mask & xstate_bv); - } - - if (use_fxsr()) { - /* - * mscsr reserved bits must be masked to zero for security - * reasons. - */ - xsave->i387.mxcsr &= mxcsr_feature_mask; - - convert_to_fxsr(tsk, ia32_env); - } -} - -/* - * Restore the extended state if present. Otherwise, restore the FP/SSE state. - */ -static inline int restore_user_xstate(void __user *buf, u64 xbv, int fx_only) -{ - if (use_xsave()) { - if ((unsigned long)buf % 64 || fx_only) { - u64 init_bv = pcntxt_mask & ~XSTATE_FPSSE; - xrstor_state(init_xstate_buf, init_bv); - return fxrstor_user(buf); - } else { - u64 init_bv = pcntxt_mask & ~xbv; - if (unlikely(init_bv)) - xrstor_state(init_xstate_buf, init_bv); - return xrestore_user(buf, xbv); - } - } else if (use_fxsr()) { - return fxrstor_user(buf); - } else - return frstor_user(buf); -} - -int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) -{ - int ia32_fxstate = (buf != buf_fx); - struct task_struct *tsk = current; - int state_size = xstate_size; - u64 xstate_bv = 0; - int fx_only = 0; - - ia32_fxstate &= (config_enabled(CONFIG_X86_32) || - config_enabled(CONFIG_IA32_EMULATION)); - - if (!buf) { - fpu_reset_state(tsk); - return 0; - } - - if (!access_ok(VERIFY_READ, buf, size)) - return -EACCES; - - if (!used_math() && init_fpu(tsk)) - return -1; - - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_set(current, NULL, - 0, sizeof(struct user_i387_ia32_struct), - NULL, buf) != 0; - - if (use_xsave()) { - struct _fpx_sw_bytes fx_sw_user; - if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) { - /* - * Couldn't find the extended state information in the - * memory layout. Restore just the FP/SSE and init all - * the other extended state. - */ - state_size = sizeof(struct i387_fxsave_struct); - fx_only = 1; - } else { - state_size = fx_sw_user.xstate_size; - xstate_bv = fx_sw_user.xstate_bv; - } - } - - if (ia32_fxstate) { - /* - * For 32-bit frames with fxstate, copy the user state to the - * thread's fpu state, reconstruct fxstate from the fsave - * header. Sanitize the copied state etc. - */ - struct fpu *fpu = &tsk->thread.fpu; - struct user_i387_ia32_struct env; - int err = 0; - - /* - * Drop the current fpu which clears used_math(). This ensures - * that any context-switch during the copy of the new state, - * avoids the intermediate state from getting restored/saved. - * Thus avoiding the new restored state from getting corrupted. - * We will be ready to restore/save the state only after - * set_used_math() is again set. - */ - drop_fpu(tsk); - - if (__copy_from_user(&fpu->state->xsave, buf_fx, state_size) || - __copy_from_user(&env, buf, sizeof(env))) { - fpu_finit(fpu); - err = -1; - } else { - sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only); - } - - set_used_math(); - if (use_eager_fpu()) { - preempt_disable(); - math_state_restore(); - preempt_enable(); - } - - return err; - } else { - /* - * For 64-bit frames and 32-bit fsave frames, restore the user - * state to the registers directly (with exceptions handled). - */ - user_fpu_begin(); - if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) { - fpu_reset_state(tsk); - return -1; - } - } - - return 0; -} - -/* - * Prepare the SW reserved portion of the fxsave memory layout, indicating - * the presence of the extended state information in the memory layout - * pointed by the fpstate pointer in the sigcontext. - * This will be saved when ever the FP and extended state context is - * saved on the user stack during the signal handler delivery to the user. - */ -static void prepare_fx_sw_frame(void) -{ - int fsave_header_size = sizeof(struct i387_fsave_struct); - int size = xstate_size + FP_XSTATE_MAGIC2_SIZE; - - if (config_enabled(CONFIG_X86_32)) - size += fsave_header_size; - - fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; - fx_sw_reserved.extended_size = size; - fx_sw_reserved.xstate_bv = pcntxt_mask; - fx_sw_reserved.xstate_size = xstate_size; - - if (config_enabled(CONFIG_IA32_EMULATION)) { - fx_sw_reserved_ia32 = fx_sw_reserved; - fx_sw_reserved_ia32.extended_size += fsave_header_size; - } -} - -/* - * Enable the extended processor state save/restore feature - */ -static inline void xstate_enable(void) -{ - cr4_set_bits(X86_CR4_OSXSAVE); - xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); -} - -/* - * Record the offsets and sizes of different state managed by the xsave - * memory layout. - */ -static void __init setup_xstate_features(void) -{ - int eax, ebx, ecx, edx, leaf = 0x2; - - xstate_features = fls64(pcntxt_mask); - xstate_offsets = alloc_bootmem(xstate_features * sizeof(int)); - xstate_sizes = alloc_bootmem(xstate_features * sizeof(int)); - - do { - cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx); - - if (eax == 0) - break; - - xstate_offsets[leaf] = ebx; - xstate_sizes[leaf] = eax; - - leaf++; - } while (1); -} - -/* - * This function sets up offsets and sizes of all extended states in - * xsave area. This supports both standard format and compacted format - * of the xsave aread. - * - * Input: void - * Output: void - */ -void setup_xstate_comp(void) -{ - unsigned int xstate_comp_sizes[sizeof(pcntxt_mask)*8]; - int i; - - /* - * The FP xstates and SSE xstates are legacy states. They are always - * in the fixed offsets in the xsave area in either compacted form - * or standard form. - */ - xstate_comp_offsets[0] = 0; - xstate_comp_offsets[1] = offsetof(struct i387_fxsave_struct, xmm_space); - - if (!cpu_has_xsaves) { - for (i = 2; i < xstate_features; i++) { - if (test_bit(i, (unsigned long *)&pcntxt_mask)) { - xstate_comp_offsets[i] = xstate_offsets[i]; - xstate_comp_sizes[i] = xstate_sizes[i]; - } - } - return; - } - - xstate_comp_offsets[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE; - - for (i = 2; i < xstate_features; i++) { - if (test_bit(i, (unsigned long *)&pcntxt_mask)) - xstate_comp_sizes[i] = xstate_sizes[i]; - else - xstate_comp_sizes[i] = 0; - - if (i > 2) - xstate_comp_offsets[i] = xstate_comp_offsets[i-1] - + xstate_comp_sizes[i-1]; - - } -} - -/* - * setup the xstate image representing the init state - */ -static void __init setup_init_fpu_buf(void) -{ - /* - * Setup init_xstate_buf to represent the init state of - * all the features managed by the xsave - */ - init_xstate_buf = alloc_bootmem_align(xstate_size, - __alignof__(struct xsave_struct)); - fx_finit(&init_xstate_buf->i387); - - if (!cpu_has_xsave) - return; - - setup_xstate_features(); - - if (cpu_has_xsaves) { - init_xstate_buf->xsave_hdr.xcomp_bv = - (u64)1 << 63 | pcntxt_mask; - init_xstate_buf->xsave_hdr.xstate_bv = pcntxt_mask; - } - - /* - * Init all the features state with header_bv being 0x0 - */ - xrstor_state_booting(init_xstate_buf, -1); - /* - * Dump the init state again. This is to identify the init state - * of any feature which is not represented by all zero's. - */ - xsave_state_booting(init_xstate_buf, -1); -} - -static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; -static int __init eager_fpu_setup(char *s) -{ - if (!strcmp(s, "on")) - eagerfpu = ENABLE; - else if (!strcmp(s, "off")) - eagerfpu = DISABLE; - else if (!strcmp(s, "auto")) - eagerfpu = AUTO; - return 1; -} -__setup("eagerfpu=", eager_fpu_setup); - - -/* - * Calculate total size of enabled xstates in XCR0/pcntxt_mask. - */ -static void __init init_xstate_size(void) -{ - unsigned int eax, ebx, ecx, edx; - int i; - - if (!cpu_has_xsaves) { - cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - xstate_size = ebx; - return; - } - - xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; - for (i = 2; i < 64; i++) { - if (test_bit(i, (unsigned long *)&pcntxt_mask)) { - cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); - xstate_size += eax; - } - } -} - -/* - * Enable and initialize the xsave feature. - */ -static void __init xstate_enable_boot_cpu(void) -{ - unsigned int eax, ebx, ecx, edx; - - if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { - WARN(1, KERN_ERR "XSTATE_CPUID missing\n"); - return; - } - - cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - pcntxt_mask = eax + ((u64)edx << 32); - - if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { - pr_err("FP/SSE not shown under xsave features 0x%llx\n", - pcntxt_mask); - BUG(); - } - - /* - * Support only the state known to OS. - */ - pcntxt_mask = pcntxt_mask & XCNTXT_MASK; - - xstate_enable(); - - /* - * Recompute the context size for enabled features - */ - init_xstate_size(); - - update_regset_xstate_info(xstate_size, pcntxt_mask); - prepare_fx_sw_frame(); - setup_init_fpu_buf(); - - /* Auto enable eagerfpu for xsaveopt */ - if (cpu_has_xsaveopt && eagerfpu != DISABLE) - eagerfpu = ENABLE; - - if (pcntxt_mask & XSTATE_EAGER) { - if (eagerfpu == DISABLE) { - pr_err("eagerfpu not present, disabling some xstate features: 0x%llx\n", - pcntxt_mask & XSTATE_EAGER); - pcntxt_mask &= ~XSTATE_EAGER; - } else { - eagerfpu = ENABLE; - } - } - - pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x using %s\n", - pcntxt_mask, xstate_size, - cpu_has_xsaves ? "compacted form" : "standard form"); -} - -/* - * For the very first instance, this calls xstate_enable_boot_cpu(); - * for all subsequent instances, this calls xstate_enable(). - * - * This is somewhat obfuscated due to the lack of powerful enough - * overrides for the section checks. - */ -void xsave_init(void) -{ - static __refdata void (*next_func)(void) = xstate_enable_boot_cpu; - void (*this_func)(void); - - if (!cpu_has_xsave) - return; - - this_func = next_func; - next_func = xstate_enable; - this_func(); -} - -/* - * setup_init_fpu_buf() is __init and it is OK to call it here because - * init_xstate_buf will be unset only once during boot. - */ -void __init_refok eager_fpu_init(void) -{ - WARN_ON(used_math()); - current_thread_info()->status = 0; - - if (eagerfpu == ENABLE) - setup_force_cpu_cap(X86_FEATURE_EAGER_FPU); - - if (!cpu_has_eager_fpu) { - stts(); - return; - } - - if (!init_xstate_buf) - setup_init_fpu_buf(); -} - -/* - * Given the xsave area and a state inside, this function returns the - * address of the state. - * - * This is the API that is called to get xstate address in either - * standard format or compacted format of xsave area. - * - * Inputs: - * xsave: base address of the xsave area; - * xstate: state which is defined in xsave.h (e.g. XSTATE_FP, XSTATE_SSE, - * etc.) - * Output: - * address of the state in the xsave area. - */ -void *get_xsave_addr(struct xsave_struct *xsave, int xstate) -{ - int feature = fls64(xstate) - 1; - if (!test_bit(feature, (unsigned long *)&pcntxt_mask)) - return NULL; - - return (void *)xsave + xstate_comp_offsets[feature]; -} -EXPORT_SYMBOL_GPL(get_xsave_addr); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 1d08ad3582d0..9f705e618af5 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -16,10 +16,8 @@ #include <linux/module.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> -#include <asm/i387.h> /* For use_eager_fpu. Ugh! */ -#include <asm/fpu-internal.h> /* For use_eager_fpu. Ugh! */ #include <asm/user.h> -#include <asm/xsave.h> +#include <asm/fpu/xstate.h> #include "cpuid.h" #include "lapic.h" #include "mmu.h" diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2d73807f0d31..e11dd59398f1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -40,8 +40,7 @@ #include <asm/vmx.h> #include <asm/virtext.h> #include <asm/mce.h> -#include <asm/i387.h> -#include <asm/xcr.h> +#include <asm/fpu/internal.h> #include <asm/perf_event.h> #include <asm/debugreg.h> #include <asm/kexec.h> @@ -1883,7 +1882,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) * If the FPU is not active (through the host task or * the guest vcpu), then restore the cr0.TS bit. */ - if (!user_has_fpu() && !vmx->vcpu.guest_fpu_loaded) + if (!fpregs_active() && !vmx->vcpu.guest_fpu_loaded) stts(); load_gdt(this_cpu_ptr(&host_gdt)); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ea306adbbc13..989cfc01e2a5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -59,9 +59,8 @@ #include <asm/desc.h> #include <asm/mtrr.h> #include <asm/mce.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> /* Ugh! */ -#include <asm/xcr.h> +#include <linux/kernel_stat.h> +#include <asm/fpu/internal.h> /* Ugh! */ #include <asm/pvclock.h> #include <asm/div64.h> @@ -3194,8 +3193,8 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) { - struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave; - u64 xstate_bv = xsave->xsave_hdr.xstate_bv; + struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave; + u64 xstate_bv = xsave->header.xfeatures; u64 valid; /* @@ -3230,7 +3229,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) { - struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave; + struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave; u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET); u64 valid; @@ -3241,9 +3240,9 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) memcpy(xsave, src, XSAVE_HDR_OFFSET); /* Set XSTATE_BV and possibly XCOMP_BV. */ - xsave->xsave_hdr.xstate_bv = xstate_bv; + xsave->header.xfeatures = xstate_bv; if (cpu_has_xsaves) - xsave->xsave_hdr.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; + xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; /* * Copy each region from the non-compacted offset to the @@ -3275,8 +3274,8 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, fill_xsave((u8 *) guest_xsave->region, vcpu); } else { memcpy(guest_xsave->region, - &vcpu->arch.guest_fpu.state->fxsave, - sizeof(struct i387_fxsave_struct)); + &vcpu->arch.guest_fpu.state.fxsave, + sizeof(struct fxregs_state)); *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = XSTATE_FPSSE; } @@ -3300,8 +3299,8 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, } else { if (xstate_bv & ~XSTATE_FPSSE) return -EINVAL; - memcpy(&vcpu->arch.guest_fpu.state->fxsave, - guest_xsave->region, sizeof(struct i387_fxsave_struct)); + memcpy(&vcpu->arch.guest_fpu.state.fxsave, + guest_xsave->region, sizeof(struct fxregs_state)); } return 0; } @@ -6597,11 +6596,11 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { + struct fpu *fpu = ¤t->thread.fpu; int r; sigset_t sigsaved; - if (!tsk_used_math(current) && init_fpu(current)) - return -ENOMEM; + fpu__activate_curr(fpu); if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); @@ -6971,8 +6970,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct i387_fxsave_struct *fxsave = - &vcpu->arch.guest_fpu.state->fxsave; + struct fxregs_state *fxsave = + &vcpu->arch.guest_fpu.state.fxsave; memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; @@ -6988,8 +6987,8 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct i387_fxsave_struct *fxsave = - &vcpu->arch.guest_fpu.state->fxsave; + struct fxregs_state *fxsave = + &vcpu->arch.guest_fpu.state.fxsave; memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; @@ -7003,17 +7002,11 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) return 0; } -int fx_init(struct kvm_vcpu *vcpu) +static void fx_init(struct kvm_vcpu *vcpu) { - int err; - - err = fpu_alloc(&vcpu->arch.guest_fpu); - if (err) - return err; - - fpu_finit(&vcpu->arch.guest_fpu); + fpstate_init(&vcpu->arch.guest_fpu.state); if (cpu_has_xsaves) - vcpu->arch.guest_fpu.state->xsave.xsave_hdr.xcomp_bv = + vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; /* @@ -7022,14 +7015,6 @@ int fx_init(struct kvm_vcpu *vcpu) vcpu->arch.xcr0 = XSTATE_FP; vcpu->arch.cr0 |= X86_CR0_ET; - - return 0; -} -EXPORT_SYMBOL_GPL(fx_init); - -static void fx_free(struct kvm_vcpu *vcpu) -{ - fpu_free(&vcpu->arch.guest_fpu); } void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) @@ -7045,7 +7030,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) kvm_put_guest_xcr0(vcpu); vcpu->guest_fpu_loaded = 1; __kernel_fpu_begin(); - fpu_restore_checking(&vcpu->arch.guest_fpu); + __copy_fpstate_to_fpregs(&vcpu->arch.guest_fpu); trace_kvm_fpu(1); } @@ -7057,7 +7042,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) return; vcpu->guest_fpu_loaded = 0; - fpu_save_init(&vcpu->arch.guest_fpu); + copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); __kernel_fpu_end(); ++vcpu->stat.fpu_reload; if (!vcpu->arch.eager_fpu) @@ -7071,7 +7056,6 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) kvmclock_reset(vcpu); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); - fx_free(vcpu); kvm_x86_ops->vcpu_free(vcpu); } @@ -7137,7 +7121,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); vcpu_put(vcpu); - fx_free(vcpu); kvm_x86_ops->vcpu_free(vcpu); } @@ -7363,9 +7346,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_free_mce_banks; } - r = fx_init(vcpu); - if (r) - goto fail_free_wbinvd_dirty_mask; + fx_init(vcpu); vcpu->arch.ia32_tsc_adjust_msr = 0x0; vcpu->arch.pv_time_enabled = false; @@ -7379,8 +7360,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) kvm_pmu_init(vcpu); return 0; -fail_free_wbinvd_dirty_mask: - free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); + fail_free_mce_banks: kfree(vcpu->arch.mce_banks); fail_free_lapic: diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 8f9a133cc099..f2dc08c003eb 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -70,7 +70,7 @@ #include <asm/e820.h> #include <asm/mce.h> #include <asm/io.h> -#include <asm/i387.h> +#include <asm/fpu/api.h> #include <asm/stackprotector.h> #include <asm/reboot.h> /* for struct machine_ops */ #include <asm/kvm_para.h> @@ -90,7 +90,7 @@ struct lguest_data lguest_data = { .noirq_iret = (u32)lguest_noirq_iret, .kernel_address = PAGE_OFFSET, .blocked_interrupts = { 1 }, /* Block timer interrupts */ - .syscall_vec = SYSCALL_VECTOR, + .syscall_vec = IA32_SYSCALL_VECTOR, }; /*G:037 @@ -866,7 +866,7 @@ static void __init lguest_init_IRQ(void) for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) { /* Some systems map "vectors" to interrupts weirdly. Not us! */ __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR); - if (i != SYSCALL_VECTOR) + if (i != IA32_SYSCALL_VECTOR) set_intr_gate(i, irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR)); } diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 1530afb07c85..982989d282ff 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -40,6 +40,6 @@ else lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o lib-y += clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o - lib-y += copy_user_64.o copy_user_nocache_64.o + lib-y += copy_user_64.o lib-y += cmpxchg16b_emu.o endif diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index fa997dfaef24..e4b3beee83bd 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -16,30 +16,6 @@ #include <asm/asm.h> #include <asm/smap.h> - .macro ALIGN_DESTINATION - /* check for bad alignment of destination */ - movl %edi,%ecx - andl $7,%ecx - jz 102f /* already aligned */ - subl $8,%ecx - negl %ecx - subl %ecx,%edx -100: movb (%rsi),%al -101: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz 100b -102: - .section .fixup,"ax" -103: addl %ecx,%edx /* ecx is zerorest also */ - jmp copy_user_handle_tail - .previous - - _ASM_EXTABLE(100b,103b) - _ASM_EXTABLE(101b,103b) - .endm - /* Standard copy_to_user with segment limit checking */ ENTRY(_copy_to_user) CFI_STARTPROC @@ -266,3 +242,95 @@ ENTRY(copy_user_enhanced_fast_string) _ASM_EXTABLE(1b,12b) CFI_ENDPROC ENDPROC(copy_user_enhanced_fast_string) + +/* + * copy_user_nocache - Uncached memory copy with exception handling + * This will force destination/source out of cache for more performance. + */ +ENTRY(__copy_user_nocache) + CFI_STARTPROC + ASM_STAC + cmpl $8,%edx + jb 20f /* less then 8 bytes, go to byte copy loop */ + ALIGN_DESTINATION + movl %edx,%ecx + andl $63,%edx + shrl $6,%ecx + jz 17f +1: movq (%rsi),%r8 +2: movq 1*8(%rsi),%r9 +3: movq 2*8(%rsi),%r10 +4: movq 3*8(%rsi),%r11 +5: movnti %r8,(%rdi) +6: movnti %r9,1*8(%rdi) +7: movnti %r10,2*8(%rdi) +8: movnti %r11,3*8(%rdi) +9: movq 4*8(%rsi),%r8 +10: movq 5*8(%rsi),%r9 +11: movq 6*8(%rsi),%r10 +12: movq 7*8(%rsi),%r11 +13: movnti %r8,4*8(%rdi) +14: movnti %r9,5*8(%rdi) +15: movnti %r10,6*8(%rdi) +16: movnti %r11,7*8(%rdi) + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + decl %ecx + jnz 1b +17: movl %edx,%ecx + andl $7,%edx + shrl $3,%ecx + jz 20f +18: movq (%rsi),%r8 +19: movnti %r8,(%rdi) + leaq 8(%rsi),%rsi + leaq 8(%rdi),%rdi + decl %ecx + jnz 18b +20: andl %edx,%edx + jz 23f + movl %edx,%ecx +21: movb (%rsi),%al +22: movb %al,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz 21b +23: xorl %eax,%eax + ASM_CLAC + sfence + ret + + .section .fixup,"ax" +30: shll $6,%ecx + addl %ecx,%edx + jmp 60f +40: lea (%rdx,%rcx,8),%rdx + jmp 60f +50: movl %ecx,%edx +60: sfence + jmp copy_user_handle_tail + .previous + + _ASM_EXTABLE(1b,30b) + _ASM_EXTABLE(2b,30b) + _ASM_EXTABLE(3b,30b) + _ASM_EXTABLE(4b,30b) + _ASM_EXTABLE(5b,30b) + _ASM_EXTABLE(6b,30b) + _ASM_EXTABLE(7b,30b) + _ASM_EXTABLE(8b,30b) + _ASM_EXTABLE(9b,30b) + _ASM_EXTABLE(10b,30b) + _ASM_EXTABLE(11b,30b) + _ASM_EXTABLE(12b,30b) + _ASM_EXTABLE(13b,30b) + _ASM_EXTABLE(14b,30b) + _ASM_EXTABLE(15b,30b) + _ASM_EXTABLE(16b,30b) + _ASM_EXTABLE(18b,40b) + _ASM_EXTABLE(19b,40b) + _ASM_EXTABLE(21b,50b) + _ASM_EXTABLE(22b,50b) + CFI_ENDPROC +ENDPROC(__copy_user_nocache) diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S deleted file mode 100644 index 6a4f43c2d9e6..000000000000 --- a/arch/x86/lib/copy_user_nocache_64.S +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com> - * Copyright 2002 Andi Kleen, SuSE Labs. - * Subject to the GNU Public License v2. - * - * Functions to copy from and to user space. - */ - -#include <linux/linkage.h> -#include <asm/dwarf2.h> - -#define FIX_ALIGNMENT 1 - -#include <asm/current.h> -#include <asm/asm-offsets.h> -#include <asm/thread_info.h> -#include <asm/asm.h> -#include <asm/smap.h> - - .macro ALIGN_DESTINATION -#ifdef FIX_ALIGNMENT - /* check for bad alignment of destination */ - movl %edi,%ecx - andl $7,%ecx - jz 102f /* already aligned */ - subl $8,%ecx - negl %ecx - subl %ecx,%edx -100: movb (%rsi),%al -101: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz 100b -102: - .section .fixup,"ax" -103: addl %ecx,%edx /* ecx is zerorest also */ - jmp copy_user_handle_tail - .previous - - _ASM_EXTABLE(100b,103b) - _ASM_EXTABLE(101b,103b) -#endif - .endm - -/* - * copy_user_nocache - Uncached memory copy with exception handling - * This will force destination/source out of cache for more performance. - */ -ENTRY(__copy_user_nocache) - CFI_STARTPROC - ASM_STAC - cmpl $8,%edx - jb 20f /* less then 8 bytes, go to byte copy loop */ - ALIGN_DESTINATION - movl %edx,%ecx - andl $63,%edx - shrl $6,%ecx - jz 17f -1: movq (%rsi),%r8 -2: movq 1*8(%rsi),%r9 -3: movq 2*8(%rsi),%r10 -4: movq 3*8(%rsi),%r11 -5: movnti %r8,(%rdi) -6: movnti %r9,1*8(%rdi) -7: movnti %r10,2*8(%rdi) -8: movnti %r11,3*8(%rdi) -9: movq 4*8(%rsi),%r8 -10: movq 5*8(%rsi),%r9 -11: movq 6*8(%rsi),%r10 -12: movq 7*8(%rsi),%r11 -13: movnti %r8,4*8(%rdi) -14: movnti %r9,5*8(%rdi) -15: movnti %r10,6*8(%rdi) -16: movnti %r11,7*8(%rdi) - leaq 64(%rsi),%rsi - leaq 64(%rdi),%rdi - decl %ecx - jnz 1b -17: movl %edx,%ecx - andl $7,%edx - shrl $3,%ecx - jz 20f -18: movq (%rsi),%r8 -19: movnti %r8,(%rdi) - leaq 8(%rsi),%rsi - leaq 8(%rdi),%rdi - decl %ecx - jnz 18b -20: andl %edx,%edx - jz 23f - movl %edx,%ecx -21: movb (%rsi),%al -22: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz 21b -23: xorl %eax,%eax - ASM_CLAC - sfence - ret - - .section .fixup,"ax" -30: shll $6,%ecx - addl %ecx,%edx - jmp 60f -40: lea (%rdx,%rcx,8),%rdx - jmp 60f -50: movl %ecx,%edx -60: sfence - jmp copy_user_handle_tail - .previous - - _ASM_EXTABLE(1b,30b) - _ASM_EXTABLE(2b,30b) - _ASM_EXTABLE(3b,30b) - _ASM_EXTABLE(4b,30b) - _ASM_EXTABLE(5b,30b) - _ASM_EXTABLE(6b,30b) - _ASM_EXTABLE(7b,30b) - _ASM_EXTABLE(8b,30b) - _ASM_EXTABLE(9b,30b) - _ASM_EXTABLE(10b,30b) - _ASM_EXTABLE(11b,30b) - _ASM_EXTABLE(12b,30b) - _ASM_EXTABLE(13b,30b) - _ASM_EXTABLE(14b,30b) - _ASM_EXTABLE(15b,30b) - _ASM_EXTABLE(16b,30b) - _ASM_EXTABLE(18b,40b) - _ASM_EXTABLE(19b,40b) - _ASM_EXTABLE(21b,50b) - _ASM_EXTABLE(22b,50b) - CFI_ENDPROC -ENDPROC(__copy_user_nocache) diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c index c9f2d9ba8dd8..e5e3ed8dc079 100644 --- a/arch/x86/lib/mmx_32.c +++ b/arch/x86/lib/mmx_32.c @@ -22,7 +22,7 @@ #include <linux/sched.h> #include <linux/types.h> -#include <asm/i387.h> +#include <asm/fpu/api.h> #include <asm/asm.h> void *_mmx_memcpy(void *to, const void *from, size_t len) diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index e2f5e21c03b3..91d93b95bd86 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -647,7 +647,8 @@ EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero); * @from: Source address, in kernel space. * @n: Number of bytes to copy. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Copy data from kernel space to user space. * @@ -668,7 +669,8 @@ EXPORT_SYMBOL(_copy_to_user); * @from: Source address, in user space. * @n: Number of bytes to copy. * - * Context: User context only. This function may sleep. + * Context: User context only. This function may sleep if pagefaults are + * enabled. * * Copy data from user space to kernel space. * diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c index dc8adad10a2f..dd76a05729b0 100644 --- a/arch/x86/math-emu/fpu_aux.c +++ b/arch/x86/math-emu/fpu_aux.c @@ -30,7 +30,7 @@ static void fclex(void) } /* Needs to be externally visible */ -void finit_soft_fpu(struct i387_soft_struct *soft) +void fpstate_init_soft(struct swregs_state *soft) { struct address *oaddr, *iaddr; memset(soft, 0, sizeof(*soft)); @@ -52,7 +52,7 @@ void finit_soft_fpu(struct i387_soft_struct *soft) void finit(void) { - finit_soft_fpu(¤t->thread.fpu.state->soft); + fpstate_init_soft(¤t->thread.fpu.state.soft); } /* diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 9b868124128d..f37e84ab49f3 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -31,7 +31,7 @@ #include <asm/traps.h> #include <asm/desc.h> #include <asm/user.h> -#include <asm/i387.h> +#include <asm/fpu/internal.h> #include "fpu_system.h" #include "fpu_emu.h" @@ -147,13 +147,9 @@ void math_emulate(struct math_emu_info *info) unsigned long code_base = 0; unsigned long code_limit = 0; /* Initialized to stop compiler warnings */ struct desc_struct code_descriptor; + struct fpu *fpu = ¤t->thread.fpu; - if (!used_math()) { - if (init_fpu(current)) { - do_group_exit(SIGKILL); - return; - } - } + fpu__activate_curr(fpu); #ifdef RE_ENTRANT_CHECKING if (emulating) { @@ -673,7 +669,7 @@ void math_abort(struct math_emu_info *info, unsigned int signal) #endif /* PARANOID */ } -#define S387 ((struct i387_soft_struct *)s387) +#define S387 ((struct swregs_state *)s387) #define sstatus_word() \ ((S387->swd & ~SW_Top & 0xffff) | ((S387->ftop << SW_Top_Shift) & SW_Top)) @@ -682,14 +678,14 @@ int fpregs_soft_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct i387_soft_struct *s387 = &target->thread.fpu.state->soft; + struct swregs_state *s387 = &target->thread.fpu.state.soft; void *space = s387->st_space; int ret; int offset, other, i, tags, regnr, tag, newtop; RE_ENTRANT_CHECK_OFF; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, s387, 0, - offsetof(struct i387_soft_struct, st_space)); + offsetof(struct swregs_state, st_space)); RE_ENTRANT_CHECK_ON; if (ret) @@ -734,7 +730,7 @@ int fpregs_soft_get(struct task_struct *target, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - struct i387_soft_struct *s387 = &target->thread.fpu.state->soft; + struct swregs_state *s387 = &target->thread.fpu.state.soft; const void *space = s387->st_space; int ret; int offset = (S387->ftop & 7) * 10, other = 80 - offset; @@ -752,7 +748,7 @@ int fpregs_soft_get(struct task_struct *target, #endif /* PECULIAR_486 */ ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, s387, 0, - offsetof(struct i387_soft_struct, st_space)); + offsetof(struct swregs_state, st_space)); /* Copy all registers in stack order. */ if (!ret) diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h index 2c614410a5f3..9ccecb61a4fa 100644 --- a/arch/x86/math-emu/fpu_system.h +++ b/arch/x86/math-emu/fpu_system.h @@ -31,7 +31,7 @@ #define SEG_EXPAND_DOWN(s) (((s).b & ((1 << 11) | (1 << 10))) \ == (1 << 10)) -#define I387 (current->thread.fpu.state) +#define I387 (¤t->thread.fpu.state) #define FPU_info (I387->soft.info) #define FPU_CS (*(unsigned short *) &(FPU_info->regs->cs)) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 181c53bac3a7..9dc909841739 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -13,6 +13,7 @@ #include <linux/hugetlb.h> /* hstate_index_to_shift */ #include <linux/prefetch.h> /* prefetchw */ #include <linux/context_tracking.h> /* exception_enter(), ... */ +#include <linux/uaccess.h> /* faulthandler_disabled() */ #include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/pgalloc.h> /* pgd_*(), ... */ @@ -1126,9 +1127,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, /* * If we're in an interrupt, have no user context or are running - * in an atomic region then we must not take the fault: + * in a region with pagefaults disabled then we must not take the fault */ - if (unlikely(in_atomic() || !mm)) { + if (unlikely(faulthandler_disabled() || !mm)) { bad_area_nosemaphore(regs, error_code, address); return; } diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 4500142bc4aa..eecb207a2037 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -35,7 +35,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) unsigned long vaddr; int idx, type; - /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) @@ -100,6 +100,7 @@ void __kunmap_atomic(void *kvaddr) #endif pagefault_enable(); + preempt_enable(); } EXPORT_SYMBOL(__kunmap_atomic); diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 9ca35fc60cfe..2b7ece0e103a 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -59,6 +59,7 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) unsigned long vaddr; int idx, type; + preempt_disable(); pagefault_disable(); type = kmap_atomic_idx_push(); @@ -117,5 +118,6 @@ iounmap_atomic(void __iomem *kvaddr) } pagefault_enable(); + preempt_enable(); } EXPORT_SYMBOL_GPL(iounmap_atomic); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 70e7444c6835..f8ced3d36eb7 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -237,7 +237,8 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) * pat_enabled ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS; * * Till we fix all X drivers to use ioremap_wc(), we will use - * UC MINUS. + * UC MINUS. Drivers that are certain they need or can already + * be converted over to strong UC can use ioremap_uc(). */ enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS; @@ -247,6 +248,39 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) EXPORT_SYMBOL(ioremap_nocache); /** + * ioremap_uc - map bus memory into CPU space as strongly uncachable + * @phys_addr: bus address of the memory + * @size: size of the resource to map + * + * ioremap_uc performs a platform specific sequence of operations to + * make bus memory CPU accessible via the readb/readw/readl/writeb/ + * writew/writel functions and the other mmio helpers. The returned + * address is not guaranteed to be usable directly as a virtual + * address. + * + * This version of ioremap ensures that the memory is marked with a strong + * preference as completely uncachable on the CPU when possible. For non-PAT + * systems this ends up setting page-attribute flags PCD=1, PWT=1. For PAT + * systems this will set the PAT entry for the pages as strong UC. This call + * will honor existing caching rules from things like the PCI bus. Note that + * there are other caches and buffers on many busses. In particular driver + * authors should read up on PCI writes. + * + * It's useful if some control registers are in such an area and + * write combining or read caching is not desirable: + * + * Must be freed with iounmap. + */ +void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size) +{ + enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC; + + return __ioremap_caller(phys_addr, size, pcm, + __builtin_return_address(0)); +} +EXPORT_SYMBOL_GPL(ioremap_uc); + +/** * ioremap_wc - map memory into CPU space write combined * @phys_addr: bus address of the memory * @size: size of the resource to map @@ -353,18 +387,18 @@ void *xlate_dev_mem_ptr(phys_addr_t phys) { unsigned long start = phys & PAGE_MASK; unsigned long offset = phys & ~PAGE_MASK; - unsigned long vaddr; + void *vaddr; /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */ if (page_is_ram(start >> PAGE_SHIFT)) return __va(phys); - vaddr = (unsigned long)ioremap_cache(start, PAGE_SIZE); + vaddr = ioremap_cache(start, PAGE_SIZE); /* Only add the offset on success and return NULL if the ioremap() failed: */ if (vaddr) vaddr += offset; - return (void *)vaddr; + return vaddr; } void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) @@ -373,7 +407,6 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) return; iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK)); - return; } static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index c439ec478216..2e0dfd39bd22 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -10,13 +10,12 @@ #include <linux/syscalls.h> #include <linux/sched/sysctl.h> -#include <asm/i387.h> #include <asm/insn.h> #include <asm/mman.h> #include <asm/mmu_context.h> #include <asm/mpx.h> #include <asm/processor.h> -#include <asm/fpu-internal.h> +#include <asm/fpu/internal.h> static const char *mpx_mapping_name(struct vm_area_struct *vma) { @@ -273,7 +272,7 @@ bad_opcode: * The caller is expected to kfree() the returned siginfo_t. */ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, - struct xsave_struct *xsave_buf) + struct xregs_state *xsave_buf) { struct bndreg *bndregs, *bndreg; siginfo_t *info = NULL; @@ -358,8 +357,8 @@ static __user void *task_get_bounds_dir(struct task_struct *tsk) * The bounds directory pointer is stored in a register * only accessible if we first do an xsave. */ - fpu_save_init(&tsk->thread.fpu); - bndcsr = get_xsave_addr(&tsk->thread.fpu.state->xsave, XSTATE_BNDCSR); + copy_fpregs_to_fpstate(&tsk->thread.fpu); + bndcsr = get_xsave_addr(&tsk->thread.fpu.state.xsave, XSTATE_BNDCSR); if (!bndcsr) return MPX_INVALID_BOUNDS_DIR; @@ -390,7 +389,7 @@ int mpx_enable_management(struct task_struct *tsk) * directory into XSAVE/XRSTOR Save Area and enable MPX through * XRSTOR instruction. * - * fpu_xsave() is expected to be very expensive. Storing the bounds + * copy_xregs_to_kernel() is expected to be very expensive. Storing the bounds * directory here means that we do not have to do xsave in the unmap * path; we can just use mm->bd_addr instead. */ @@ -498,7 +497,7 @@ out_unmap: * bound table is 16KB. With 64-bit mode, the size of BD is 2GB, * and the size of each bound table is 4MB. */ -static int do_mpx_bt_fault(struct xsave_struct *xsave_buf) +static int do_mpx_bt_fault(struct xregs_state *xsave_buf) { unsigned long bd_entry, bd_base; struct bndcsr *bndcsr; @@ -526,7 +525,7 @@ static int do_mpx_bt_fault(struct xsave_struct *xsave_buf) return allocate_bt((long __user *)bd_entry); } -int mpx_handle_bd_fault(struct xsave_struct *xsave_buf) +int mpx_handle_bd_fault(struct xregs_state *xsave_buf) { /* * Userspace never asked us to manage the bounds tables, diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 89af288ec674..397838eb292b 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -129,16 +129,15 @@ within(unsigned long addr, unsigned long start, unsigned long end) */ void clflush_cache_range(void *vaddr, unsigned int size) { - void *vend = vaddr + size - 1; + unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1; + void *vend = vaddr + size; + void *p; mb(); - for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size) - clflushopt(vaddr); - /* - * Flush any possible final partial cacheline: - */ - clflushopt(vend); + for (p = (void *)((unsigned long)vaddr & ~clflush_mask); + p < vend; p += boot_cpu_data.x86_clflush_size) + clflushopt(p); mb(); } @@ -418,13 +417,11 @@ phys_addr_t slow_virt_to_phys(void *__virt_addr) phys_addr_t phys_addr; unsigned long offset; enum pg_level level; - unsigned long psize; unsigned long pmask; pte_t *pte; pte = lookup_address(virt_addr, &level); BUG_ON(!pte); - psize = page_level_size(level); pmask = page_level_mask(level); offset = virt_addr & ~pmask; phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; @@ -1468,6 +1465,9 @@ int _set_memory_uc(unsigned long addr, int numpages) { /* * for now UC MINUS. see comments in ioremap_nocache() + * If you really need strong UC use ioremap_uc(), but note + * that you cannot override IO areas with set_memory_*() as + * these helpers cannot work with IO memory. */ return change_page_attr_set(&addr, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 852aa4c92da0..27062303c881 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -208,6 +208,7 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, static int intel_mid_pci_irq_enable(struct pci_dev *dev) { + struct irq_alloc_info info; int polarity; if (dev->irq_managed && dev->irq > 0) @@ -217,14 +218,13 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) polarity = 0; /* active high */ else polarity = 1; /* active low */ + ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity); /* * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to * IOAPIC RTE entries, so we just enable RTE for the device. */ - if (mp_set_gsi_attr(dev->irq, 1, polarity, dev_to_node(&dev->dev))) - return -EBUSY; - if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC) < 0) + if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC, &info) < 0) return -EBUSY; dev->irq_managed = 1; diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 5dc6ca5e1741..9bd115484745 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -146,19 +146,20 @@ static void __init pirq_peer_trick(void) /* * Code for querying and setting of IRQ routes on various interrupt routers. + * PIC Edge/Level Control Registers (ELCR) 0x4d0 & 0x4d1. */ -void eisa_set_level_irq(unsigned int irq) +void elcr_set_level_irq(unsigned int irq) { unsigned char mask = 1 << (irq & 7); unsigned int port = 0x4d0 + (irq >> 3); unsigned char val; - static u16 eisa_irq_mask; + static u16 elcr_irq_mask; - if (irq >= 16 || (1 << irq) & eisa_irq_mask) + if (irq >= 16 || (1 << irq) & elcr_irq_mask) return; - eisa_irq_mask |= (1 << irq); + elcr_irq_mask |= (1 << irq); printk(KERN_DEBUG "PCI: setting IRQ %u as level-triggered\n", irq); val = inb(port); if (!(val & mask)) { @@ -965,11 +966,11 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \ ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) { msg = "found"; - eisa_set_level_irq(irq); + elcr_set_level_irq(irq); } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) { if (r->set(pirq_router_dev, dev, pirq, newirq)) { - eisa_set_level_irq(newirq); + elcr_set_level_irq(newirq); msg = "assigned"; irq = newirq; } diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index a62e0be3a2f1..f1a6c8e86ddd 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -1,4 +1,5 @@ # Platform specific code goes here +obj-y += atom/ obj-y += ce4100/ obj-y += efi/ obj-y += geode/ diff --git a/arch/x86/platform/atom/Makefile b/arch/x86/platform/atom/Makefile new file mode 100644 index 000000000000..0a3a40cbc794 --- /dev/null +++ b/arch/x86/platform/atom/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_PUNIT_ATOM_DEBUG) += punit_atom_debug.o diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c new file mode 100644 index 000000000000..5ca8ead91579 --- /dev/null +++ b/arch/x86/platform/atom/punit_atom_debug.c @@ -0,0 +1,183 @@ +/* + * Intel SOC Punit device state debug driver + * Punit controls power management for North Complex devices (Graphics + * blocks, Image Signal Processing, video processing, display, DSP etc.) + * + * Copyright (c) 2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/device.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> +#include <linux/io.h> +#include <asm/cpu_device_id.h> +#include <asm/iosf_mbi.h> + +/* Side band Interface port */ +#define PUNIT_PORT 0x04 +/* Power gate status reg */ +#define PWRGT_STATUS 0x61 +/* Subsystem config/status Video processor */ +#define VED_SS_PM0 0x32 +/* Subsystem config/status ISP (Image Signal Processor) */ +#define ISP_SS_PM0 0x39 +/* Subsystem config/status Input/output controller */ +#define MIO_SS_PM 0x3B +/* Shift bits for getting status for video, isp and i/o */ +#define SSS_SHIFT 24 +/* Shift bits for getting status for graphics rendering */ +#define RENDER_POS 0 +/* Shift bits for getting status for media control */ +#define MEDIA_POS 2 +/* Shift bits for getting status for Valley View/Baytrail display */ +#define VLV_DISPLAY_POS 6 +/* Subsystem config/status display for Cherry Trail SOC */ +#define CHT_DSP_SSS 0x36 +/* Shift bits for getting status for display */ +#define CHT_DSP_SSS_POS 16 + +struct punit_device { + char *name; + int reg; + int sss_pos; +}; + +static const struct punit_device punit_device_byt[] = { + { "GFX RENDER", PWRGT_STATUS, RENDER_POS }, + { "GFX MEDIA", PWRGT_STATUS, MEDIA_POS }, + { "DISPLAY", PWRGT_STATUS, VLV_DISPLAY_POS }, + { "VED", VED_SS_PM0, SSS_SHIFT }, + { "ISP", ISP_SS_PM0, SSS_SHIFT }, + { "MIO", MIO_SS_PM, SSS_SHIFT }, + { NULL } +}; + +static const struct punit_device punit_device_cht[] = { + { "GFX RENDER", PWRGT_STATUS, RENDER_POS }, + { "GFX MEDIA", PWRGT_STATUS, MEDIA_POS }, + { "DISPLAY", CHT_DSP_SSS, CHT_DSP_SSS_POS }, + { "VED", VED_SS_PM0, SSS_SHIFT }, + { "ISP", ISP_SS_PM0, SSS_SHIFT }, + { "MIO", MIO_SS_PM, SSS_SHIFT }, + { NULL } +}; + +static const char * const dstates[] = {"D0", "D0i1", "D0i2", "D0i3"}; + +static int punit_dev_state_show(struct seq_file *seq_file, void *unused) +{ + u32 punit_pwr_status; + struct punit_device *punit_devp = seq_file->private; + int index; + int status; + + seq_puts(seq_file, "\n\nPUNIT NORTH COMPLEX DEVICES :\n"); + while (punit_devp->name) { + status = iosf_mbi_read(PUNIT_PORT, BT_MBI_PMC_READ, + punit_devp->reg, + &punit_pwr_status); + if (status) { + seq_printf(seq_file, "%9s : Read Failed\n", + punit_devp->name); + } else { + index = (punit_pwr_status >> punit_devp->sss_pos) & 3; + seq_printf(seq_file, "%9s : %s\n", punit_devp->name, + dstates[index]); + } + punit_devp++; + } + + return 0; +} + +static int punit_dev_state_open(struct inode *inode, struct file *file) +{ + return single_open(file, punit_dev_state_show, inode->i_private); +} + +static const struct file_operations punit_dev_state_ops = { + .open = punit_dev_state_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *punit_dbg_file; + +static int punit_dbgfs_register(struct punit_device *punit_device) +{ + static struct dentry *dev_state; + + punit_dbg_file = debugfs_create_dir("punit_atom", NULL); + if (!punit_dbg_file) + return -ENXIO; + + dev_state = debugfs_create_file("dev_power_state", S_IFREG | S_IRUGO, + punit_dbg_file, punit_device, + &punit_dev_state_ops); + if (!dev_state) { + pr_err("punit_dev_state register failed\n"); + debugfs_remove(punit_dbg_file); + return -ENXIO; + } + + return 0; +} + +static void punit_dbgfs_unregister(void) +{ + debugfs_remove_recursive(punit_dbg_file); +} + +#define ICPU(model, drv_data) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_MWAIT,\ + (kernel_ulong_t)&drv_data } + +static const struct x86_cpu_id intel_punit_cpu_ids[] = { + ICPU(55, punit_device_byt), /* Valleyview, Bay Trail */ + ICPU(76, punit_device_cht), /* Braswell, Cherry Trail */ + {} +}; + +MODULE_DEVICE_TABLE(x86cpu, intel_punit_cpu_ids); + +static int __init punit_atom_debug_init(void) +{ + const struct x86_cpu_id *id; + int ret; + + id = x86_match_cpu(intel_punit_cpu_ids); + if (!id) + return -ENODEV; + + ret = punit_dbgfs_register((struct punit_device *)id->driver_data); + if (ret < 0) + return ret; + + return 0; +} + +static void __exit punit_atom_debug_exit(void) +{ + punit_dbgfs_unregister(); +} + +module_init(punit_atom_debug_init); +module_exit(punit_atom_debug_exit); + +MODULE_AUTHOR("Kumar P, Mahesh <mahesh.kumar.p@intel.com>"); +MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>"); +MODULE_DESCRIPTION("Driver for Punit devices states debugging"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c index 0b283d4d0ad7..de734134bc8d 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c @@ -27,6 +27,7 @@ static struct platform_device wdt_dev = { static int tangier_probe(struct platform_device *pdev) { int gsi; + struct irq_alloc_info info; struct intel_mid_wdt_pdata *pdata = pdev->dev.platform_data; if (!pdata) @@ -34,8 +35,8 @@ static int tangier_probe(struct platform_device *pdev) /* IOAPIC builds identity mapping between GSI and IRQ on MID */ gsi = pdata->irq; - if (mp_set_gsi_attr(gsi, 1, 0, cpu_to_node(0)) || - mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC) <= 0) { + ioapic_set_alloc_attr(&info, cpu_to_node(0), 1, 0); + if (mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info) <= 0) { dev_warn(&pdev->dev, "cannot find interrupt %d in ioapic\n", gsi); return -EINVAL; diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index 3005f0c89f2e..01d54ea766c1 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -81,26 +81,34 @@ static unsigned long __init intel_mid_calibrate_tsc(void) return 0; } +static void __init intel_mid_setup_bp_timer(void) +{ + apbt_time_init(); + setup_boot_APIC_clock(); +} + static void __init intel_mid_time_init(void) { sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); + switch (intel_mid_timer_options) { case INTEL_MID_TIMER_APBT_ONLY: break; case INTEL_MID_TIMER_LAPIC_APBT: - x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; + /* Use apbt and local apic */ + x86_init.timers.setup_percpu_clockev = intel_mid_setup_bp_timer; x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; - break; + return; default: if (!boot_cpu_has(X86_FEATURE_ARAT)) break; + /* Lapic only, no apbt */ x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; return; } - /* we need at least one APB timer */ - pre_init_apic_IRQ0(); - apbt_time_init(); + + x86_init.timers.setup_percpu_clockev = apbt_time_init; } static void intel_mid_arch_setup(void) diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index c14ad34776c4..ce992e8cc065 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -95,18 +95,16 @@ int __init sfi_parse_mtmr(struct sfi_table_header *table) pr_debug("timer[%d]: paddr = 0x%08x, freq = %dHz, irq = %d\n", totallen, (u32)pentry->phys_addr, pentry->freq_hz, pentry->irq); - if (!pentry->irq) - continue; - mp_irq.type = MP_INTSRC; - mp_irq.irqtype = mp_INT; -/* triggering mode edge bit 2-3, active high polarity bit 0-1 */ - mp_irq.irqflag = 5; - mp_irq.srcbus = MP_BUS_ISA; - mp_irq.srcbusirq = pentry->irq; /* IRQ */ - mp_irq.dstapic = MP_APIC_ALL; - mp_irq.dstirq = pentry->irq; - mp_save_irq(&mp_irq); - mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC); + mp_irq.type = MP_INTSRC; + mp_irq.irqtype = mp_INT; + /* triggering mode edge bit 2-3, active high polarity bit 0-1 */ + mp_irq.irqflag = 5; + mp_irq.srcbus = MP_BUS_ISA; + mp_irq.srcbusirq = pentry->irq; /* IRQ */ + mp_irq.dstapic = MP_APIC_ALL; + mp_irq.dstirq = pentry->irq; + mp_save_irq(&mp_irq); + mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC, NULL); } return 0; @@ -177,7 +175,7 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) mp_irq.dstapic = MP_APIC_ALL; mp_irq.dstirq = pentry->irq; mp_save_irq(&mp_irq); - mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC); + mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC, NULL); } return 0; } @@ -436,6 +434,7 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) struct devs_id *dev = NULL; int num, i, ret; int polarity; + struct irq_alloc_info info; sb = (struct sfi_table_simple *)table; num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry); @@ -469,9 +468,8 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) polarity = 1; } - ret = mp_set_gsi_attr(irq, 1, polarity, NUMA_NO_NODE); - if (ret == 0) - ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC); + ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 1, polarity); + ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC, &info); WARN_ON(ret < 0); } diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c index 2a8a74f3bd76..6c7111bbd1e9 100644 --- a/arch/x86/platform/sfi/sfi.c +++ b/arch/x86/platform/sfi/sfi.c @@ -25,8 +25,8 @@ #include <linux/init.h> #include <linux/sfi.h> #include <linux/io.h> -#include <linux/irqdomain.h> +#include <asm/irqdomain.h> #include <asm/io_apic.h> #include <asm/mpspec.h> #include <asm/setup.h> @@ -71,9 +71,6 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table) #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC -static struct irq_domain_ops sfi_ioapic_irqdomain_ops = { - .map = mp_irqdomain_map, -}; static int __init sfi_parse_ioapic(struct sfi_table_header *table) { @@ -82,7 +79,7 @@ static int __init sfi_parse_ioapic(struct sfi_table_header *table) int i, num; struct ioapic_domain_cfg cfg = { .type = IOAPIC_DOMAIN_STRICT, - .ops = &sfi_ioapic_irqdomain_ops, + .ops = &mp_ioapic_irqdomain_ops, }; sb = (struct sfi_table_simple *)table; diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 0ce673645432..8570abe68be1 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -13,22 +13,37 @@ #include <linux/slab.h> #include <linux/irq.h> +#include <asm/irqdomain.h> #include <asm/apic.h> #include <asm/uv/uv_irq.h> #include <asm/uv/uv_hub.h> /* MMR offset and pnode of hub sourcing interrupts for a given irq */ -struct uv_irq_2_mmr_pnode{ - struct rb_node list; +struct uv_irq_2_mmr_pnode { unsigned long offset; int pnode; - int irq; }; -static DEFINE_SPINLOCK(uv_irq_lock); -static struct rb_root uv_irq_root; +static void uv_program_mmr(struct irq_cfg *cfg, struct uv_irq_2_mmr_pnode *info) +{ + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != + sizeof(unsigned long)); + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + entry->vector = cfg->vector; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = cfg->dest_apicid; -static int uv_set_irq_affinity(struct irq_data *, const struct cpumask *, bool); + uv_write_global_mmr64(info->pnode, info->offset, mmr_value); +} static void uv_noop(struct irq_data *data) { } @@ -37,6 +52,23 @@ static void uv_ack_apic(struct irq_data *data) ack_APIC_irq(); } +static int +uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + struct irq_data *parent = data->parent_data; + struct irq_cfg *cfg = irqd_cfg(data); + int ret; + + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret >= 0) { + uv_program_mmr(cfg, data->chip_data); + send_cleanup_vector(cfg); + } + + return ret; +} + static struct irq_chip uv_irq_chip = { .name = "UV-CORE", .irq_mask = uv_noop, @@ -45,189 +77,99 @@ static struct irq_chip uv_irq_chip = { .irq_set_affinity = uv_set_irq_affinity, }; -/* - * Add offset and pnode information of the hub sourcing interrupts to the - * rb tree for a specific irq. - */ -static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade) +static int uv_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) { - struct rb_node **link = &uv_irq_root.rb_node; - struct rb_node *parent = NULL; - struct uv_irq_2_mmr_pnode *n; - struct uv_irq_2_mmr_pnode *e; - unsigned long irqflags; - - n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL, - uv_blade_to_memory_nid(blade)); - if (!n) + struct uv_irq_2_mmr_pnode *chip_data; + struct irq_alloc_info *info = arg; + struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); + int ret; + + if (nr_irqs > 1 || !info || info->type != X86_IRQ_ALLOC_TYPE_UV) + return -EINVAL; + + chip_data = kmalloc_node(sizeof(*chip_data), GFP_KERNEL, + irq_data->node); + if (!chip_data) return -ENOMEM; - n->irq = irq; - n->offset = offset; - n->pnode = uv_blade_to_pnode(blade); - spin_lock_irqsave(&uv_irq_lock, irqflags); - /* Find the right place in the rbtree: */ - while (*link) { - parent = *link; - e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list); - - if (unlikely(irq == e->irq)) { - /* irq entry exists */ - e->pnode = uv_blade_to_pnode(blade); - e->offset = offset; - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - kfree(n); - return 0; - } - - if (irq < e->irq) - link = &(*link)->rb_left; + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret >= 0) { + if (info->uv_limit == UV_AFFINITY_CPU) + irq_set_status_flags(virq, IRQ_NO_BALANCING); else - link = &(*link)->rb_right; + irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); + + chip_data->pnode = uv_blade_to_pnode(info->uv_blade); + chip_data->offset = info->uv_offset; + irq_domain_set_info(domain, virq, virq, &uv_irq_chip, chip_data, + handle_percpu_irq, NULL, info->uv_name); + } else { + kfree(chip_data); } - /* Insert the node into the rbtree. */ - rb_link_node(&n->list, parent, link); - rb_insert_color(&n->list, &uv_irq_root); - - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - return 0; + return ret; } -/* Retrieve offset and pnode information from the rb tree for a specific irq */ -int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode) +static void uv_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) { - struct uv_irq_2_mmr_pnode *e; - struct rb_node *n; - unsigned long irqflags; - - spin_lock_irqsave(&uv_irq_lock, irqflags); - n = uv_irq_root.rb_node; - while (n) { - e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); - - if (e->irq == irq) { - *offset = e->offset; - *pnode = e->pnode; - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - return 0; - } - - if (irq < e->irq) - n = n->rb_left; - else - n = n->rb_right; - } - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - return -1; + struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); + + BUG_ON(nr_irqs != 1); + kfree(irq_data->chip_data); + irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); + irq_clear_status_flags(virq, IRQ_NO_BALANCING); + irq_domain_free_irqs_top(domain, virq, nr_irqs); } /* * Re-target the irq to the specified CPU and enable the specified MMR located * on the specified blade to allow the sending of MSIs to the specified CPU. */ -static int -arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, - unsigned long mmr_offset, int limit) +static void uv_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data) { - const struct cpumask *eligible_cpu = cpumask_of(cpu); - struct irq_cfg *cfg = irq_cfg(irq); - unsigned long mmr_value; - struct uv_IO_APIC_route_entry *entry; - int mmr_pnode, err; - unsigned int dest; - - BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != - sizeof(unsigned long)); - - err = assign_irq_vector(irq, cfg, eligible_cpu); - if (err != 0) - return err; - - err = apic->cpu_mask_to_apicid_and(eligible_cpu, eligible_cpu, &dest); - if (err != 0) - return err; - - if (limit == UV_AFFINITY_CPU) - irq_set_status_flags(irq, IRQ_NO_BALANCING); - else - irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - - irq_set_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, - irq_name); - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = dest; - - mmr_pnode = uv_blade_to_pnode(mmr_blade); - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); - - if (cfg->move_in_progress) - send_cleanup_vector(cfg); - - return irq; + uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data); } /* * Disable the specified MMR located on the specified blade so that MSIs are * longer allowed to be sent. */ -static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset) +static void uv_domain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) { unsigned long mmr_value; struct uv_IO_APIC_route_entry *entry; - BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != - sizeof(unsigned long)); - mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; entry->mask = 1; - - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data); } -static int -uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) -{ - struct irq_cfg *cfg = irqd_cfg(data); - unsigned int dest; - unsigned long mmr_value, mmr_offset; - struct uv_IO_APIC_route_entry *entry; - int mmr_pnode; - - if (apic_set_affinity(data, mask, &dest)) - return -1; - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = dest; - - /* Get previously stored MMR and pnode of hub sourcing interrupts */ - if (uv_irq_2_mmr_info(data->irq, &mmr_offset, &mmr_pnode)) - return -1; - - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); +static const struct irq_domain_ops uv_domain_ops = { + .alloc = uv_domain_alloc, + .free = uv_domain_free, + .activate = uv_domain_activate, + .deactivate = uv_domain_deactivate, +}; - if (cfg->move_in_progress) - send_cleanup_vector(cfg); +static struct irq_domain *uv_get_irq_domain(void) +{ + static struct irq_domain *uv_domain; + static DEFINE_MUTEX(uv_lock); + + mutex_lock(&uv_lock); + if (uv_domain == NULL) { + uv_domain = irq_domain_add_tree(NULL, &uv_domain_ops, NULL); + if (uv_domain) + uv_domain->parent = x86_vector_domain; + } + mutex_unlock(&uv_lock); - return IRQ_SET_MASK_OK_NOCOPY; + return uv_domain; } /* @@ -238,19 +180,21 @@ uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, unsigned long mmr_offset, int limit) { - int ret, irq = irq_alloc_hwirq(uv_blade_to_memory_nid(mmr_blade)); + struct irq_alloc_info info; + struct irq_domain *domain = uv_get_irq_domain(); - if (!irq) - return -EBUSY; + if (!domain) + return -ENOMEM; - ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset, - limit); - if (ret == irq) - uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade); - else - irq_free_hwirq(irq); + init_irq_alloc_info(&info, cpumask_of(cpu)); + info.type = X86_IRQ_ALLOC_TYPE_UV; + info.uv_limit = limit; + info.uv_blade = mmr_blade; + info.uv_offset = mmr_offset; + info.uv_name = irq_name; - return ret; + return irq_domain_alloc_irqs(domain, 1, + uv_blade_to_memory_nid(mmr_blade), &info); } EXPORT_SYMBOL_GPL(uv_setup_irq); @@ -263,26 +207,6 @@ EXPORT_SYMBOL_GPL(uv_setup_irq); */ void uv_teardown_irq(unsigned int irq) { - struct uv_irq_2_mmr_pnode *e; - struct rb_node *n; - unsigned long irqflags; - - spin_lock_irqsave(&uv_irq_lock, irqflags); - n = uv_irq_root.rb_node; - while (n) { - e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); - if (e->irq == irq) { - arch_disable_uv_irq(e->pnode, e->offset); - rb_erase(n, &uv_irq_root); - kfree(e); - break; - } - if (irq < e->irq) - n = n->rb_left; - else - n = n->rb_right; - } - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - irq_free_hwirq(irq); + irq_domain_free_irqs(irq, 1); } EXPORT_SYMBOL_GPL(uv_teardown_irq); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 757678fb26e1..0d7dd1f5ac36 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -18,10 +18,9 @@ #include <asm/mtrr.h> #include <asm/page.h> #include <asm/mce.h> -#include <asm/xcr.h> #include <asm/suspend.h> +#include <asm/fpu/internal.h> #include <asm/debugreg.h> -#include <asm/fpu-internal.h> /* pcntxt_mask */ #include <asm/cpu.h> #ifdef CONFIG_X86_32 @@ -155,6 +154,8 @@ static void fix_processor_context(void) #endif load_TR_desc(); /* This does ltr */ load_LDT(¤t->active_mm->context); /* This does lldt */ + + fpu__resume_cpu(); } /** @@ -221,12 +222,6 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); #endif - /* - * restore XCR0 for xsave capable cpu's. - */ - if (cpu_has_xsave) - xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); - fix_processor_context(); do_fpu_end(); diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index 3c4469a7a929..e2386cb4e0c3 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -78,9 +78,9 @@ ENTRY(restore_image) /* code below has been relocated to a safe page */ ENTRY(core_restore_code) -loop: +.Lloop: testq %rdx, %rdx - jz done + jz .Ldone /* get addresses from the pbe and copy the page */ movq pbe_address(%rdx), %rsi @@ -91,8 +91,8 @@ loop: /* progress to the next pbe */ movq pbe_next(%rdx), %rdx - jmp loop -done: + jmp .Lloop +.Ldone: /* jump to the restore_registers address from the image header */ jmpq *%rax /* diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index 7e8a1a650435..b9531d343134 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -39,7 +39,8 @@ #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() -#define set_mb(var, value) do { var = value; barrier(); } while (0) + +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0) #define read_barrier_depends() do { } while (0) #define smp_read_barrier_depends() do { } while (0) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 46957ead3060..37058679d148 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1181,10 +1181,11 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .read_tscp = native_read_tscp, .iret = xen_iret, - .irq_enable_sysexit = xen_sysexit, #ifdef CONFIG_X86_64 .usergs_sysret32 = xen_sysret32, .usergs_sysret64 = xen_sysret64, +#else + .irq_enable_sysexit = xen_sysexit, #endif .load_tr_desc = paravirt_nop, @@ -1423,7 +1424,7 @@ static void xen_pvh_set_cr_flags(int cpu) return; /* * For BSP, PSE PGE are set in probe_page_size_mask(), for APs - * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu_init. + * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu__init_cpu(). */ if (cpu_has_pse) cr4_set_bits_and_update_boot(X86_CR4_PSE); diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 956374c1edbc..9e2ba5c6e1dd 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -17,6 +17,56 @@ #include "xen-ops.h" #include "debugfs.h" +static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; +static DEFINE_PER_CPU(char *, irq_name); +static bool xen_pvspin = true; + +#ifdef CONFIG_QUEUED_SPINLOCKS + +#include <asm/qspinlock.h> + +static void xen_qlock_kick(int cpu) +{ + xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); +} + +/* + * Halt the current CPU & release it back to the host + */ +static void xen_qlock_wait(u8 *byte, u8 val) +{ + int irq = __this_cpu_read(lock_kicker_irq); + + /* If kicker interrupts not initialized yet, just spin */ + if (irq == -1) + return; + + /* clear pending */ + xen_clear_irq_pending(irq); + barrier(); + + /* + * We check the byte value after clearing pending IRQ to make sure + * that we won't miss a wakeup event because of the clearing. + * + * The sync_clear_bit() call in xen_clear_irq_pending() is atomic. + * So it is effectively a memory barrier for x86. + */ + if (READ_ONCE(*byte) != val) + return; + + /* + * If an interrupt happens here, it will leave the wakeup irq + * pending, which will cause xen_poll_irq() to return + * immediately. + */ + + /* Block until irq becomes pending (or perhaps a spurious wakeup) */ + xen_poll_irq(irq); +} + +#else /* CONFIG_QUEUED_SPINLOCKS */ + enum xen_contention_stat { TAKEN_SLOW, TAKEN_SLOW_PICKUP, @@ -100,12 +150,9 @@ struct xen_lock_waiting { __ticket_t want; }; -static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; -static DEFINE_PER_CPU(char *, irq_name); static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting); static cpumask_t waiting_cpus; -static bool xen_pvspin = true; __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) { int irq = __this_cpu_read(lock_kicker_irq); @@ -217,6 +264,7 @@ static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next) } } } +#endif /* CONFIG_QUEUED_SPINLOCKS */ static irqreturn_t dummy_handler(int irq, void *dev_id) { @@ -280,8 +328,16 @@ void __init xen_init_spinlocks(void) return; } printk(KERN_DEBUG "xen: PV spinlocks enabled\n"); +#ifdef CONFIG_QUEUED_SPINLOCKS + __pv_init_lock_hash(); + pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; + pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); + pv_lock_ops.wait = xen_qlock_wait; + pv_lock_ops.kick = xen_qlock_kick; +#else pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning); pv_lock_ops.unlock_kick = xen_unlock_kick; +#endif } /* @@ -310,7 +366,7 @@ static __init int xen_parse_nopvspin(char *arg) } early_param("xen_nopvspin", xen_parse_nopvspin); -#ifdef CONFIG_XEN_DEBUG_FS +#if defined(CONFIG_XEN_DEBUG_FS) && !defined(CONFIG_QUEUED_SPINLOCKS) static struct dentry *d_spin_debug; diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 985fc3ee0973..04529e620559 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -15,6 +15,8 @@ #include <asm/percpu.h> #include <asm/processor-flags.h> #include <asm/segment.h> +#include <asm/asm-offsets.h> +#include <asm/thread_info.h> #include <xen/interface/xen.h> @@ -47,29 +49,13 @@ ENTRY(xen_iret) ENDPATCH(xen_iret) RELOC(xen_iret, 1b+1) -/* - * sysexit is not used for 64-bit processes, so it's only ever used to - * return to 32-bit compat userspace. - */ -ENTRY(xen_sysexit) - pushq $__USER32_DS - pushq %rcx - pushq $X86_EFLAGS_IF - pushq $__USER32_CS - pushq %rdx - - pushq $0 -1: jmp hypercall_iret -ENDPATCH(xen_sysexit) -RELOC(xen_sysexit, 1b+1) - ENTRY(xen_sysret64) /* * We're already on the usermode stack at this point, but * still with the kernel gs, so we can easily switch back */ movq %rsp, PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(kernel_stack), %rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp pushq $__USER_DS pushq PER_CPU_VAR(rsp_scratch) @@ -88,7 +74,7 @@ ENTRY(xen_sysret32) * still with the kernel gs, so we can easily switch back */ movq %rsp, PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(kernel_stack), %rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp pushq $__USER32_DS pushq PER_CPU_VAR(rsp_scratch) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9e195c683549..c20fe29e65f4 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -134,7 +134,9 @@ DECL_ASM(void, xen_restore_fl_direct, unsigned long); /* These are not functions, and cannot be called normally */ __visible void xen_iret(void); +#ifdef CONFIG_X86_32 __visible void xen_sysexit(void); +#endif __visible void xen_sysret32(void); __visible void xen_sysret64(void); __visible void xen_adjust_exception_frame(void); diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index 9e3571a6535c..83a44a33cfa1 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -15,10 +15,10 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/hardirq.h> +#include <linux/uaccess.h> #include <asm/mmu_context.h> #include <asm/cacheflush.h> #include <asm/hardirq.h> -#include <asm/uaccess.h> #include <asm/pgalloc.h> DEFINE_PER_CPU(unsigned long, asid_cache) = ASID_USER_FIRST; @@ -57,7 +57,7 @@ void do_page_fault(struct pt_regs *regs) /* If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) { + if (faulthandler_disabled() || !mm) { bad_page_fault(regs, address, SIGSEGV); return; } diff --git a/arch/xtensa/mm/highmem.c b/arch/xtensa/mm/highmem.c index 8cfb71ec0937..184ceadccc1a 100644 --- a/arch/xtensa/mm/highmem.c +++ b/arch/xtensa/mm/highmem.c @@ -42,6 +42,7 @@ void *kmap_atomic(struct page *page) enum fixed_addresses idx; unsigned long vaddr; + preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) return page_address(page); @@ -79,6 +80,7 @@ void __kunmap_atomic(void *kvaddr) } pagefault_enable(); + preempt_enable(); } EXPORT_SYMBOL(__kunmap_atomic); |