aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2009-02-13 09:45:09 +0100
committerIngo Molnar <mingo@elte.hu>2009-02-13 09:45:09 +0100
commitab639f3593f0b5e4439d549831442c18c3baf989 (patch)
tree118743e94e5dc86c835dbc1f1d3bf1612f4ae740 /arch/x86
parentf8a6b2b9cee298a9663cbe38ce1eb5240987cb62 (diff)
parent58105ef1857112a186696c9b8957020090226a28 (diff)
Merge branch 'core/percpu' into x86/core
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig5
-rw-r--r--arch/x86/Makefile17
-rw-r--r--arch/x86/include/asm/a.out-core.h2
-rw-r--r--arch/x86/include/asm/elf.h15
-rw-r--r--arch/x86/include/asm/mmu_context.h2
-rw-r--r--arch/x86/include/asm/percpu.h22
-rw-r--r--arch/x86/include/asm/processor.h6
-rw-r--r--arch/x86/include/asm/ptrace.h4
-rw-r--r--arch/x86/include/asm/segment.h9
-rw-r--r--arch/x86/include/asm/stackprotector.h96
-rw-r--r--arch/x86/include/asm/syscalls.h20
-rw-r--r--arch/x86/include/asm/system.h38
-rw-r--r--arch/x86/include/asm/traps.h2
-rw-r--r--arch/x86/include/asm/uaccess.h33
-rw-r--r--arch/x86/include/asm/uv/uv.h3
-rw-r--r--arch/x86/kernel/asm-offsets_32.c1
-rw-r--r--arch/x86/kernel/cpu/common.c23
-rw-r--r--arch/x86/kernel/entry_32.S443
-rw-r--r--arch/x86/kernel/head_32.S21
-rw-r--r--arch/x86/kernel/head_64.S21
-rw-r--r--arch/x86/kernel/ioport.c3
-rw-r--r--arch/x86/kernel/process_32.c53
-rw-r--r--arch/x86/kernel/process_64.c11
-rw-r--r--arch/x86/kernel/ptrace.c19
-rw-r--r--arch/x86/kernel/setup_percpu.c2
-rw-r--r--arch/x86/kernel/signal.c75
-rw-r--r--arch/x86/kernel/syscall_table_32.S20
-rw-r--r--arch/x86/kernel/traps.c9
-rw-r--r--arch/x86/kernel/vm86_32.c20
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S8
-rw-r--r--arch/x86/lguest/boot.c2
-rw-r--r--arch/x86/math-emu/get_address.c6
-rw-r--r--arch/x86/mm/numa_64.c4
-rw-r--r--arch/x86/vdso/Makefile2
-rw-r--r--arch/x86/xen/enlighten.c17
-rw-r--r--arch/x86/xen/xen-asm.S78
-rw-r--r--arch/x86/xen/xen-asm_32.S238
-rw-r--r--arch/x86/xen/xen-asm_64.S107
38 files changed, 888 insertions, 569 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 148c112c9ca..1042d69b267 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -194,6 +194,10 @@ config X86_TRAMPOLINE
depends on SMP || (64BIT && ACPI_SLEEP)
default y
+config X86_32_LAZY_GS
+ def_bool y
+ depends on X86_32 && !CC_STACKPROTECTOR
+
config KTIME_SCALAR
def_bool X86_32
source "init/Kconfig"
@@ -1339,7 +1343,6 @@ config CC_STACKPROTECTOR_ALL
config CC_STACKPROTECTOR
bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
- depends on X86_64
select CC_STACKPROTECTOR_ALL
---help---
This option turns on the -fstack-protector GCC feature. This
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 99550c40799..1836191839e 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -70,14 +70,17 @@ else
# this works around some issues with generating unwind tables in older gccs
# newer gccs do it by default
KBUILD_CFLAGS += -maccumulate-outgoing-args
+endif
- stackp := $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh
- stackp-$(CONFIG_CC_STACKPROTECTOR) := $(shell $(stackp) \
- "$(CC)" "-fstack-protector -DGCC_HAS_SP" )
- stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(stackp) \
- "$(CC)" -fstack-protector-all )
-
- KBUILD_CFLAGS += $(stackp-y)
+ifdef CONFIG_CC_STACKPROTECTOR
+ cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh
+ ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC)),y)
+ stackp-y := -fstack-protector
+ stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += -fstack-protector-all
+ KBUILD_CFLAGS += $(stackp-y)
+ else
+ $(warning stack protector enabled but no compiler support)
+ endif
endif
# Stackpointer is addressed different for 32 bit and 64 bit x86
diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h
index 3c601f8224b..bb70e397aa8 100644
--- a/arch/x86/include/asm/a.out-core.h
+++ b/arch/x86/include/asm/a.out-core.h
@@ -55,7 +55,7 @@ static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump)
dump->regs.ds = (u16)regs->ds;
dump->regs.es = (u16)regs->es;
dump->regs.fs = (u16)regs->fs;
- savesegment(gs, dump->regs.gs);
+ dump->regs.gs = get_user_gs(regs);
dump->regs.orig_ax = regs->orig_ax;
dump->regs.ip = regs->ip;
dump->regs.cs = (u16)regs->cs;
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index f51a3ddde01..83c1bc8d2e8 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -112,7 +112,7 @@ extern unsigned int vdso_enabled;
* now struct_user_regs, they are different)
*/
-#define ELF_CORE_COPY_REGS(pr_reg, regs) \
+#define ELF_CORE_COPY_REGS_COMMON(pr_reg, regs) \
do { \
pr_reg[0] = regs->bx; \
pr_reg[1] = regs->cx; \
@@ -124,7 +124,6 @@ do { \
pr_reg[7] = regs->ds & 0xffff; \
pr_reg[8] = regs->es & 0xffff; \
pr_reg[9] = regs->fs & 0xffff; \
- savesegment(gs, pr_reg[10]); \
pr_reg[11] = regs->orig_ax; \
pr_reg[12] = regs->ip; \
pr_reg[13] = regs->cs & 0xffff; \
@@ -133,6 +132,18 @@ do { \
pr_reg[16] = regs->ss & 0xffff; \
} while (0);
+#define ELF_CORE_COPY_REGS(pr_reg, regs) \
+do { \
+ ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\
+ pr_reg[10] = get_user_gs(regs); \
+} while (0);
+
+#define ELF_CORE_COPY_KERNEL_REGS(pr_reg, regs) \
+do { \
+ ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\
+ savesegment(gs, pr_reg[10]); \
+} while (0);
+
#define ELF_PLATFORM (utsname()->machine)
#define set_personality_64bit() do { } while (0)
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 52948df9cd1..f923203dc39 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -79,7 +79,7 @@ do { \
#ifdef CONFIG_X86_32
#define deactivate_mm(tsk, mm) \
do { \
- loadsegment(gs, 0); \
+ lazy_load_gs(0); \
} while (0)
#else
#define deactivate_mm(tsk, mm) \
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 0b64af4f13a..aee103b26d0 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -34,6 +34,12 @@
#define PER_CPU_VAR(var) per_cpu__##var
#endif /* SMP */
+#ifdef CONFIG_X86_64_SMP
+#define INIT_PER_CPU_VAR(var) init_per_cpu__##var
+#else
+#define INIT_PER_CPU_VAR(var) per_cpu__##var
+#endif
+
#else /* ...!ASSEMBLY */
#include <linux/stringify.h>
@@ -45,6 +51,22 @@
#define __percpu_arg(x) "%" #x
#endif
+/*
+ * Initialized pointers to per-cpu variables needed for the boot
+ * processor need to use these macros to get the proper address
+ * offset from __per_cpu_load on SMP.
+ *
+ * There also must be an entry in vmlinux_64.lds.S
+ */
+#define DECLARE_INIT_PER_CPU(var) \
+ extern typeof(per_cpu_var(var)) init_per_cpu_var(var)
+
+#ifdef CONFIG_X86_64_SMP
+#define init_per_cpu_var(var) init_per_cpu__##var
+#else
+#define init_per_cpu_var(var) per_cpu_var(var)
+#endif
+
/* For arch-specific code, we can use direct single-insn ops (they
* don't give an lvalue though). */
extern void __bad_percpu_size(void);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a6643f68fbb..a0133838b67 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -393,8 +393,14 @@ union irq_stack_union {
};
DECLARE_PER_CPU(union irq_stack_union, irq_stack_union);
+DECLARE_INIT_PER_CPU(irq_stack_union);
+
DECLARE_PER_CPU(char *, irq_stack_ptr);
+#else /* X86_64 */
+#ifdef CONFIG_CC_STACKPROTECTOR
+DECLARE_PER_CPU(unsigned long, stack_canary);
#endif
+#endif /* X86_64 */
extern void print_cpu_info(struct cpuinfo_x86 *);
extern unsigned int xstate_size;
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 6d34d954c22..e304b66abee 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -28,7 +28,7 @@ struct pt_regs {
int xds;
int xes;
int xfs;
- /* int gs; */
+ int xgs;
long orig_eax;
long eip;
int xcs;
@@ -50,7 +50,7 @@ struct pt_regs {
unsigned long ds;
unsigned long es;
unsigned long fs;
- /* int gs; */
+ unsigned long gs;
unsigned long orig_ax;
unsigned long ip;
unsigned long cs;
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 1dc1b51ac62..14e0ed86a6f 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -61,7 +61,7 @@
*
* 26 - ESPFIX small SS
* 27 - per-cpu [ offset to per-cpu data area ]
- * 28 - unused
+ * 28 - stack_canary-20 [ for stack protector ]
* 29 - unused
* 30 - unused
* 31 - TSS for double fault handler
@@ -95,6 +95,13 @@
#define __KERNEL_PERCPU 0
#endif
+#define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE + 16)
+#ifdef CONFIG_CC_STACKPROTECTOR
+#define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY * 8)
+#else
+#define __KERNEL_STACK_CANARY 0
+#endif
+
#define GDT_ENTRY_DOUBLEFAULT_TSS 31
/*
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index 36a700acaf2..c2d742c6e15 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -1,8 +1,54 @@
+/*
+ * GCC stack protector support.
+ *
+ * Stack protector works by putting predefined pattern at the start of
+ * the stack frame and verifying that it hasn't been overwritten when
+ * returning from the function. The pattern is called stack canary
+ * and unfortunately gcc requires it to be at a fixed offset from %gs.
+ * On x86_64, the offset is 40 bytes and on x86_32 20 bytes. x86_64
+ * and x86_32 use segment registers differently and thus handles this
+ * requirement differently.
+ *
+ * On x86_64, %gs is shared by percpu area and stack canary. All
+ * percpu symbols are zero based and %gs points to the base of percpu
+ * area. The first occupant of the percpu area is always
+ * irq_stack_union which contains stack_canary at offset 40. Userland
+ * %gs is always saved and restored on kernel entry and exit using
+ * swapgs, so stack protector doesn't add any complexity there.
+ *
+ * On x86_32, it's slightly more complicated. As in x86_64, %gs is
+ * used for userland TLS. Unfortunately, some processors are much
+ * slower at loading segment registers with different value when
+ * entering and leaving the kernel, so the kernel uses %fs for percpu
+ * area and manages %gs lazily so that %gs is switched only when
+ * necessary, usually during task switch.
+ *
+ * As gcc requires the stack canary at %gs:20, %gs can't be managed
+ * lazily if stack protector is enabled, so the kernel saves and
+ * restores userland %gs on kernel entry and exit. This behavior is
+ * controlled by CONFIG_X86_32_LAZY_GS and accessors are defined in
+ * system.h to hide the details.
+ */
+
#ifndef _ASM_STACKPROTECTOR_H
#define _ASM_STACKPROTECTOR_H 1
+#ifdef CONFIG_CC_STACKPROTECTOR
+
#include <asm/tsc.h>
#include <asm/processor.h>
+#include <asm/percpu.h>
+#include <asm/system.h>
+#include <asm/desc.h>
+#include <linux/random.h>
+
+/*
+ * 24 byte read-only segment initializer for stack canary. Linker
+ * can't handle the address bit shifting. Address will be set in
+ * head_32 for boot CPU and setup_per_cpu_areas() for others.
+ */
+#define GDT_STACK_CANARY_INIT \
+ [GDT_ENTRY_STACK_CANARY] = { { { 0x00000018, 0x00409000 } } },
/*
* Initialize the stackprotector canary value.
@@ -15,12 +61,9 @@ static __always_inline void boot_init_stack_canary(void)
u64 canary;
u64 tsc;
- /*
- * Build time only check to make sure the stack_canary is at
- * offset 40 in the pda; this is a gcc ABI requirement
- */
+#ifdef CONFIG_X86_64
BUILD_BUG_ON(offsetof(union irq_stack_union, stack_canary) != 40);
-
+#endif
/*
* We both use the random pool and the current TSC as a source
* of randomness. The TSC only matters for very early init,
@@ -32,7 +75,50 @@ static __always_inline void boot_init_stack_canary(void)
canary += tsc + (tsc << 32UL);
current->stack_canary = canary;
+#ifdef CONFIG_X86_64
percpu_write(irq_stack_union.stack_canary, canary);
+#else
+ percpu_write(stack_canary, canary);
+#endif
}
+static inline void setup_stack_canary_segment(int cpu)
+{
+#ifdef CONFIG_X86_32
+ unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu) - 20;
+ struct desc_struct *gdt_table = get_cpu_gdt_table(cpu);
+ struct desc_struct desc;
+
+ desc = gdt_table[GDT_ENTRY_STACK_CANARY];
+ desc.base0 = canary & 0xffff;
+ desc.base1 = (canary >> 16) & 0xff;
+ desc.base2 = (canary >> 24) & 0xff;
+ write_gdt_entry(gdt_table, GDT_ENTRY_STACK_CANARY, &desc, DESCTYPE_S);
+#endif
+}
+
+static inline void load_stack_canary_segment(void)
+{
+#ifdef CONFIG_X86_32
+ asm("mov %0, %%gs" : : "r" (__KERNEL_STACK_CANARY) : "memory");
+#endif
+}
+
+#else /* CC_STACKPROTECTOR */
+
+#define GDT_STACK_CANARY_INIT
+
+/* dummy boot_init_stack_canary() is defined in linux/stackprotector.h */
+
+static inline void setup_stack_canary_segment(int cpu)
+{ }
+
+static inline void load_stack_canary_segment(void)
+{
+#ifdef CONFIG_X86_32
+ asm volatile ("mov %0, %%gs" : : "r" (0));
#endif
+}
+
+#endif /* CC_STACKPROTECTOR */
+#endif /* _ASM_STACKPROTECTOR_H */
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index c0b0bda754e..68b1be10cfa 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -29,21 +29,21 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *);
/* X86_32 only */
#ifdef CONFIG_X86_32
/* kernel/process_32.c */
-asmlinkage int sys_fork(struct pt_regs);
-asmlinkage int sys_clone(struct pt_regs);
-asmlinkage int sys_vfork(struct pt_regs);
-asmlinkage int sys_execve(struct pt_regs);
+int sys_fork(struct pt_regs *);
+int sys_clone(struct pt_regs *);
+int sys_vfork(struct pt_regs *);
+int sys_execve(struct pt_regs *);
/* kernel/signal_32.c */
asmlinkage int sys_sigsuspend(int, int, old_sigset_t);
asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
struct old_sigaction __user *);
-asmlinkage int sys_sigaltstack(unsigned long);
-asmlinkage unsigned long sys_sigreturn(unsigned long);
-asmlinkage int sys_rt_sigreturn(unsigned long);
+int sys_sigaltstack(struct pt_regs *);
+unsigned long sys_sigreturn(struct pt_regs *);
+long sys_rt_sigreturn(struct pt_regs *);
/* kernel/ioport.c */
-asmlinkage long sys_iopl(unsigned long);
+long sys_iopl(struct pt_regs *);
/* kernel/sys_i386_32.c */
asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long,
@@ -59,8 +59,8 @@ struct oldold_utsname;
asmlinkage int sys_olduname(struct oldold_utsname __user *);
/* kernel/vm86_32.c */
-asmlinkage int sys_vm86old(struct pt_regs);
-asmlinkage int sys_vm86(struct pt_regs);
+int sys_vm86old(struct pt_regs *);
+int sys_vm86(struct pt_regs *);
#else /* CONFIG_X86_32 */
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index c22383743f3..c00bfdbdd45 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -23,6 +23,20 @@ struct task_struct *__switch_to(struct task_struct *prev,
#ifdef CONFIG_X86_32
+#ifdef CONFIG_CC_STACKPROTECTOR
+#define __switch_canary \
+ "movl %P[task_canary](%[next]), %%ebx\n\t" \
+ "movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
+#define __switch_canary_oparam \
+ , [stack_canary] "=m" (per_cpu_var(stack_canary))
+#define __switch_canary_iparam \
+ , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
+#else /* CC_STACKPROTECTOR */
+#define __switch_canary
+#define __switch_canary_oparam
+#define __switch_canary_iparam
+#endif /* CC_STACKPROTECTOR */
+
/*
* Saving eflags is important. It switches not only IOPL between tasks,
* it also protects other tasks from NT leaking through sysenter etc.
@@ -44,6 +58,7 @@ do { \
"movl %[next_sp],%%esp\n\t" /* restore ESP */ \
"movl $1f,%[prev_ip]\n\t" /* save EIP */ \
"pushl %[next_ip]\n\t" /* restore EIP */ \
+ __switch_canary \
"jmp __switch_to\n" /* regparm call */ \
"1:\t" \
"popl %%ebp\n\t" /* restore EBP */ \
@@ -58,6 +73,8 @@ do { \
"=b" (ebx), "=c" (ecx), "=d" (edx), \
"=S" (esi), "=D" (edi) \
\
+ __switch_canary_oparam \
+ \
/* input parameters: */ \
: [next_sp] "m" (next->thread.sp), \
[next_ip] "m" (next->thread.ip), \
@@ -66,6 +83,8 @@ do { \
[prev] "a" (prev), \
[next] "d" (next) \
\
+ __switch_canary_iparam \
+ \
: /* reloaded segment registers */ \
"memory"); \
} while (0)
@@ -182,6 +201,25 @@ extern void native_load_gs_index(unsigned);
#define savesegment(seg, value) \
asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
+/*
+ * x86_32 user gs accessors.
+ */
+#ifdef CONFIG_X86_32
+#ifdef CONFIG_X86_32_LAZY_GS
+#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;})
+#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v))
+#define task_user_gs(tsk) ((tsk)->thread.gs)
+#define lazy_save_gs(v) savesegment(gs, (v))
+#define lazy_load_gs(v) loadsegment(gs, (v))
+#else /* X86_32_LAZY_GS */
+#define get_user_gs(regs) (u16)((regs)->gs)
+#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0)
+#define task_user_gs(tsk) (task_pt_regs(tsk)->gs)
+#define lazy_save_gs(v) do { } while (0)
+#define lazy_load_gs(v) do { } while (0)
+#endif /* X86_32_LAZY_GS */
+#endif /* X86_32 */
+
static inline unsigned long get_limit(unsigned long segment)
{
unsigned long __limit;
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index cf3bb053da0..0d5342515b8 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -41,7 +41,7 @@ dotraplinkage void do_int3(struct pt_regs *, long);
dotraplinkage void do_overflow(struct pt_regs *, long);
dotraplinkage void do_bounds(struct pt_regs *, long);
dotraplinkage void do_invalid_op(struct pt_regs *, long);
-dotraplinkage void do_device_not_available(struct pt_regs);
+dotraplinkage void do_device_not_available(struct pt_regs *, long);
dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *, long);
dotraplinkage void do_invalid_TSS(struct pt_regs *, long);
dotraplinkage void do_segment_not_present(struct pt_regs *, long);
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 0ec6de4bcb0..b685ece89d5 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -186,7 +186,7 @@ extern int __get_user_bad(void);
#ifdef CONFIG_X86_32
-#define __put_user_asm_u64(x, addr, err) \
+#define __put_user_asm_u64(x, addr, err, errret) \
asm volatile("1: movl %%eax,0(%2)\n" \
"2: movl %%edx,4(%2)\n" \
"3:\n" \
@@ -197,7 +197,7 @@ extern int __get_user_bad(void);
_ASM_EXTABLE(1b, 4b) \
_ASM_EXTABLE(2b, 4b) \
: "=r" (err) \
- : "A" (x), "r" (addr), "i" (-EFAULT), "0" (err))
+ : "A" (x), "r" (addr), "i" (errret), "0" (err))
#define __put_user_asm_ex_u64(x, addr) \
asm volatile("1: movl %%eax,0(%1)\n" \
@@ -211,8 +211,8 @@ extern int __get_user_bad(void);
asm volatile("call __put_user_8" : "=a" (__ret_pu) \
: "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
#else
-#define __put_user_asm_u64(x, ptr, retval) \
- __put_user_asm(x, ptr, retval, "q", "", "Zr", -EFAULT)
+#define __put_user_asm_u64(x, ptr, retval, errret) \
+ __put_user_asm(x, ptr, retval, "q", "", "Zr", errret)
#define __put_user_asm_ex_u64(x, addr) \
__put_user_asm_ex(x, addr, "q", "", "Zr")
#define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu)
@@ -289,7 +289,8 @@ do { \
__put_user_asm(x, ptr, retval, "l", "k", "ir", errret); \
break; \
case 8: \
- __put_user_asm_u64((__typeof__(*ptr))(x), ptr, retval); \
+ __put_user_asm_u64((__typeof__(*ptr))(x), ptr, retval, \
+ errret); \
break; \
default: \
__put_user_bad(); \
@@ -525,8 +526,6 @@ struct __large_struct { unsigned long buf[100]; };
*/
#define get_user_try uaccess_try
#define get_user_catch(err) uaccess_catch(err)
-#define put_user_try uaccess_try
-#define put_user_catch(err) uaccess_catch(err)
#define get_user_ex(x, ptr) do { \
unsigned long __gue_val; \
@@ -534,9 +533,29 @@ struct __large_struct { unsigned long buf[100]; };
(x) = (__force __typeof__(*(ptr)))__gue_val; \
} while (0)
+#ifdef CONFIG_X86_WP_WORKS_OK
+
+#define put_user_try uaccess_try
+#define put_user_catch(err) uaccess_catch(err)
+
#define put_user_ex(x, ptr) \
__put_user_size_ex((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)))
+#else /* !CONFIG_X86_WP_WORKS_OK */
+
+#define put_user_try do { \
+ int __uaccess_err = 0;
+
+#define put_user_catch(err) \
+ (err) |= __uaccess_err; \
+} while (0)
+
+#define put_user_ex(x, ptr) do { \
+ __uaccess_err |= __put_user(x, ptr); \
+} while (0)
+
+#endif /* CONFIG_X86_WP_WORKS_OK */
+
/*
* movsl can be slow when source and dest are not both 8-byte aligned
*/
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index 8ac1d7e312f..8242bf96581 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -3,6 +3,9 @@
enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
+struct cpumask;
+struct mm_struct;
+
#ifdef CONFIG_X86_UV
extern enum uv_system_type get_uv_system_type(void);
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index ee4df08feee..fbf2f33e308 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -75,6 +75,7 @@ void foo(void)
OFFSET(PT_DS, pt_regs, ds);
OFFSET(PT_ES, pt_regs, es);
OFFSET(PT_FS, pt_regs, fs);
+ OFFSET(PT_GS, pt_regs, gs);
OFFSET(PT_ORIG_EAX, pt_regs, orig_ax);
OFFSET(PT_EIP, pt_regs, ip);
OFFSET(PT_CS, pt_regs, cs);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cbcdb796d47..e8f4a386bd9 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -39,6 +39,7 @@
#include <asm/sections.h>
#include <asm/setup.h>
#include <asm/hypervisor.h>
+#include <asm/stackprotector.h>
#include "cpu.h"
@@ -122,6 +123,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
[GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
[GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
+ GDT_STACK_CANARY_INIT
#endif
} };
EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
@@ -304,6 +306,7 @@ void load_percpu_segment(int cpu)
loadsegment(gs, 0);
wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu));
#endif
+ load_stack_canary_segment();
}
/* Current gdt points %fs at the "master" per-cpu area: after this,
@@ -938,12 +941,8 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
DEFINE_PER_CPU_FIRST(union irq_stack_union,
irq_stack_union) __aligned(PAGE_SIZE);
-#ifdef CONFIG_SMP
-DEFINE_PER_CPU(char *, irq_stack_ptr); /* will be set during per cpu init */
-#else
DEFINE_PER_CPU(char *, irq_stack_ptr) =
- per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
-#endif
+ init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
DEFINE_PER_CPU(unsigned long, kernel_stack) =
(unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
@@ -986,16 +985,21 @@ unsigned long kernel_eflags;
*/
DEFINE_PER_CPU(struct orig_ist, orig_ist);
-#else
+#else /* x86_64 */
-/* Make sure %fs is initialized properly in idle threads */
+#ifdef CONFIG_CC_STACKPROTECTOR
+DEFINE_PER_CPU(unsigned long, stack_canary);
+#endif
+
+/* Make sure %fs and %gs are initialized properly in idle threads */
struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
{
memset(regs, 0, sizeof(struct pt_regs));
regs->fs = __KERNEL_PERCPU;
+ regs->gs = __KERNEL_STACK_CANARY;
return regs;
}
-#endif
+#endif /* x86_64 */
/*
* cpu_init() initializes state that is per-CPU. Some data is already
@@ -1157,9 +1161,6 @@ void __cpuinit cpu_init(void)
__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
#endif
- /* Clear %gs. */
- asm volatile ("mov %0, %%gs" : : "r" (0));
-
/* Clear all 6 debug registers: */
set_debugreg(0, 0);
set_debugreg(0, 1);
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 65efd42454b..e9920683145 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -30,12 +30,13 @@
* 1C(%esp) - %ds
* 20(%esp) - %es
* 24(%esp) - %fs
- * 28(%esp) - orig_eax
- * 2C(%esp) - %eip
- * 30(%esp) - %cs
- * 34(%esp) - %eflags
- * 38(%esp) - %oldesp
- * 3C(%esp) - %oldss
+ * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS
+ * 2C(%esp) - orig_eax
+ * 30(%esp) - %eip
+ * 34(%esp) - %cs
+ * 38(%esp) - %eflags
+ * 3C(%esp) - %oldesp
+ * 40(%esp) - %oldss
*
* "current" is in register %ebx during any slow entries.
*/
@@ -101,121 +102,221 @@
#define resume_userspace_sig resume_userspace
#endif
-#define SAVE_ALL \
- cld; \
- pushl %fs; \
- CFI_ADJUST_CFA_OFFSET 4;\
- /*CFI_REL_OFFSET fs, 0;*/\
- pushl %es; \
- CFI_ADJUST_CFA_OFFSET 4;\
- /*CFI_REL_OFFSET es, 0;*/\
- pushl %ds; \
- CFI_ADJUST_CFA_OFFSET 4;\
- /*CFI_REL_OFFSET ds, 0;*/\
- pushl %eax; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET eax, 0;\
- pushl %ebp; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET ebp, 0;\
- pushl %edi; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET edi, 0;\
- pushl %esi; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET esi, 0;\
- pushl %edx; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET edx, 0;\
- pushl %ecx; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET ecx, 0;\
- pushl %ebx; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET ebx, 0;\
- movl $(__USER_DS), %edx; \
- movl %edx, %ds; \
- movl %edx, %es; \
- movl $(__KERNEL_PERCPU), %edx; \
+/*
+ * User gs save/restore
+ *
+ * %gs is used for userland TLS and kernel only uses it for stack
+ * canary which is required to be at %gs:20 by gcc. Read the comment
+ * at the top of stackprotector.h for more info.
+ *
+ * Local labels 98 and 99 are used.
+ */
+#ifdef CONFIG_X86_32_LAZY_GS
+
+ /* unfortunately push/pop can't be no-op */
+.macro PUSH_GS
+ pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
+.endm
+.macro POP_GS pop=0
+ addl $(4 + \pop), %esp
+ CFI_ADJUST_CFA_OFFSET -(4 + \pop)
+.endm
+.macro POP_GS_EX
+.endm
+
+ /* all the rest are no-op */
+.macro PTGS_TO_GS
+.endm
+.macro PTGS_TO_GS_EX
+.endm
+.macro GS_TO_REG reg
+.endm
+.macro REG_TO_PTGS reg
+.endm
+.macro SET_KERNEL_GS reg
+.endm
+
+#else /* CONFIG_X86_32_LAZY_GS */
+
+.macro PUSH_GS
+ pushl %gs
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET gs, 0*/
+.endm
+
+.macro POP_GS pop=0
+98: popl %gs
+ CFI_ADJUST_CFA_OFFSET -4
+ /*CFI_RESTORE gs*/
+ .if \pop <> 0
+ add $\pop, %esp
+ CFI_ADJUST_CFA_OFFSET -\pop
+ .endif
+.endm
+.macro POP_GS_EX
+.pushsection .fixup, "ax"
+99: movl $0, (%esp)
+ jmp 98b
+.section __ex_table, "a"
+ .align 4
+ .long 98b, 99b
+.popsection
+.endm
+
+.macro PTGS_TO_GS
+98: mov PT_GS(%esp), %gs
+.endm
+.macro PTGS_TO_GS_EX
+.pushsection .fixup, "ax"
+99: movl $0, PT_GS(%esp)
+ jmp 98b
+.section __ex_table, "a"
+ .align 4
+ .long 98b, 99b
+.popsection
+.endm
+
+.macro GS_TO_REG reg
+ movl %gs, \reg
+ /*CFI_REGISTER gs, \reg*/
+.endm
+.macro REG_TO_PTGS reg
+ movl \reg, PT_GS(%esp)
+ /*CFI_REL_OFFSET gs, PT_GS*/
+.endm
+.macro SET_KERNEL_GS reg
+ movl $(__KERNEL_STACK_CANARY), \reg
+ movl \reg, %gs
+.endm
+
+#endif /* CONFIG_X86_32_LAZY_GS */
+
+.macro SAVE_ALL
+ cld
+ PUSH_GS
+ pushl %fs
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET fs, 0;*/
+ pushl %es
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET es, 0;*/
+ pushl %ds
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET ds, 0;*/
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET eax, 0
+ pushl %ebp
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebp, 0
+ pushl %edi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edi, 0
+ pushl %esi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET esi, 0
+ pushl %edx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edx, 0
+ pushl %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx, 0
+ pushl %ebx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebx, 0
+ movl $(__USER_DS), %edx
+ movl %edx, %ds
+ movl %edx, %es
+ movl $(__KERNEL_PERCPU), %edx
movl %edx, %fs
+ SET_KERNEL_GS %edx
+.endm
-#define RESTORE_INT_REGS \
- popl %ebx; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE ebx;\
- popl %ecx; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE ecx;\
- popl %edx; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE edx;\
- popl %esi; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE esi;\
- popl %edi; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE edi;\
- popl %ebp; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE ebp;\
- popl %eax; \
- CFI_ADJUST_CFA_OFFSET -4;\
+.macro RESTORE_INT_REGS
+ popl %ebx
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE ebx
+ popl %ecx
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE ecx
+ popl %edx
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE edx
+ popl %esi
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE esi
+ popl %edi
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE edi
+ popl %ebp
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE ebp
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
CFI_RESTORE eax
+.endm
-#define RESTORE_REGS \
- RESTORE_INT_REGS; \
-1: popl %ds; \
- CFI_ADJUST_CFA_OFFSET -4;\
- /*CFI_RESTORE ds;*/\
-2: popl %es; \
- CFI_ADJUST_CFA_OFFSET -4;\
- /*CFI_RESTORE es;*/\
-3: popl %fs; \
- CFI_ADJUST_CFA_OFFSET -4;\
- /*CFI_RESTORE fs;*/\
-.pushsection .fixup,"ax"; \
-4: movl $0,(%esp); \
- jmp 1b; \
-5: movl $0,(%esp); \
- jmp 2b; \
-6: movl $0,(%esp); \
- jmp 3b; \
-.section __ex_table,"a";\
- .align 4; \
- .long 1b,4b; \
- .long 2b,5b; \
- .long 3b,6b; \
+.macro RESTORE_REGS pop=0
+ RESTORE_INT_REGS
+1: popl %ds
+ CFI_ADJUST_CFA_OFFSET -4
+ /*CFI_RESTORE ds;*/
+2: popl %es
+ CFI_ADJUST_CFA_OFFSET -4
+ /*CFI_RESTORE es;*/
+3: popl %fs
+ CFI_ADJUST_CFA_OFFSET -4
+ /*CFI_RESTORE fs;*/
+ POP_GS \pop
+.pushsection .fixup, "ax"
+4: movl $0, (%esp)
+ jmp 1b
+5: movl $0, (%esp)
+ jmp 2b
+6: movl $0, (%esp)
+ jmp 3b
+.section __ex_table, "a"
+ .align 4
+ .long 1b, 4b
+ .long 2b, 5b
+ .long 3b, 6b
.popsection
+ POP_GS_EX
+.endm
-#define RING0_INT_FRAME \
- CFI_STARTPROC simple;\
- CFI_SIGNAL_FRAME;\
- CFI_DEF_CFA esp, 3*4;\
- /*CFI_OFFSET cs, -2*4;*/\
+.macro RING0_INT_FRAME
+ CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA esp, 3*4
+ /*CFI_OFFSET cs, -2*4;*/
CFI_OFFSET eip, -3*4
+.endm
-#define RING0_EC_FRAME \
- CFI_STARTPROC simple;\
- CFI_SIGNAL_FRAME;\
- CFI_DEF_CFA esp, 4*4;\
- /*CFI_OFFSET cs, -2*4;*/\
+.macro RING0_EC_FRAME
+ CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA esp, 4*4
+ /*CFI_OFFSET cs, -2*4;*/
CFI_OFFSET eip, -3*4
+.endm
-#define RING0_PTREGS_FRAME \
- CFI_STARTPROC simple;\
- CFI_SIGNAL_FRAME;\
- CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\
- /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\
- CFI_OFFSET eip, PT_EIP-PT_OLDESP;\
- /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\
- /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\
- CFI_OFFSET eax, PT_EAX-PT_OLDESP;\
- CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\
- CFI_OFFSET edi, PT_EDI-PT_OLDESP;\
- CFI_OFFSET esi, PT_ESI-PT_OLDESP;\
- CFI_OFFSET edx, PT_EDX-PT_OLDESP;\
- CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\
+.macro RING0_PTREGS_FRAME
+ CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA esp, PT_OLDESP-PT_EBX
+ /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/
+ CFI_OFFSET eip, PT_EIP-PT_OLDESP
+ /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/
+ /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/
+ CFI_OFFSET eax, PT_EAX-PT_OLDESP
+ CFI_OFFSET ebp, PT_EBP-PT_OLDESP
+ CFI_OFFSET edi, PT_EDI-PT_OLDESP
+ CFI_OFFSET esi, PT_ESI-PT_OLDESP
+ CFI_OFFSET edx, PT_EDX-PT_OLDESP
+ CFI_OFFSET ecx, PT_ECX-PT_OLDESP
CFI_OFFSET ebx, PT_EBX-PT_OLDESP
+.endm
ENTRY(ret_from_fork)
CFI_STARTPROC
@@ -362,6 +463,7 @@ sysenter_exit:
xorl %ebp,%ebp
TRACE_IRQS_ON
1: mov PT_FS(%esp), %fs
+ PTGS_TO_GS
ENABLE_INTERRUPTS_SYSEXIT
#ifdef CONFIG_AUDITSYSCALL
@@ -410,6 +512,7 @@ sysexit_audit:
.align 4
.long 1b,2b
.popsection
+ PTGS_TO_GS_EX
ENDPROC(ia32_sysenter_target)
# system call handler stub
@@ -452,8 +555,7 @@ restore_all:
restore_nocheck:
TRACE_IRQS_IRET
restore_nocheck_notrace:
- RESTORE_REGS
- addl $4, %esp # skip orig_eax/error_code
+ RESTORE_REGS 4 # skip orig_eax/error_code
CFI_ADJUST_CFA_OFFSET -4
irq_return:
INTERRUPT_RETURN
@@ -595,28 +697,50 @@ syscall_badsys:
END(syscall_badsys)
CFI_ENDPROC
-#define FIXUP_ESPFIX_STACK \
- /* since we are on a wrong stack, we cant make it a C code :( */ \
- PER_CPU(gdt_page, %ebx); \
- GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
- addl %esp, %eax; \
- pushl $__KERNEL_DS; \
- CFI_ADJUST_CFA_OFFSET 4; \
- pushl %eax; \
- CFI_ADJUST_CFA_OFFSET 4; \
- lss (%esp), %esp; \
- CFI_ADJUST_CFA_OFFSET -8;
-#define UNWIND_ESPFIX_STACK \
- movl %ss, %eax; \
- /* see if on espfix stack */ \
- cmpw $__ESPFIX_SS, %ax; \
- jne 27f; \
- movl $__KERNEL_DS, %eax; \
- movl %eax, %ds; \
- movl %eax, %es; \
- /* switch to normal stack */ \
- FIXUP_ESPFIX_STACK; \
-27:;
+/*
+ * System calls that need a pt_regs pointer.
+ */
+#define PTREGSCALL(name) \
+ ALIGN; \
+ptregs_##name: \
+ leal 4(%esp),%eax; \
+ jmp sys_##name;
+
+PTREGSCALL(iopl)
+PTREGSCALL(fork)
+PTREGSCALL(clone)
+PTREGSCALL(vfork)
+PTREGSCALL(execve)
+PTREGSCALL(sigaltstack)
+PTREGSCALL(sigreturn)
+PTREGSCALL(rt_sigreturn)
+PTREGSCALL(vm86)
+PTREGSCALL(vm86old)
+
+.macro FIXUP_ESPFIX_STACK
+ /* since we are on a wrong stack, we cant make it a C code :( */
+ PER_CPU(gdt_page, %ebx)
+ GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
+ addl %esp, %eax
+ pushl $__KERNEL_DS
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ lss (%esp), %esp
+ CFI_ADJUST_CFA_OFFSET -8
+.endm
+.macro UNWIND_ESPFIX_STACK
+ movl %ss, %eax
+ /* see if on espfix stack */
+ cmpw $__ESPFIX_SS, %ax
+ jne 27f
+ movl $__KERNEL_DS, %eax
+ movl %eax, %ds
+ movl %eax, %es
+ /* switch to normal stack */
+ FIXUP_ESPFIX_STACK
+27:
+.endm
/*
* Build the entry stubs and pointer table with some assembler magic.
@@ -1070,7 +1194,10 @@ ENTRY(page_fault)
CFI_ADJUST_CFA_OFFSET 4
ALIGN
error_code:
- /* the function address is in %fs's slot on the stack */
+ /* the function address is in %gs's slot on the stack */
+ pushl %fs
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET fs, 0*/
pushl %es
CFI_ADJUST_CFA_OFFSET 4
/*CFI_REL_OFFSET es, 0*/
@@ -1099,20 +1226,15 @@ error_code:
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET ebx, 0
cld
- pushl %fs
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET fs, 0*/
movl $(__KERNEL_PERCPU), %ecx
movl %ecx, %fs
UNWIND_ESPFIX_STACK
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
- /*CFI_REGISTER es, ecx*/
- movl PT_FS(%esp), %edi # get the function address
+ GS_TO_REG %ecx
+ movl PT_GS(%esp), %edi # get the function address
movl PT_ORIG_EAX(%esp), %edx # get the error code
movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
- mov %ecx, PT_FS(%esp)
- /*CFI_REL_OFFSET fs, ES*/
+ REG_TO_PTGS %ecx
+ SET_KERNEL_GS %ecx
movl $(__USER_DS), %ecx
movl %ecx, %ds
movl %ecx, %es
@@ -1136,26 +1258,27 @@ END(page_fault)
* by hand onto the new stack - while updating the return eip past
* the instruction that would have done it for sysenter.
*/
-#define FIX_STACK(offset, ok, label) \
- cmpw $__KERNEL_CS,4(%esp); \
- jne ok; \
-label: \
- movl TSS_sysenter_sp0+offset(%esp),%esp; \
- CFI_DEF_CFA esp, 0; \
- CFI_UNDEFINED eip; \
- pushfl; \
- CFI_ADJUST_CFA_OFFSET 4; \
- pushl $__KERNEL_CS; \
- CFI_ADJUST_CFA_OFFSET 4; \
- pushl $sysenter_past_esp; \
- CFI_ADJUST_CFA_OFFSET 4; \
+.macro FIX_STACK offset ok label
+ cmpw $__KERNEL_CS, 4(%esp)
+ jne \ok
+\label:
+ movl TSS_sysenter_sp0 + \offset(%esp), %esp
+ CFI_DEF_CFA esp, 0
+ CFI_UNDEFINED eip
+ pushfl
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $__KERNEL_CS
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $sysenter_past_esp
+ CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET eip, 0
+.endm
ENTRY(debug)
RING0_INT_FRAME
cmpl $ia32_sysenter_target,(%esp)
jne debug_stack_correct
- FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
+ FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
debug_stack_correct:
pushl $-1 # mark this as an int
CFI_ADJUST_CFA_OFFSET 4
@@ -1213,7 +1336,7 @@ nmi_stack_correct:
nmi_stack_fixup:
RING0_INT_FRAME
- FIX_STACK(12,nmi_stack_correct, 1)
+ FIX_STACK 12, nmi_stack_correct, 1
jmp nmi_stack_correct
nmi_debug_stack_check:
@@ -1224,7 +1347,7 @@ nmi_debug_stack_check:
jb nmi_stack_correct
cmpl $debug_esp_fix_insn,(%esp)
ja nmi_stack_correct
- FIX_STACK(24,nmi_stack_correct, 1)
+ FIX_STACK 24, nmi_stack_correct, 1
jmp nmi_stack_correct
nmi_espfix_stack:
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 722464c520c..2a0aad7718d 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -19,6 +19,7 @@
#include <asm/asm-offsets.h>
#include <asm/setup.h>
#include <asm/processor-flags.h>
+#include <asm/percpu.h>
/* Physical address */
#define pa(X) ((X) - __PAGE_OFFSET)
@@ -437,8 +438,26 @@ is386: movl $2,%ecx # set MP
movl $(__KERNEL_PERCPU), %eax
movl %eax,%fs # set this cpu's percpu
- xorl %eax,%eax # Clear GS and LDT
+#ifdef CONFIG_CC_STACKPROTECTOR
+ /*
+ * The linker can't handle this by relocation. Manually set
+ * base address in stack canary segment descriptor.
+ */
+ cmpb $0,ready
+ jne 1f
+ movl $per_cpu__gdt_page,%eax
+ movl $per_cpu__stack_canary,%ecx
+ subl $20, %ecx
+ movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
+ shrl $16, %ecx
+ movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
+ movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)
+1:
+#endif
+ movl $(__KERNEL_STACK_CANARY),%eax
movl %eax,%gs
+
+ xorl %eax,%eax # Clear LDT
lldt %ax
cld # gcc2 wants the direction flag cleared at all times
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index a0a2b5ca9b7..2e648e3a5ea 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -205,19 +205,6 @@ ENTRY(secondary_startup_64)
pushq $0
popfq
-#ifdef CONFIG_SMP
- /*
- * Fix up static pointers that need __per_cpu_load added. The assembler
- * is unable to do this directly. This is only needed for the boot cpu.
- * These values are set up with the correct base addresses by C code for
- * secondary cpus.
- */
- movq initial_gs(%rip), %rax
- cmpl $0, per_cpu__cpu_number(%rax)
- jne 1f
- addq %rax, early_gdt_descr_base(%rip)
-1:
-#endif
/*
* We must switch to a new descriptor in kernel space for the GDT
* because soon the kernel won't have access anymore to the userspace
@@ -275,11 +262,7 @@ ENTRY(secondary_startup_64)
ENTRY(initial_code)
.quad x86_64_start_kernel
ENTRY(initial_gs)
-#ifdef CONFIG_SMP
- .quad __per_cpu_load
-#else
- .quad PER_CPU_VAR(irq_stack_union)
-#endif
+ .quad INIT_PER_CPU_VAR(irq_stack_union)
__FINITDATA
ENTRY(stack_start)
@@ -425,7 +408,7 @@ NEXT_PAGE(level2_spare_pgt)
early_gdt_descr:
.word GDT_ENTRIES*8-1
early_gdt_descr_base:
- .quad per_cpu__gdt_page
+ .quad INIT_PER_CPU_VAR(gdt_page)
ENTRY(phys_base)
/* This must match the first entry in level2_kernel_pgt */
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index b12208f4dfe..e41980a373a 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -131,9 +131,8 @@ static int do_iopl(unsigned int level, struct pt_regs *regs)
}
#ifdef CONFIG_X86_32
-asmlinkage long sys_iopl(unsigned long regsp)
+long sys_iopl(struct pt_regs *regs)
{
- struct pt_regs *regs = (struct pt_regs *)&regsp;
unsigned int level = regs->bx;
struct thread_struct *t = &current->thread;
int rc;
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 1a1ae8edc40..fec79ad85dc 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -11,6 +11,7 @@
#include <stdarg.h>
+#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
@@ -91,6 +92,15 @@ void cpu_idle(void)
{
int cpu = smp_processor_id();
+ /*
+ * If we're the non-boot CPU, nothing set the stack canary up
+ * for us. CPU0 already has it initialized but no harm in
+ * doing it again. This is a good place for updating it, as
+ * we wont ever return from this function (so the invalid
+ * canaries already on the stack wont ever trigger).
+ */
+ boot_init_stack_canary();
+
current_thread_info()->status |= TS_POLLING;
/* endless idle loop with no priority at all */
@@ -131,7 +141,7 @@ void __show_regs(struct pt_regs *regs, int all)
if (user_mode_vm(regs)) {
sp = regs->sp;
ss = regs->ss & 0xffff;
- savesegment(gs, gs);
+ gs = get_user_gs(regs);
} else {
sp = (unsigned long) (&regs->sp);
savesegment(ss, ss);
@@ -212,6 +222,7 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
regs.ds = __USER_DS;
regs.es = __USER_DS;
regs.fs = __KERNEL_PERCPU;
+ regs.gs = __KERNEL_STACK_CANARY;
regs.orig_ax = -1;
regs.ip = (unsigned long) kernel_thread_helper;
regs.cs = __KERNEL_CS | get_kernel_rpl();
@@ -304,7 +315,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
p->thread.ip = (unsigned long) ret_from_fork;
- savesegment(gs, p->thread.gs);
+ task_user_gs(p) = get_user_gs(regs);
tsk = current;
if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
@@ -342,7 +353,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
- __asm__("movl %0, %%gs" : : "r"(0));
+ set_user_gs(regs, 0);
regs->fs = 0;
set_fs(USER_DS);
regs->ds = __USER_DS;
@@ -539,7 +550,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* used %fs or %gs (it does not today), or if the kernel is
* running inside of a hypervisor layer.
*/
- savesegment(gs, prev->gs);
+ lazy_save_gs(prev->gs);
/*
* Load the per-thread Thread-Local Storage descriptor.
@@ -585,31 +596,31 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* Restore %gs if needed (which is common)
*/
if (prev->gs | next->gs)
- loadsegment(gs, next->gs);
+ lazy_load_gs(next->gs);
percpu_write(current_task, next_p);
return prev_p;
}
-asmlinkage int sys_fork(struct pt_regs regs)
+int sys_fork(struct pt_regs *regs)
{
- return do_fork(SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
+ return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
}
-asmlinkage int sys_clone(struct pt_regs regs)
+int sys_clone(struct pt_regs *regs)
{
unsigned long clone_flags;
unsigned long newsp;
int __user *parent_tidptr, *child_tidptr;
- clone_flags = regs.bx;
- newsp = regs.cx;
- parent_tidptr = (int __user *)regs.dx;
- child_tidptr = (int __user *)regs.di;
+ clone_flags = regs->bx;
+ newsp = regs->cx;
+ parent_tidptr = (int __user *)regs->dx;
+ child_tidptr = (int __user *)regs->di;
if (!newsp)
- newsp = regs.sp;
- return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
+ newsp = regs->sp;
+ return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr);
}
/*
@@ -622,27 +633,27 @@ asmlinkage int sys_clone(struct pt_regs regs)
* do not have enough call-clobbered registers to hold all
* the information you need.
*/
-asmlinkage int sys_vfork(struct pt_regs regs)
+int sys_vfork(struct pt_regs *regs)
{
- return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
+ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, NULL, NULL);
}
/*
* sys_execve() executes a new program.
*/
-asmlinkage int sys_execve(struct pt_regs regs)
+int sys_execve(struct pt_regs *regs)
{
int error;
char *filename;
- filename = getname((char __user *) regs.bx);
+ filename = getname((char __user *) regs->bx);
error = PTR_ERR(filename);
if (IS_ERR(filename))
goto out;
error = do_execve(filename,
- (char __user * __user *) regs.cx,
- (char __user * __user *) regs.dx,
- &regs);
+ (char __user * __user *) regs->cx,
+ (char __user * __user *) regs->dx,
+ regs);
if (error == 0) {
/* Make sure we don't return using sysenter.. */
set_thread_flag(TIF_IRET);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 8eb169e4558..836ef6575f0 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -120,12 +120,11 @@ void cpu_idle(void)
current_thread_info()->status |= TS_POLLING;
/*
- * If we're the non-boot CPU, nothing set the PDA stack
- * canary up for us - and if we are the boot CPU we have
- * a 0 stack canary. This is a good place for updating
- * it, as we wont ever return from this function (so the
- * invalid canaries already on the stack wont ever
- * trigger):
+ * If we're the non-boot CPU, nothing set the stack canary up
+ * for us. CPU0 already has it initialized but no harm in
+ * doing it again. This is a good place for updating it, as
+ * we wont ever return from this function (so the invalid
+ * canaries already on the stack wont ever trigger).
*/
boot_init_stack_canary();
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 0a5df5f82fb..7ec39ab37a2 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -75,10 +75,7 @@ static inline bool invalid_selector(u16 value)
static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
{
BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
- regno >>= 2;
- if (regno > FS)
- --regno;
- return &regs->bx + regno;
+ return &regs->bx + (regno >> 2);
}
static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
@@ -90,9 +87,10 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
if (offset != offsetof(struct user_regs_struct, gs))
retval = *pt_regs_access(task_pt_regs(task), offset);
else {
- retval = task->thread.gs;
if (task == current)
- savesegment(gs, retval);
+ retval = get_user_gs(task_pt_regs(task));
+ else
+ retval = task_user_gs(task);
}
return retval;
}
@@ -126,13 +124,10 @@ static int set_segment_reg(struct task_struct *task,
break;
case offsetof(struct user_regs_struct, gs):
- task->thread.gs = value;
if (task == current)
- /*
- * The user-mode %gs is not affected by
- * kernel entry, so we must update the CPU.
- */
- loadsegment(gs, value);
+ set_user_gs(task_pt_regs(task), value);
+ else
+ task_user_gs(task) = value;
}
return 0;
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ef91747bbed..d992e6cff73 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -16,6 +16,7 @@
#include <asm/proto.h>
#include <asm/cpumask.h>
#include <asm/cpu.h>
+#include <asm/stackprotector.h>
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
# define DBG(x...) printk(KERN_DEBUG x)
@@ -95,6 +96,7 @@ void __init setup_per_cpu_areas(void)
per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
per_cpu(cpu_number, cpu) = cpu;
setup_percpu_segment(cpu);
+ setup_stack_canary_segment(cpu);
/*
* Copy data used in early init routines from the
* initial arrays to the per cpu data areas. These
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 7fc78b01981..7cdcd16885e 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -50,27 +50,23 @@
# define FIX_EFLAGS __FIX_EFLAGS
#endif
-#define COPY(x) { \
- get_user_ex(regs->x, &sc->x); \
-}
+#define COPY(x) do { \
+ get_user_ex(regs->x, &sc->x); \
+} while (0)
-#define COPY_SEG(seg) { \
- unsigned short tmp; \
- get_user_ex(tmp, &sc->seg); \
- regs->seg = tmp; \
-}
+#define GET_SEG(seg) ({ \
+ unsigned short tmp; \
+ get_user_ex(tmp, &sc->seg); \
+ tmp; \
+})
-#define COPY_SEG_CPL3(seg) { \
- unsigned short tmp; \
- get_user_ex(tmp, &sc->seg); \
- regs->seg = tmp | 3; \
-}
+#define COPY_SEG(seg) do { \
+ regs->seg = GET_SEG(seg); \
+} while (0)
-#define GET_SEG(seg) { \
- unsigned short tmp; \
- get_user_ex(tmp, &sc->seg); \
- loadsegment(seg, tmp); \
-}
+#define COPY_SEG_CPL3(seg) do { \
+ regs->seg = GET_SEG(seg) | 3; \
+} while (0)
static int
restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
@@ -86,7 +82,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
get_user_try {
#ifdef CONFIG_X86_32
- GET_SEG(gs);
+ set_user_gs(regs, GET_SEG(gs));
COPY_SEG(fs);
COPY_SEG(es);
COPY_SEG(ds);
@@ -138,12 +134,7 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
put_user_try {
#ifdef CONFIG_X86_32
- {
- unsigned int tmp;
-
- savesegment(gs, tmp);
- put_user_ex(tmp, (unsigned int __user *)&sc->gs);
- }
+ put_user_ex(get_user_gs(regs), (unsigned int __user *)&sc->gs);
put_user_ex(regs->fs, (unsigned int __user *)&sc->fs);
put_user_ex(regs->es, (unsigned int __user *)&sc->es);
put_user_ex(regs->ds, (unsigned int __user *)&sc->ds);
@@ -558,14 +549,9 @@ sys_sigaction(int sig, const struct old_sigaction __user *act,
#endif /* CONFIG_X86_32 */
#ifdef CONFIG_X86_32
-asmlinkage int sys_sigaltstack(unsigned long bx)
+int sys_sigaltstack(struct pt_regs *regs)
{
- /*
- * This is needed to make gcc realize it doesn't own the
- * "struct pt_regs"
- */
- struct pt_regs *regs = (struct pt_regs *)&bx;
- const stack_t __user *uss = (const stack_t __user *)bx;
+ const stack_t __user *uss = (const stack_t __user *)regs->bx;
stack_t __user *uoss = (stack_t __user *)regs->cx;
return do_sigaltstack(uss, uoss, regs->sp);
@@ -583,14 +569,12 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
* Do a signal return; undo the signal stack.
*/
#ifdef CONFIG_X86_32
-asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
+unsigned long sys_sigreturn(struct pt_regs *regs)
{
struct sigframe __user *frame;
- struct pt_regs *regs;
unsigned long ax;
sigset_t set;
- regs = (struct pt_regs *) &__unused;
frame = (struct sigframe __user *)(regs->sp - 8);
if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
@@ -617,7 +601,7 @@ badframe:
}
#endif /* CONFIG_X86_32 */
-static long do_rt_sigreturn(struct pt_regs *regs)
+long sys_rt_sigreturn(struct pt_regs *regs)
{
struct rt_sigframe __user *frame;
unsigned long ax;
@@ -648,25 +632,6 @@ badframe:
return 0;
}
-#ifdef CONFIG_X86_32
-/*
- * Note: do not pass in pt_regs directly as with tail-call optimization
- * GCC will incorrectly stomp on the caller's frame and corrupt user-space
- * register state:
- */
-asmlinkage int sys_rt_sigreturn(unsigned long __unused)
-{
- struct pt_regs *regs = (struct pt_regs *)&__unused;
-
- return do_rt_sigreturn(regs);
-}
-#else /* !CONFIG_X86_32 */
-asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
-{
- return do_rt_sigreturn(regs);
-}
-#endif /* CONFIG_X86_32 */
-
/*
* OK, we're invoking a handler:
*/
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index e2e86a08f31..3bdb64829b8 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -1,7 +1,7 @@
ENTRY(sys_call_table)
.long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */
.long sys_exit
- .long sys_fork
+ .long ptregs_fork
.long sys_read
.long sys_write
.long sys_open /* 5 */
@@ -10,7 +10,7 @@ ENTRY(sys_call_table)
.long sys_creat
.long sys_link
.long sys_unlink /* 10 */
- .long sys_execve
+ .long ptregs_execve
.long sys_chdir
.long sys_time
.long sys_mknod
@@ -109,17 +109,17 @@ ENTRY(sys_call_table)
.long sys_newlstat
.long sys_newfstat
.long sys_uname
- .long sys_iopl /* 110 */
+ .long ptregs_iopl /* 110 */
.long sys_vhangup
.long sys_ni_syscall /* old "idle" system call */
- .long sys_vm86old
+ .long ptregs_vm86old
.long sys_wait4
.long sys_swapoff /* 115 */
.long sys_sysinfo
.long sys_ipc
.long sys_fsync
- .long sys_sigreturn
- .long sys_clone /* 120 */
+ .long ptregs_sigreturn
+ .long ptregs_clone /* 120 */
.long sys_setdomainname
.long sys_newuname
.long sys_modify_ldt
@@ -165,14 +165,14 @@ ENTRY(sys_call_table)
.long sys_mremap
.long sys_setresuid16
.long sys_getresuid16 /* 165 */
- .long sys_vm86
+ .long ptregs_vm86
.long sys_ni_syscall /* Old sys_query_module */
.long sys_poll
.long sys_nfsservctl
.long sys_setresgid16 /* 170 */
.long sys_getresgid16
.long sys_prctl
- .long sys_rt_sigreturn
+ .long ptregs_rt_sigreturn
.long sys_rt_sigaction
.long sys_rt_sigprocmask /* 175 */
.long sys_rt_sigpending
@@ -185,11 +185,11 @@ ENTRY(sys_call_table)
.long sys_getcwd
.long sys_capget
.long sys_capset /* 185 */
- .long sys_sigaltstack
+ .long ptregs_sigaltstack
.long sys_sendfile
.long sys_ni_syscall /* reserved for streams1 */
.long sys_ni_syscall /* reserved for streams2 */
- .long sys_vfork /* 190 */
+ .long ptregs_vfork /* 190 */
.long sys_getrlimit
.long sys_mmap2
.long sys_truncate64
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 0d032d2d8a1..bde57f0f161 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -905,19 +905,20 @@ void math_emulate(struct math_emu_info *info)
}
#endif /* CONFIG_MATH_EMULATION */
-dotraplinkage void __kprobes do_device_not_available(struct pt_regs regs)
+dotraplinkage void __kprobes
+do_device_not_available(struct pt_regs *regs, long error_code)
{
#ifdef CONFIG_X86_32
if (read_cr0() & X86_CR0_EM) {
struct math_emu_info info = { };
- conditional_sti(&regs);
+ conditional_sti(regs);
- info.regs = &regs;
+ info.regs = regs;
math_emulate(&info);
} else {
math_state_restore(); /* interrupts still off */
- conditional_sti(&regs);
+ conditional_sti(regs);
}
#else
math_state_restore();
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 4eeb5cf9720..d7ac84e7fc1 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -158,7 +158,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
ret = KVM86->regs32;
ret->fs = current->thread.saved_fs;
- loadsegment(gs, current->thread.saved_gs);
+ set_user_gs(ret, current->thread.saved_gs);
return ret;
}
@@ -197,9 +197,9 @@ out:
static int do_vm86_irq_handling(int subfunction, int irqnumber);
static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
-asmlinkage int sys_vm86old(struct pt_regs regs)
+int sys_vm86old(struct pt_regs *regs)
{
- struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.bx;
+ struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs->bx;
struct kernel_vm86_struct info; /* declare this _on top_,
* this avoids wasting of stack space.
* This remains on the stack until we
@@ -218,7 +218,7 @@ asmlinkage int sys_vm86old(struct pt_regs regs)
if (tmp)
goto out;
memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus);
- info.regs32 = &regs;
+ info.regs32 = regs;
tsk->thread.vm86_info = v86;
do_sys_vm86(&info, tsk);
ret = 0; /* we never return here */
@@ -227,7 +227,7 @@ out:
}
-asmlinkage int sys_vm86(struct pt_regs regs)
+int sys_vm86(struct pt_regs *regs)
{
struct kernel_vm86_struct info; /* declare this _on top_,
* this avoids wasting of stack space.
@@ -239,12 +239,12 @@ asmlinkage int sys_vm86(struct pt_regs regs)
struct vm86plus_struct __user *v86;
tsk = current;
- switch (regs.bx) {
+ switch (regs->bx) {
case VM86_REQUEST_IRQ:
case VM86_FREE_IRQ:
case VM86_GET_IRQ_BITS:
case VM86_GET_AND_RESET_IRQ:
- ret = do_vm86_irq_handling(regs.bx, (int)regs.cx);
+ ret = do_vm86_irq_handling(regs->bx, (int)regs->cx);
goto out;
case VM86_PLUS_INSTALL_CHECK:
/*
@@ -261,14 +261,14 @@ asmlinkage int sys_vm86(struct pt_regs regs)
ret = -EPERM;
if (tsk->thread.saved_sp0)
goto out;
- v86 = (struct vm86plus_struct __user *)regs.cx;
+ v86 = (struct vm86plus_struct __user *)regs->cx;
tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
offsetof(struct kernel_vm86_struct, regs32) -
sizeof(info.regs));
ret = -EFAULT;
if (tmp)
goto out;
- info.regs32 = &regs;
+ info.regs32 = regs;
info.vm86plus.is_vm86pus = 1;
tsk->thread.vm86_info = (struct vm86_struct __user *)v86;
do_sys_vm86(&info, tsk);
@@ -323,7 +323,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
info->regs32->ax = 0;
tsk->thread.saved_sp0 = tsk->thread.sp0;
tsk->thread.saved_fs = info->regs32->fs;
- savesegment(gs, tsk->thread.saved_gs);
+ tsk->thread.saved_gs = get_user_gs(info->regs32);
tss = &per_cpu(init_tss, get_cpu());
tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 07f62d287ff..087a7f2c639 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -257,6 +257,14 @@ SECTIONS
DWARF_DEBUG
}
+ /*
+ * Per-cpu symbols which need to be offset from __per_cpu_load
+ * for the boot processor.
+ */
+#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
+INIT_PER_CPU(gdt_page);
+INIT_PER_CPU(irq_stack_union);
+
/*
* Build-time check on the image size:
*/
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 19e33b6cd59..da2e314f61b 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -283,7 +283,7 @@ static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
/* There's one problem which normal hardware doesn't have: the Host
* can't handle us removing entries we're currently using. So we clear
* the GS register here: if it's needed it'll be reloaded anyway. */
- loadsegment(gs, 0);
+ lazy_load_gs(0);
lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0);
}
diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c
index 420b3b6e391..6ef5e99380f 100644
--- a/arch/x86/math-emu/get_address.c
+++ b/arch/x86/math-emu/get_address.c
@@ -150,11 +150,9 @@ static long pm_address(u_char FPU_modrm, u_char segment,
#endif /* PARANOID */
switch (segment) {
- /* gs isn't used by the kernel, so it still has its
- user-space value. */
case PREFIX_GS_ - 1:
- /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */
- savesegment(gs, addr->selector);
+ /* user gs handling can be lazy, use special accessors */
+ addr->selector = get_user_gs(FPU_info->regs);
break;
default:
addr->selector = PM_REG_(segment);
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 08d140fbc31..deb1c1ab786 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -702,7 +702,7 @@ void __cpuinit numa_set_node(int cpu, int node)
}
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
- if (cpu >= nr_cpu_ids || !per_cpu_offset(cpu)) {
+ if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
dump_stack();
return;
@@ -790,7 +790,7 @@ int early_cpu_to_node(int cpu)
if (early_per_cpu_ptr(x86_cpu_to_node_map))
return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
- if (!per_cpu_offset(cpu)) {
+ if (!cpu_possible(cpu)) {
printk(KERN_WARNING
"early_cpu_to_node(%d): no per_cpu area!\n", cpu);
dump_stack();
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 4d6ef0a336d..16a9020c8f1 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -38,7 +38,7 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE
$(call if_changed,objcopy)
CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
- $(filter -g%,$(KBUILD_CFLAGS))
+ $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector)
$(vobjs): KBUILD_CFLAGS += $(CFL)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 37230342c2c..95ff6a0e942 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -323,13 +323,14 @@ static void load_TLS_descriptor(struct thread_struct *t,
static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
{
/*
- * XXX sleazy hack: If we're being called in a lazy-cpu zone,
- * it means we're in a context switch, and %gs has just been
- * saved. This means we can zero it out to prevent faults on
- * exit from the hypervisor if the next process has no %gs.
- * Either way, it has been saved, and the new value will get
- * loaded properly. This will go away as soon as Xen has been
- * modified to not save/restore %gs for normal hypercalls.
+ * XXX sleazy hack: If we're being called in a lazy-cpu zone
+ * and lazy gs handling is enabled, it means we're in a
+ * context switch, and %gs has just been saved. This means we
+ * can zero it out to prevent faults on exit from the
+ * hypervisor if the next process has no %gs. Either way, it
+ * has been saved, and the new value will get loaded properly.
+ * This will go away as soon as Xen has been modified to not
+ * save/restore %gs for normal hypercalls.
*
* On x86_64, this hack is not used for %gs, because gs points
* to KERNEL_GS_BASE (and uses it for PDA references), so we
@@ -341,7 +342,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
*/
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
#ifdef CONFIG_X86_32
- loadsegment(gs, 0);
+ lazy_load_gs(0);
#else
loadsegment(fs, 0);
#endif
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 4c6f9679913..79d7362ad6d 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -1,14 +1,14 @@
/*
- Asm versions of Xen pv-ops, suitable for either direct use or inlining.
- The inline versions are the same as the direct-use versions, with the
- pre- and post-amble chopped off.
-
- This code is encoded for size rather than absolute efficiency,
- with a view to being able to inline as much as possible.
-
- We only bother with direct forms (ie, vcpu in percpu data) of
- the operations here; the indirect forms are better handled in
- C, since they're generally too large to inline anyway.
+ * Asm versions of Xen pv-ops, suitable for either direct use or
+ * inlining. The inline versions are the same as the direct-use
+ * versions, with the pre- and post-amble chopped off.
+ *
+ * This code is encoded for size rather than absolute efficiency, with
+ * a view to being able to inline as much as possible.
+ *
+ * We only bother with direct forms (ie, vcpu in percpu data) of the
+ * operations here; the indirect forms are better handled in C, since
+ * they're generally too large to inline anyway.
*/
#include <asm/asm-offsets.h>
@@ -18,17 +18,19 @@
#include "xen-asm.h"
/*
- Enable events. This clears the event mask and tests the pending
- event status with one and operation. If there are pending
- events, then enter the hypervisor to get them handled.
+ * Enable events. This clears the event mask and tests the pending
+ * event status with one and operation. If there are pending events,
+ * then enter the hypervisor to get them handled.
*/
ENTRY(xen_irq_enable_direct)
/* Unmask events */
movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
- /* Preempt here doesn't matter because that will deal with
- any pending interrupts. The pending check may end up being
- run on the wrong CPU, but that doesn't hurt. */
+ /*
+ * Preempt here doesn't matter because that will deal with any
+ * pending interrupts. The pending check may end up being run
+ * on the wrong CPU, but that doesn't hurt.
+ */
/* Test for pending */
testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
@@ -43,8 +45,8 @@ ENDPATCH(xen_irq_enable_direct)
/*
- Disabling events is simply a matter of making the event mask
- non-zero.
+ * Disabling events is simply a matter of making the event mask
+ * non-zero.
*/
ENTRY(xen_irq_disable_direct)
movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
@@ -54,18 +56,18 @@ ENDPATCH(xen_irq_disable_direct)
RELOC(xen_irq_disable_direct, 0)
/*
- (xen_)save_fl is used to get the current interrupt enable status.
- Callers expect the status to be in X86_EFLAGS_IF, and other bits
- may be set in the return value. We take advantage of this by
- making sure that X86_EFLAGS_IF has the right value (and other bits
- in that byte are 0), but other bits in the return value are
- undefined. We need to toggle the state of the bit, because
- Xen and x86 use opposite senses (mask vs enable).
+ * (xen_)save_fl is used to get the current interrupt enable status.
+ * Callers expect the status to be in X86_EFLAGS_IF, and other bits
+ * may be set in the return value. We take advantage of this by
+ * making sure that X86_EFLAGS_IF has the right value (and other bits
+ * in that byte are 0), but other bits in the return value are
+ * undefined. We need to toggle the state of the bit, because Xen and
+ * x86 use opposite senses (mask vs enable).
*/
ENTRY(xen_save_fl_direct)
testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
setz %ah
- addb %ah,%ah
+ addb %ah, %ah
ENDPATCH(xen_save_fl_direct)
ret
ENDPROC(xen_save_fl_direct)
@@ -73,12 +75,11 @@ ENDPATCH(xen_save_fl_direct)
/*
- In principle the caller should be passing us a value return
- from xen_save_fl_direct, but for robustness sake we test only
- the X86_EFLAGS_IF flag rather than the whole byte. After
- setting the interrupt mask state, it checks for unmasked
- pending events and enters the hypervisor to get them delivered
- if so.
+ * In principle the caller should be passing us a value return from
+ * xen_save_fl_direct, but for robustness sake we test only the
+ * X86_EFLAGS_IF flag rather than the whole byte. After setting the
+ * interrupt mask state, it checks for unmasked pending events and
+ * enters the hypervisor to get them delivered if so.
*/
ENTRY(xen_restore_fl_direct)
#ifdef CONFIG_X86_64
@@ -87,9 +88,11 @@ ENTRY(xen_restore_fl_direct)
testb $X86_EFLAGS_IF>>8, %ah
#endif
setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
- /* Preempt here doesn't matter because that will deal with
- any pending interrupts. The pending check may end up being
- run on the wrong CPU, but that doesn't hurt. */
+ /*
+ * Preempt here doesn't matter because that will deal with any
+ * pending interrupts. The pending check may end up being run
+ * on the wrong CPU, but that doesn't hurt.
+ */
/* check for unmasked and pending */
cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
@@ -103,8 +106,8 @@ ENDPATCH(xen_restore_fl_direct)
/*
- Force an event check by making a hypercall,
- but preserve regs before making the call.
+ * Force an event check by making a hypercall, but preserve regs
+ * before making the call.
*/
check_events:
#ifdef CONFIG_X86_32
@@ -137,4 +140,3 @@ check_events:
pop %rax
#endif
ret
-
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 082d173caaf..88e15deb8b8 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -1,17 +1,16 @@
/*
- Asm versions of Xen pv-ops, suitable for either direct use or inlining.
- The inline versions are the same as the direct-use versions, with the
- pre- and post-amble chopped off.
-
- This code is encoded for size rather than absolute efficiency,
- with a view to being able to inline as much as possible.
-
- We only bother with direct forms (ie, vcpu in pda) of the operations
- here; the indirect forms are better handled in C, since they're
- generally too large to inline anyway.
+ * Asm versions of Xen pv-ops, suitable for either direct use or
+ * inlining. The inline versions are the same as the direct-use
+ * versions, with the pre- and post-amble chopped off.
+ *
+ * This code is encoded for size rather than absolute efficiency, with
+ * a view to being able to inline as much as possible.
+ *
+ * We only bother with direct forms (ie, vcpu in pda) of the
+ * operations here; the indirect forms are better handled in C, since
+ * they're generally too large to inline anyway.
*/
-//#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/processor-flags.h>
#include <asm/segment.h>
@@ -21,8 +20,8 @@
#include "xen-asm.h"
/*
- Force an event check by making a hypercall,
- but preserve regs before making the call.
+ * Force an event check by making a hypercall, but preserve regs
+ * before making the call.
*/
check_events:
push %eax
@@ -35,10 +34,10 @@ check_events:
ret
/*
- We can't use sysexit directly, because we're not running in ring0.
- But we can easily fake it up using iret. Assuming xen_sysexit
- is jumped to with a standard stack frame, we can just strip it
- back to a standard iret frame and use iret.
+ * We can't use sysexit directly, because we're not running in ring0.
+ * But we can easily fake it up using iret. Assuming xen_sysexit is
+ * jumped to with a standard stack frame, we can just strip it back to
+ * a standard iret frame and use iret.
*/
ENTRY(xen_sysexit)
movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */
@@ -49,33 +48,31 @@ ENTRY(xen_sysexit)
ENDPROC(xen_sysexit)
/*
- This is run where a normal iret would be run, with the same stack setup:
- 8: eflags
- 4: cs
- esp-> 0: eip
-
- This attempts to make sure that any pending events are dealt
- with on return to usermode, but there is a small window in
- which an event can happen just before entering usermode. If
- the nested interrupt ends up setting one of the TIF_WORK_MASK
- pending work flags, they will not be tested again before
- returning to usermode. This means that a process can end up
- with pending work, which will be unprocessed until the process
- enters and leaves the kernel again, which could be an
- unbounded amount of time. This means that a pending signal or
- reschedule event could be indefinitely delayed.
-
- The fix is to notice a nested interrupt in the critical
- window, and if one occurs, then fold the nested interrupt into
- the current interrupt stack frame, and re-process it
- iteratively rather than recursively. This means that it will
- exit via the normal path, and all pending work will be dealt
- with appropriately.
-
- Because the nested interrupt handler needs to deal with the
- current stack state in whatever form its in, we keep things
- simple by only using a single register which is pushed/popped
- on the stack.
+ * This is run where a normal iret would be run, with the same stack setup:
+ * 8: eflags
+ * 4: cs
+ * esp-> 0: eip
+ *
+ * This attempts to make sure that any pending events are dealt with
+ * on return to usermode, but there is a small window in which an
+ * event can happen just before entering usermode. If the nested
+ * interrupt ends up setting one of the TIF_WORK_MASK pending work
+ * flags, they will not be tested again before returning to
+ * usermode. This means that a process can end up with pending work,
+ * which will be unprocessed until the process enters and leaves the
+ * kernel again, which could be an unbounded amount of time. This
+ * means that a pending signal or reschedule event could be
+ * indefinitely delayed.
+ *
+ * The fix is to notice a nested interrupt in the critical window, and
+ * if one occurs, then fold the nested interrupt into the current
+ * interrupt stack frame, and re-process it iteratively rather than
+ * recursively. This means that it will exit via the normal path, and
+ * all pending work will be dealt with appropriately.
+ *
+ * Because the nested interrupt handler needs to deal with the current
+ * stack state in whatever form its in, we keep things simple by only
+ * using a single register which is pushed/popped on the stack.
*/
ENTRY(xen_iret)
/* test eflags for special cases */
@@ -85,13 +82,15 @@ ENTRY(xen_iret)
push %eax
ESP_OFFSET=4 # bytes pushed onto stack
- /* Store vcpu_info pointer for easy access. Do it this
- way to avoid having to reload %fs */
+ /*
+ * Store vcpu_info pointer for easy access. Do it this way to
+ * avoid having to reload %fs
+ */
#ifdef CONFIG_SMP
GET_THREAD_INFO(%eax)
- movl TI_cpu(%eax),%eax
- movl __per_cpu_offset(,%eax,4),%eax
- mov per_cpu__xen_vcpu(%eax),%eax
+ movl TI_cpu(%eax), %eax
+ movl __per_cpu_offset(,%eax,4), %eax
+ mov per_cpu__xen_vcpu(%eax), %eax
#else
movl per_cpu__xen_vcpu, %eax
#endif
@@ -99,37 +98,46 @@ ENTRY(xen_iret)
/* check IF state we're restoring */
testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
- /* Maybe enable events. Once this happens we could get a
- recursive event, so the critical region starts immediately
- afterwards. However, if that happens we don't end up
- resuming the code, so we don't have to be worried about
- being preempted to another CPU. */
+ /*
+ * Maybe enable events. Once this happens we could get a
+ * recursive event, so the critical region starts immediately
+ * afterwards. However, if that happens we don't end up
+ * resuming the code, so we don't have to be worried about
+ * being preempted to another CPU.
+ */
setz XEN_vcpu_info_mask(%eax)
xen_iret_start_crit:
/* check for unmasked and pending */
cmpw $0x0001, XEN_vcpu_info_pending(%eax)
- /* If there's something pending, mask events again so we
- can jump back into xen_hypervisor_callback */
+ /*
+ * If there's something pending, mask events again so we can
+ * jump back into xen_hypervisor_callback
+ */
sete XEN_vcpu_info_mask(%eax)
popl %eax
- /* From this point on the registers are restored and the stack
- updated, so we don't need to worry about it if we're preempted */
+ /*
+ * From this point on the registers are restored and the stack
+ * updated, so we don't need to worry about it if we're
+ * preempted
+ */
iret_restore_end:
- /* Jump to hypervisor_callback after fixing up the stack.
- Events are masked, so jumping out of the critical
- region is OK. */
+ /*
+ * Jump to hypervisor_callback after fixing up the stack.
+ * Events are masked, so jumping out of the critical region is
+ * OK.
+ */
je xen_hypervisor_callback
1: iret
xen_iret_end_crit:
-.section __ex_table,"a"
+.section __ex_table, "a"
.align 4
- .long 1b,iret_exc
+ .long 1b, iret_exc
.previous
hyper_iret:
@@ -139,55 +147,55 @@ hyper_iret:
.globl xen_iret_start_crit, xen_iret_end_crit
/*
- This is called by xen_hypervisor_callback in entry.S when it sees
- that the EIP at the time of interrupt was between xen_iret_start_crit
- and xen_iret_end_crit. We're passed the EIP in %eax so we can do
- a more refined determination of what to do.
-
- The stack format at this point is:
- ----------------
- ss : (ss/esp may be present if we came from usermode)
- esp :
- eflags } outer exception info
- cs }
- eip }
- ---------------- <- edi (copy dest)
- eax : outer eax if it hasn't been restored
- ----------------
- eflags } nested exception info
- cs } (no ss/esp because we're nested
- eip } from the same ring)
- orig_eax }<- esi (copy src)
- - - - - - - - -
- fs }
- es }
- ds } SAVE_ALL state
- eax }
- : :
- ebx }<- esp
- ----------------
-
- In order to deliver the nested exception properly, we need to shift
- everything from the return addr up to the error code so it
- sits just under the outer exception info. This means that when we
- handle the exception, we do it in the context of the outer exception
- rather than starting a new one.
-
- The only caveat is that if the outer eax hasn't been
- restored yet (ie, it's still on stack), we need to insert
- its value into the SAVE_ALL state before going on, since
- it's usermode state which we eventually need to restore.
+ * This is called by xen_hypervisor_callback in entry.S when it sees
+ * that the EIP at the time of interrupt was between
+ * xen_iret_start_crit and xen_iret_end_crit. We're passed the EIP in
+ * %eax so we can do a more refined determination of what to do.
+ *
+ * The stack format at this point is:
+ * ----------------
+ * ss : (ss/esp may be present if we came from usermode)
+ * esp :
+ * eflags } outer exception info
+ * cs }
+ * eip }
+ * ---------------- <- edi (copy dest)
+ * eax : outer eax if it hasn't been restored
+ * ----------------
+ * eflags } nested exception info
+ * cs } (no ss/esp because we're nested
+ * eip } from the same ring)
+ * orig_eax }<- esi (copy src)
+ * - - - - - - - -
+ * fs }
+ * es }
+ * ds } SAVE_ALL state
+ * eax }
+ * : :
+ * ebx }<- esp
+ * ----------------
+ *
+ * In order to deliver the nested exception properly, we need to shift
+ * everything from the return addr up to the error code so it sits
+ * just under the outer exception info. This means that when we
+ * handle the exception, we do it in the context of the outer
+ * exception rather than starting a new one.
+ *
+ * The only caveat is that if the outer eax hasn't been restored yet
+ * (ie, it's still on stack), we need to insert its value into the
+ * SAVE_ALL state before going on, since it's usermode state which we
+ * eventually need to restore.
*/
ENTRY(xen_iret_crit_fixup)
/*
- Paranoia: Make sure we're really coming from kernel space.
- One could imagine a case where userspace jumps into the
- critical range address, but just before the CPU delivers a GP,
- it decides to deliver an interrupt instead. Unlikely?
- Definitely. Easy to avoid? Yes. The Intel documents
- explicitly say that the reported EIP for a bad jump is the
- jump instruction itself, not the destination, but some virtual
- environments get this wrong.
+ * Paranoia: Make sure we're really coming from kernel space.
+ * One could imagine a case where userspace jumps into the
+ * critical range address, but just before the CPU delivers a
+ * GP, it decides to deliver an interrupt instead. Unlikely?
+ * Definitely. Easy to avoid? Yes. The Intel documents
+ * explicitly say that the reported EIP for a bad jump is the
+ * jump instruction itself, not the destination, but some
+ * virtual environments get this wrong.
*/
movl PT_CS(%esp), %ecx
andl $SEGMENT_RPL_MASK, %ecx
@@ -197,15 +205,17 @@ ENTRY(xen_iret_crit_fixup)
lea PT_ORIG_EAX(%esp), %esi
lea PT_EFLAGS(%esp), %edi
- /* If eip is before iret_restore_end then stack
- hasn't been restored yet. */
+ /*
+ * If eip is before iret_restore_end then stack
+ * hasn't been restored yet.
+ */
cmp $iret_restore_end, %eax
jae 1f
- movl 0+4(%edi),%eax /* copy EAX (just above top of frame) */
+ movl 0+4(%edi), %eax /* copy EAX (just above top of frame) */
movl %eax, PT_EAX(%esp)
- lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
+ lea ESP_OFFSET(%edi), %edi /* move dest up over saved regs */
/* set up the copy */
1: std
@@ -213,6 +223,6 @@ ENTRY(xen_iret_crit_fixup)
rep movsl
cld
- lea 4(%edi),%esp /* point esp to new frame */
+ lea 4(%edi), %esp /* point esp to new frame */
2: jmp xen_do_upcall
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index d205a283efe..02f496a8dba 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -1,14 +1,14 @@
/*
- Asm versions of Xen pv-ops, suitable for either direct use or inlining.
- The inline versions are the same as the direct-use versions, with the
- pre- and post-amble chopped off.
-
- This code is encoded for size rather than absolute efficiency,
- with a view to being able to inline as much as possible.
-
- We only bother with direct forms (ie, vcpu in pda) of the operations
- here; the indirect forms are better handled in C, since they're
- generally too large to inline anyway.
+ * Asm versions of Xen pv-ops, suitable for either direct use or
+ * inlining. The inline versions are the same as the direct-use
+ * versions, with the pre- and post-amble chopped off.
+ *
+ * This code is encoded for size rather than absolute efficiency, with
+ * a view to being able to inline as much as possible.
+ *
+ * We only bother with direct forms (ie, vcpu in pda) of the
+ * operations here; the indirect forms are better handled in C, since
+ * they're generally too large to inline anyway.
*/
#include <asm/errno.h>
@@ -21,25 +21,25 @@
#include "xen-asm.h"
ENTRY(xen_adjust_exception_frame)
- mov 8+0(%rsp),%rcx
- mov 8+8(%rsp),%r11
+ mov 8+0(%rsp), %rcx
+ mov 8+8(%rsp), %r11
ret $16
hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
/*
- Xen64 iret frame:
-
- ss
- rsp
- rflags
- cs
- rip <-- standard iret frame
-
- flags
-
- rcx }
- r11 }<-- pushed by hypercall page
-rsp -> rax }
+ * Xen64 iret frame:
+ *
+ * ss
+ * rsp
+ * rflags
+ * cs
+ * rip <-- standard iret frame
+ *
+ * flags
+ *
+ * rcx }
+ * r11 }<-- pushed by hypercall page
+ * rsp->rax }
*/
ENTRY(xen_iret)
pushq $0
@@ -48,8 +48,8 @@ ENDPATCH(xen_iret)
RELOC(xen_iret, 1b+1)
/*
- sysexit is not used for 64-bit processes, so it's
- only ever used to return to 32-bit compat userspace.
+ * sysexit is not used for 64-bit processes, so it's only ever used to
+ * return to 32-bit compat userspace.
*/
ENTRY(xen_sysexit)
pushq $__USER32_DS
@@ -64,10 +64,12 @@ ENDPATCH(xen_sysexit)
RELOC(xen_sysexit, 1b+1)
ENTRY(xen_sysret64)
- /* We're already on the usermode stack at this point, but still
- with the kernel gs, so we can easily switch back */
+ /*
+ * We're already on the usermode stack at this point, but
+ * still with the kernel gs, so we can easily switch back
+ */
movq %rsp, PER_CPU_VAR(old_rsp)
- movq PER_CPU_VAR(kernel_stack),%rsp
+ movq PER_CPU_VAR(kernel_stack), %rsp
pushq $__USER_DS
pushq PER_CPU_VAR(old_rsp)
@@ -81,8 +83,10 @@ ENDPATCH(xen_sysret64)
RELOC(xen_sysret64, 1b+1)
ENTRY(xen_sysret32)
- /* We're already on the usermode stack at this point, but still
- with the kernel gs, so we can easily switch back */
+ /*
+ * We're already on the usermode stack at this point, but
+ * still with the kernel gs, so we can easily switch back
+ */
movq %rsp, PER_CPU_VAR(old_rsp)
movq PER_CPU_VAR(kernel_stack), %rsp
@@ -98,28 +102,27 @@ ENDPATCH(xen_sysret32)
RELOC(xen_sysret32, 1b+1)
/*
- Xen handles syscall callbacks much like ordinary exceptions,
- which means we have:
- - kernel gs
- - kernel rsp
- - an iret-like stack frame on the stack (including rcx and r11):
- ss
- rsp
- rflags
- cs
- rip
- r11
- rsp-> rcx
-
- In all the entrypoints, we undo all that to make it look
- like a CPU-generated syscall/sysenter and jump to the normal
- entrypoint.
+ * Xen handles syscall callbacks much like ordinary exceptions, which
+ * means we have:
+ * - kernel gs
+ * - kernel rsp
+ * - an iret-like stack frame on the stack (including rcx and r11):
+ * ss
+ * rsp
+ * rflags
+ * cs
+ * rip
+ * r11
+ * rsp->rcx
+ *
+ * In all the entrypoints, we undo all that to make it look like a
+ * CPU-generated syscall/sysenter and jump to the normal entrypoint.
*/
.macro undo_xen_syscall
- mov 0*8(%rsp),%rcx
- mov 1*8(%rsp),%r11
- mov 5*8(%rsp),%rsp
+ mov 0*8(%rsp), %rcx
+ mov 1*8(%rsp), %r11
+ mov 5*8(%rsp), %rsp
.endm
/* Normal 64-bit system call target */
@@ -146,7 +149,7 @@ ENDPROC(xen_sysenter_target)
ENTRY(xen_syscall32_target)
ENTRY(xen_sysenter_target)
- lea 16(%rsp), %rsp /* strip %rcx,%r11 */
+ lea 16(%rsp), %rsp /* strip %rcx, %r11 */
mov $-ENOSYS, %rax
pushq $VGCF_in_syscall
jmp hypercall_iret