diff options
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- | arch/x86/kernel/cpu/common.c | 28 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/microcode/amd.c | 4 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/perf_event_intel_ds.c | 57 | ||||
-rw-r--r-- | arch/x86/kernel/espfix_64.c | 10 | ||||
-rw-r--r-- | arch/x86/kernel/head_64.S | 35 | ||||
-rw-r--r-- | arch/x86/kernel/irqinit.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/kvmclock.c | 5 | ||||
-rw-r--r-- | arch/x86/kernel/ldt.c | 25 | ||||
-rw-r--r-- | arch/x86/kernel/paravirt_patch_64.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/process.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/setup.c | 7 | ||||
-rw-r--r-- | arch/x86/kernel/tracepoint.c | 2 |
12 files changed, 150 insertions, 29 deletions
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index aa1e7246b06b..cc154ac64f00 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -92,7 +92,7 @@ static const struct cpu_dev default_cpu = { static const struct cpu_dev *this_cpu = &default_cpu; -DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { +DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = { #ifdef CONFIG_X86_64 /* * We need valid kernel segments for data and code in long mode too @@ -324,8 +324,21 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) static void setup_pcid(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_PCID)) { - if (cpu_has(c, X86_FEATURE_PGE)) { + if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) { cr4_set_bits(X86_CR4_PCIDE); + /* + * INVPCID has two "groups" of types: + * 1/2: Invalidate an individual address + * 3/4: Invalidate all contexts + * + * 1/2 take a PCID, but 3/4 do not. So, 3/4 + * ignore the PCID argument in the descriptor. + * But, we have to be careful not to call 1/2 + * with an actual non-zero PCID in them before + * we do the above cr4_set_bits(). + */ + if (cpu_has(c, X86_FEATURE_INVPCID)) + set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE); } else { /* * flush_tlb_all(), as currently implemented, won't @@ -338,6 +351,7 @@ static void setup_pcid(struct cpuinfo_x86 *c) clear_cpu_cap(c, X86_FEATURE_PCID); } } + kaiser_setup_pcid(); } /* @@ -1229,7 +1243,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { [DEBUG_STACK - 1] = DEBUG_STKSZ }; -static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks +DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); /* May not be marked __init: used by software suspend */ @@ -1392,6 +1406,14 @@ void cpu_init(void) * try to read it. */ cr4_init_shadow(); + if (!kaiser_enabled) { + /* + * secondary_startup_64() deferred setting PGE in cr4: + * probe_page_size_mask() sets it on the boot cpu, + * but it needs to be set on each secondary cpu. + */ + cr4_set_bits(X86_CR4_PGE); + } /* * Load microcode on this cpu if a valid microcode is available. diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 2233f8a76615..2a0f44d225fe 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -580,6 +580,7 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size, #define F14H_MPB_MAX_SIZE 1824 #define F15H_MPB_MAX_SIZE 4096 #define F16H_MPB_MAX_SIZE 3458 +#define F17H_MPB_MAX_SIZE 3200 switch (family) { case 0x14: @@ -591,6 +592,9 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size, case 0x16: max_size = F16H_MPB_MAX_SIZE; break; + case 0x17: + max_size = F17H_MPB_MAX_SIZE; + break; default: max_size = F1XH_MPB_MAX_SIZE; break; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 1e7de3cefc9c..f01b3a12dce0 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -2,11 +2,15 @@ #include <linux/types.h> #include <linux/slab.h> +#include <asm/kaiser.h> #include <asm/perf_event.h> #include <asm/insn.h> #include "perf_event.h" +static +DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store); + /* The size of a BTS record in bytes: */ #define BTS_RECORD_SIZE 24 @@ -268,6 +272,39 @@ void fini_debug_store_on_cpu(int cpu) static DEFINE_PER_CPU(void *, insn_buffer); +static void *dsalloc(size_t size, gfp_t flags, int node) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + unsigned int order = get_order(size); + struct page *page; + unsigned long addr; + + page = __alloc_pages_node(node, flags | __GFP_ZERO, order); + if (!page) + return NULL; + addr = (unsigned long)page_address(page); + if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) { + __free_pages(page, order); + addr = 0; + } + return (void *)addr; +#else + return kmalloc_node(size, flags | __GFP_ZERO, node); +#endif +} + +static void dsfree(const void *buffer, size_t size) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + if (!buffer) + return; + kaiser_remove_mapping((unsigned long)buffer, size); + free_pages((unsigned long)buffer, get_order(size)); +#else + kfree(buffer); +#endif +} + static int alloc_pebs_buffer(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; @@ -278,7 +315,7 @@ static int alloc_pebs_buffer(int cpu) if (!x86_pmu.pebs) return 0; - buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); + buffer = dsalloc(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); if (unlikely(!buffer)) return -ENOMEM; @@ -289,7 +326,7 @@ static int alloc_pebs_buffer(int cpu) if (x86_pmu.intel_cap.pebs_format < 2) { ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); if (!ibuffer) { - kfree(buffer); + dsfree(buffer, x86_pmu.pebs_buffer_size); return -ENOMEM; } per_cpu(insn_buffer, cpu) = ibuffer; @@ -315,7 +352,8 @@ static void release_pebs_buffer(int cpu) kfree(per_cpu(insn_buffer, cpu)); per_cpu(insn_buffer, cpu) = NULL; - kfree((void *)(unsigned long)ds->pebs_buffer_base); + dsfree((void *)(unsigned long)ds->pebs_buffer_base, + x86_pmu.pebs_buffer_size); ds->pebs_buffer_base = 0; } @@ -329,7 +367,7 @@ static int alloc_bts_buffer(int cpu) if (!x86_pmu.bts) return 0; - buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); + buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); if (unlikely(!buffer)) { WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__); return -ENOMEM; @@ -355,19 +393,15 @@ static void release_bts_buffer(int cpu) if (!ds || !x86_pmu.bts) return; - kfree((void *)(unsigned long)ds->bts_buffer_base); + dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE); ds->bts_buffer_base = 0; } static int alloc_ds_buffer(int cpu) { - int node = cpu_to_node(cpu); - struct debug_store *ds; - - ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node); - if (unlikely(!ds)) - return -ENOMEM; + struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu); + memset(ds, 0, sizeof(*ds)); per_cpu(cpu_hw_events, cpu).ds = ds; return 0; @@ -381,7 +415,6 @@ static void release_ds_buffer(int cpu) return; per_cpu(cpu_hw_events, cpu).ds = NULL; - kfree(ds); } void release_ds_buffers(void) diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 4d38416e2a7f..b02cb2ec6726 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -41,6 +41,7 @@ #include <asm/pgalloc.h> #include <asm/setup.h> #include <asm/espfix.h> +#include <asm/kaiser.h> /* * Note: we only need 6*8 = 48 bytes for the espfix stack, but round @@ -126,6 +127,15 @@ void __init init_espfix_bsp(void) /* Install the espfix pud into the kernel page directory */ pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); + /* + * Just copy the top-level PGD that is mapping the espfix + * area to ensure it is mapped into the shadow user page + * tables. + */ + if (kaiser_enabled) { + set_pgd(native_get_shadow_pgd(pgd_p), + __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page))); + } /* Randomize the locations */ init_espfix_random(); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ffdc0e860390..4034e905741a 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -183,8 +183,8 @@ ENTRY(secondary_startup_64) movq $(init_level4_pgt - __START_KERNEL_map), %rax 1: - /* Enable PAE mode and PGE */ - movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx + /* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */ + movl $(X86_CR4_PAE | X86_CR4_PSE), %ecx movq %rcx, %cr4 /* Setup early boot stage 4 level pagetables. */ @@ -441,6 +441,27 @@ early_idt_ripmsg: .balign PAGE_SIZE; \ GLOBAL(name) +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * Each PGD needs to be 8k long and 8k aligned. We do not + * ever go out to userspace with these, so we do not + * strictly *need* the second page, but this allows us to + * have a single set_pgd() implementation that does not + * need to worry about whether it has 4k or 8k to work + * with. + * + * This ensures PGDs are 8k long: + */ +#define KAISER_USER_PGD_FILL 512 +/* This ensures they are 8k-aligned: */ +#define NEXT_PGD_PAGE(name) \ + .balign 2 * PAGE_SIZE; \ +GLOBAL(name) +#else +#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) +#define KAISER_USER_PGD_FILL 0 +#endif + /* Automate the creation of 1 to 1 mapping pmd entries */ #define PMDS(START, PERM, COUNT) \ i = 0 ; \ @@ -450,9 +471,10 @@ GLOBAL(name) .endr __INITDATA -NEXT_PAGE(early_level4_pgt) +NEXT_PGD_PAGE(early_level4_pgt) .fill 511,8,0 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .fill KAISER_USER_PGD_FILL,8,0 NEXT_PAGE(early_dynamic_pgts) .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 @@ -460,16 +482,18 @@ NEXT_PAGE(early_dynamic_pgts) .data #ifndef CONFIG_XEN -NEXT_PAGE(init_level4_pgt) +NEXT_PGD_PAGE(init_level4_pgt) .fill 512,8,0 + .fill KAISER_USER_PGD_FILL,8,0 #else -NEXT_PAGE(init_level4_pgt) +NEXT_PGD_PAGE(init_level4_pgt) .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE .org init_level4_pgt + L4_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .fill KAISER_USER_PGD_FILL,8,0 NEXT_PAGE(level3_ident_pgt) .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE @@ -480,6 +504,7 @@ NEXT_PAGE(level2_ident_pgt) */ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) #endif + .fill KAISER_USER_PGD_FILL,8,0 NEXT_PAGE(level3_kernel_pgt) .fill L3_START_KERNEL,8,0 diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 1423ab1b0312..f480b38a03c3 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -51,7 +51,7 @@ static struct irqaction irq2 = { .flags = IRQF_NO_THREAD, }; -DEFINE_PER_CPU(vector_irq_t, vector_irq) = { +DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = { [0 ... NR_VECTORS - 1] = VECTOR_UNUSED, }; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 2bd81e302427..ec1b06dc82d2 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -45,6 +45,11 @@ early_param("no-kvmclock", parse_no_kvmclock); static struct pvclock_vsyscall_time_info *hv_clock; static struct pvclock_wall_clock wall_clock; +struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) +{ + return hv_clock; +} + /* * The wallclock is the time of day when we booted. Since then, some time may * have elapsed since the hypervisor wrote the data. So we try to account for diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index d6279593bcdd..bc429365b72a 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -16,6 +16,7 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> +#include <linux/kaiser.h> #include <asm/ldt.h> #include <asm/desc.h> @@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm) set_ldt(pc->ldt->entries, pc->ldt->size); } +static void __free_ldt_struct(struct ldt_struct *ldt) +{ + if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(ldt->entries); + else + free_page((unsigned long)ldt->entries); + kfree(ldt); +} + /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ static struct ldt_struct *alloc_ldt_struct(int size) { struct ldt_struct *new_ldt; int alloc_size; + int ret; if (size > LDT_ENTRIES) return NULL; @@ -66,7 +77,13 @@ static struct ldt_struct *alloc_ldt_struct(int size) return NULL; } + ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size, + __PAGE_KERNEL); new_ldt->size = size; + if (ret) { + __free_ldt_struct(new_ldt); + return NULL; + } return new_ldt; } @@ -92,12 +109,10 @@ static void free_ldt_struct(struct ldt_struct *ldt) if (likely(!ldt)) return; + kaiser_remove_mapping((unsigned long)ldt->entries, + ldt->size * LDT_ENTRY_SIZE); paravirt_free_ldt(ldt->entries, ldt->size); - if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(ldt->entries); - else - free_page((unsigned long)ldt->entries); - kfree(ldt); + __free_ldt_struct(ldt); } /* diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index 8aa05583bc42..0677bf8d3a42 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); -DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); DEF_NATIVE(pv_cpu_ops, clts, "clts"); DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); @@ -62,7 +61,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); PATCH_SITE(pv_cpu_ops, clts); - PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); #if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9f7c21c22477..7c5c5dc90ffa 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -39,7 +39,7 @@ * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { +__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = { .x86_tss = { .sp0 = TOP_OF_INIT_STACK, #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index e67b834279b2..bbaae4cf9e8e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -112,6 +112,7 @@ #include <asm/alternative.h> #include <asm/prom.h> #include <asm/microcode.h> +#include <asm/kaiser.h> /* * max_low_pfn_mapped: highest direct mapped pfn under 4GB @@ -1016,6 +1017,12 @@ void __init setup_arch(char **cmdline_p) */ init_hypervisor_platform(); + /* + * This needs to happen right after XENPV is set on xen and + * kaiser_enabled is checked below in cleanup_highmap(). + */ + kaiser_check_boottime_disable(); + x86_init.resources.probe_roms(); /* after parse_early_param, so could debug it */ diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c index 1c113db9ed57..2bb5ee464df3 100644 --- a/arch/x86/kernel/tracepoint.c +++ b/arch/x86/kernel/tracepoint.c @@ -9,10 +9,12 @@ #include <linux/atomic.h> atomic_t trace_idt_ctr = ATOMIC_INIT(0); +__aligned(PAGE_SIZE) struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) trace_idt_table }; /* No need to be aligned, but done to keep all IDTs defined the same way. */ +__aligned(PAGE_SIZE) gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss; static int trace_irq_vector_refcount; |