aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig131
-rw-r--r--arch/x86/Kconfig.cpu3
-rw-r--r--arch/x86/Kconfig.debug23
-rw-r--r--arch/x86/Makefile8
-rw-r--r--arch/x86/boot/compressed/head_32.S3
-rw-r--r--arch/x86/boot/compressed/head_64.S3
-rw-r--r--arch/x86/boot/compressed/vmlinux.lds.S6
-rw-r--r--arch/x86/boot/install.sh4
-rw-r--r--arch/x86/boot/setup.ld3
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c10
-rw-r--r--arch/x86/ia32/ia32entry.S43
-rw-r--r--arch/x86/include/asm/Kbuild1
-rw-r--r--arch/x86/include/asm/a.out-core.h10
-rw-r--r--arch/x86/include/asm/acpi.h1
-rw-r--r--arch/x86/include/asm/agp.h4
-rw-r--r--arch/x86/include/asm/alternative-asm.h10
-rw-r--r--arch/x86/include/asm/alternative.h1
-rw-r--r--arch/x86/include/asm/amd_iommu.h15
-rw-r--r--arch/x86/include/asm/amd_iommu_proto.h38
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h54
-rw-r--r--arch/x86/include/asm/apic.h53
-rw-r--r--arch/x86/include/asm/apicdef.h8
-rw-r--r--arch/x86/include/asm/apicnum.h12
-rw-r--r--arch/x86/include/asm/bootparam.h13
-rw-r--r--arch/x86/include/asm/bug.h4
-rw-r--r--arch/x86/include/asm/cache.h4
-rw-r--r--arch/x86/include/asm/cacheflush.h54
-rw-r--r--arch/x86/include/asm/calgary.h2
-rw-r--r--arch/x86/include/asm/checksum_32.h3
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h248
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h234
-rw-r--r--arch/x86/include/asm/cpufeature.h1
-rw-r--r--arch/x86/include/asm/debugreg.h33
-rw-r--r--arch/x86/include/asm/desc.h2
-rw-r--r--arch/x86/include/asm/device.h5
-rw-r--r--arch/x86/include/asm/dma-mapping.h15
-rw-r--r--arch/x86/include/asm/do_timer.h16
-rw-r--r--arch/x86/include/asm/e820.h2
-rw-r--r--arch/x86/include/asm/elf.h2
-rw-r--r--arch/x86/include/asm/entry_arch.h4
-rw-r--r--arch/x86/include/asm/fixmap.h3
-rw-r--r--arch/x86/include/asm/gart.h9
-rw-r--r--arch/x86/include/asm/hardirq.h6
-rw-r--r--arch/x86/include/asm/hw_breakpoint.h73
-rw-r--r--arch/x86/include/asm/hw_irq.h30
-rw-r--r--arch/x86/include/asm/hypervisor.h2
-rw-r--r--arch/x86/include/asm/inat.h220
-rw-r--r--arch/x86/include/asm/inat_types.h29
-rw-r--r--arch/x86/include/asm/insn.h184
-rw-r--r--arch/x86/include/asm/io_apic.h7
-rw-r--r--arch/x86/include/asm/iomap.h9
-rw-r--r--arch/x86/include/asm/iommu.h2
-rw-r--r--arch/x86/include/asm/irq.h4
-rw-r--r--arch/x86/include/asm/kvm.h10
-rw-r--r--arch/x86/include/asm/kvm_emulate.h (renamed from arch/x86/include/asm/kvm_x86_emulate.h)0
-rw-r--r--arch/x86/include/asm/kvm_host.h61
-rw-r--r--arch/x86/include/asm/kvm_para.h2
-rw-r--r--arch/x86/include/asm/mce.h44
-rw-r--r--arch/x86/include/asm/mmu_context.h6
-rw-r--r--arch/x86/include/asm/mpspec.h63
-rw-r--r--arch/x86/include/asm/msr-index.h12
-rw-r--r--arch/x86/include/asm/mtrr.h6
-rw-r--r--arch/x86/include/asm/nmi.h3
-rw-r--r--arch/x86/include/asm/nops.h2
-rw-r--r--arch/x86/include/asm/paravirt.h79
-rw-r--r--arch/x86/include/asm/paravirt_types.h38
-rw-r--r--arch/x86/include/asm/pat.h5
-rw-r--r--arch/x86/include/asm/pci.h7
-rw-r--r--arch/x86/include/asm/percpu.h9
-rw-r--r--arch/x86/include/asm/perf_event.h (renamed from arch/x86/include/asm/perf_counter.h)43
-rw-r--r--arch/x86/include/asm/pgtable.h10
-rw-r--r--arch/x86/include/asm/pgtable_types.h5
-rw-r--r--arch/x86/include/asm/processor.h48
-rw-r--r--arch/x86/include/asm/ptrace.h62
-rw-r--r--arch/x86/include/asm/setup.h49
-rw-r--r--arch/x86/include/asm/smp.h1
-rw-r--r--arch/x86/include/asm/string_32.h10
-rw-r--r--arch/x86/include/asm/swiotlb.h9
-rw-r--r--arch/x86/include/asm/syscall.h14
-rw-r--r--arch/x86/include/asm/system.h31
-rw-r--r--arch/x86/include/asm/time.h53
-rw-r--r--arch/x86/include/asm/timer.h14
-rw-r--r--arch/x86/include/asm/topology.h25
-rw-r--r--arch/x86/include/asm/tsc.h3
-rw-r--r--arch/x86/include/asm/uaccess.h1
-rw-r--r--arch/x86/include/asm/uaccess_32.h29
-rw-r--r--arch/x86/include/asm/uaccess_64.h36
-rw-r--r--arch/x86/include/asm/unistd_32.h2
-rw-r--r--arch/x86/include/asm/unistd_64.h4
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h42
-rw-r--r--arch/x86/include/asm/uv/uv_irq.h14
-rw-r--r--arch/x86/include/asm/vgtod.h1
-rw-r--r--arch/x86/include/asm/vmware.h2
-rw-r--r--arch/x86/include/asm/vmx.h8
-rw-r--r--arch/x86/include/asm/x86_init.h143
-rw-r--r--arch/x86/kernel/Makefile9
-rw-r--r--arch/x86/kernel/acpi/cstate.c2
-rw-r--r--arch/x86/kernel/acpi/processor.c3
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.lds.S3
-rw-r--r--arch/x86/kernel/amd_iommu.c1245
-rw-r--r--arch/x86/kernel/amd_iommu_init.c118
-rw-r--r--arch/x86/kernel/aperture_64.c4
-rw-r--r--arch/x86/kernel/apic/Makefile2
-rw-r--r--arch/x86/kernel/apic/apic.c74
-rw-r--r--arch/x86/kernel/apic/apic_noop.c200
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c15
-rw-r--r--arch/x86/kernel/apic/es7000_32.c16
-rw-r--r--arch/x86/kernel/apic/io_apic.c425
-rw-r--r--arch/x86/kernel/apic/nmi.c6
-rw-r--r--arch/x86/kernel/apic/numaq_32.c70
-rw-r--r--arch/x86/kernel/apic/probe_32.c2
-rw-r--r--arch/x86/kernel/apic/probe_64.c15
-rw-r--r--arch/x86/kernel/apic/summit_32.c12
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c27
-rw-r--r--arch/x86/kernel/apm_32.c14
-rw-r--r--arch/x86/kernel/cpu/Makefile5
-rw-r--r--arch/x86/kernel/cpu/amd.c12
-rw-r--r--arch/x86/kernel/cpu/common.c9
-rw-r--r--arch/x86/kernel/cpu/cpu_debug.c4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c127
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c46
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c19
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c14
-rw-r--r--arch/x86/kernel/cpu/intel.c6
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile5
-rw-r--r--arch/x86/kernel/cpu/mcheck/k7.c116
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c159
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h15
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c8
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c462
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c5
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c11
-rw-r--r--arch/x86/kernel/cpu/mcheck/non-fatal.c94
-rw-r--r--arch/x86/kernel/cpu/mcheck/p4.c163
-rw-r--r--arch/x86/kernel/cpu/mcheck/p6.c127
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c109
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c29
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c46
-rw-r--r--arch/x86/kernel/cpu/perf_event.c (renamed from arch/x86/kernel/cpu/perf_counter.c)801
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c4
-rw-r--r--arch/x86/kernel/cpu/sched.c55
-rw-r--r--arch/x86/kernel/cpu/vmware.c27
-rw-r--r--arch/x86/kernel/cpuid.c4
-rw-r--r--arch/x86/kernel/crash.c5
-rw-r--r--arch/x86/kernel/crash_dump_32.c19
-rw-r--r--arch/x86/kernel/dumpstack.c7
-rw-r--r--arch/x86/kernel/dumpstack_32.c1
-rw-r--r--arch/x86/kernel/dumpstack_64.c1
-rw-r--r--arch/x86/kernel/e820.c25
-rw-r--r--arch/x86/kernel/early_printk.c788
-rw-r--r--arch/x86/kernel/efi.c6
-rw-r--r--arch/x86/kernel/entry_32.S31
-rw-r--r--arch/x86/kernel/entry_64.S51
-rw-r--r--arch/x86/kernel/ftrace.c84
-rw-r--r--arch/x86/kernel/head32.c26
-rw-r--r--arch/x86/kernel/head64.c2
-rw-r--r--arch/x86/kernel/head_32.S7
-rw-r--r--arch/x86/kernel/head_64.S8
-rw-r--r--arch/x86/kernel/hw_breakpoint.c555
-rw-r--r--arch/x86/kernel/i386_ksyms_32.c10
-rw-r--r--arch/x86/kernel/i8253.c19
-rw-r--r--arch/x86/kernel/init_task.c5
-rw-r--r--arch/x86/kernel/irq.c110
-rw-r--r--arch/x86/kernel/irq_32.c45
-rw-r--r--arch/x86/kernel/irq_64.c58
-rw-r--r--arch/x86/kernel/irqinit.c40
-rw-r--r--arch/x86/kernel/kgdb.c9
-rw-r--r--arch/x86/kernel/kprobes.c257
-rw-r--r--arch/x86/kernel/kvm.c7
-rw-r--r--arch/x86/kernel/kvmclock.c15
-rw-r--r--arch/x86/kernel/ldt.c4
-rw-r--r--arch/x86/kernel/machine_kexec_32.c2
-rw-r--r--arch/x86/kernel/machine_kexec_64.c2
-rw-r--r--arch/x86/kernel/microcode_amd.c6
-rw-r--r--arch/x86/kernel/microcode_core.c8
-rw-r--r--arch/x86/kernel/mpparse.c75
-rw-r--r--arch/x86/kernel/mrst.c24
-rw-r--r--arch/x86/kernel/msr.c4
-rw-r--r--arch/x86/kernel/paravirt.c36
-rw-r--r--arch/x86/kernel/pci-calgary_64.c94
-rw-r--r--arch/x86/kernel/pci-dma.c53
-rw-r--r--arch/x86/kernel/pci-gart_64.c157
-rw-r--r--arch/x86/kernel/pci-nommu.c11
-rw-r--r--arch/x86/kernel/pci-swiotlb.c21
-rw-r--r--arch/x86/kernel/process.c52
-rw-r--r--arch/x86/kernel/process_32.c8
-rw-r--r--arch/x86/kernel/process_64.c12
-rw-r--r--arch/x86/kernel/ptrace.c436
-rw-r--r--arch/x86/kernel/quirks.c2
-rw-r--r--arch/x86/kernel/reboot.c20
-rw-r--r--arch/x86/kernel/rtc.c17
-rw-r--r--arch/x86/kernel/setup.c145
-rw-r--r--arch/x86/kernel/setup_percpu.c364
-rw-r--r--arch/x86/kernel/sfi.c122
-rw-r--r--arch/x86/kernel/signal.c11
-rw-r--r--arch/x86/kernel/smpboot.c38
-rw-r--r--arch/x86/kernel/syscall_table_32.S2
-rw-r--r--arch/x86/kernel/tboot.c447
-rw-r--r--arch/x86/kernel/time.c121
-rw-r--r--arch/x86/kernel/time_32.c137
-rw-r--r--arch/x86/kernel/time_64.c135
-rw-r--r--arch/x86/kernel/tlb_uv.c9
-rw-r--r--arch/x86/kernel/trampoline.c12
-rw-r--r--arch/x86/kernel/trampoline_32.S8
-rw-r--r--arch/x86/kernel/trampoline_64.S7
-rw-r--r--arch/x86/kernel/traps.c86
-rw-r--r--arch/x86/kernel/tsc.c88
-rw-r--r--arch/x86/kernel/tsc_sync.c2
-rw-r--r--arch/x86/kernel/uv_irq.c239
-rw-r--r--arch/x86/kernel/visws_quirks.c62
-rw-r--r--arch/x86/kernel/vmi_32.c14
-rw-r--r--arch/x86/kernel/vmiclock_32.c2
-rw-r--r--arch/x86/kernel/vmlinux.lds.S97
-rw-r--r--arch/x86/kernel/vsyscall_64.c11
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c5
-rw-r--r--arch/x86/kernel/x86_init.c83
-rw-r--r--arch/x86/kvm/Kconfig21
-rw-r--r--arch/x86/kvm/Makefile35
-rw-r--r--arch/x86/kvm/emulate.c (renamed from arch/x86/kvm/x86_emulate.c)265
-rw-r--r--arch/x86/kvm/i8254.c162
-rw-r--r--arch/x86/kvm/i8254.h5
-rw-r--r--arch/x86/kvm/i8259.c116
-rw-r--r--arch/x86/kvm/irq.h1
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h9
-rw-r--r--arch/x86/kvm/kvm_svm.h51
-rw-r--r--arch/x86/kvm/kvm_timer.h2
-rw-r--r--arch/x86/kvm/lapic.c338
-rw-r--r--arch/x86/kvm/lapic.h4
-rw-r--r--arch/x86/kvm/mmu.c665
-rw-r--r--arch/x86/kvm/mmu.h4
-rw-r--r--arch/x86/kvm/mmutrace.h220
-rw-r--r--arch/x86/kvm/paging_tmpl.h159
-rw-r--r--arch/x86/kvm/svm.c914
-rw-r--r--arch/x86/kvm/timer.c16
-rw-r--r--arch/x86/kvm/trace.h355
-rw-r--r--arch/x86/kvm/vmx.c499
-rw-r--r--arch/x86/kvm/x86.c823
-rw-r--r--arch/x86/kvm/x86.h4
-rw-r--r--arch/x86/lguest/boot.c21
-rw-r--r--arch/x86/lib/.gitignore1
-rw-r--r--arch/x86/lib/Makefile17
-rw-r--r--arch/x86/lib/cmpxchg8b_emu.S57
-rw-r--r--arch/x86/lib/copy_user_64.S14
-rw-r--r--arch/x86/lib/inat.c90
-rw-r--r--arch/x86/lib/insn.c516
-rw-r--r--arch/x86/lib/usercopy_32.c10
-rw-r--r--arch/x86/lib/x86-opcode-map.txt893
-rw-r--r--arch/x86/mm/Makefile7
-rw-r--r--arch/x86/mm/extable.c31
-rw-r--r--arch/x86/mm/fault.c38
-rw-r--r--arch/x86/mm/highmem_32.c1
-rw-r--r--arch/x86/mm/init.c63
-rw-r--r--arch/x86/mm/init_32.c12
-rw-r--r--arch/x86/mm/init_64.c12
-rw-r--r--arch/x86/mm/iomap_32.c27
-rw-r--r--arch/x86/mm/ioremap.c114
-rw-r--r--arch/x86/mm/kmemcheck/kmemcheck.c3
-rw-r--r--arch/x86/mm/kmemcheck/shadow.c1
-rw-r--r--arch/x86/mm/kmmio.c8
-rw-r--r--arch/x86/mm/mmap.c17
-rw-r--r--arch/x86/mm/pageattr.c30
-rw-r--r--arch/x86/mm/pat.c360
-rw-r--r--arch/x86/mm/physaddr.c70
-rw-r--r--arch/x86/mm/physaddr.h10
-rw-r--r--arch/x86/mm/setup_nx.c69
-rw-r--r--arch/x86/mm/srat_64.c4
-rw-r--r--arch/x86/mm/testmmiotrace.c29
-rw-r--r--arch/x86/mm/tlb.c15
-rw-r--r--arch/x86/oprofile/op_model_ppro.c4
-rw-r--r--arch/x86/oprofile/op_x86_model.h2
-rw-r--r--arch/x86/pci/amd_bus.c64
-rw-r--r--arch/x86/pci/common.c69
-rw-r--r--arch/x86/pci/i386.c2
-rw-r--r--arch/x86/pci/mmconfig-shared.c8
-rw-r--r--arch/x86/pci/mmconfig_32.c2
-rw-r--r--arch/x86/power/cpu.c32
-rw-r--r--arch/x86/tools/Makefile31
-rw-r--r--arch/x86/tools/chkobjdump.awk23
-rw-r--r--arch/x86/tools/distill.awk47
-rw-r--r--arch/x86/tools/gen-insn-attr-x86.awk380
-rw-r--r--arch/x86/tools/test_get_len.c173
-rw-r--r--arch/x86/vdso/Makefile2
-rw-r--r--arch/x86/vdso/vclock_gettime.c39
-rw-r--r--arch/x86/xen/Makefile2
-rw-r--r--arch/x86/xen/debugfs.c2
-rw-r--r--arch/x86/xen/enlighten.c177
-rw-r--r--arch/x86/xen/irq.c5
-rw-r--r--arch/x86/xen/mmu.c20
-rw-r--r--arch/x86/xen/mmu.h2
-rw-r--r--arch/x86/xen/smp.c1
-rw-r--r--arch/x86/xen/spinlock.c28
-rw-r--r--arch/x86/xen/xen-ops.h2
294 files changed, 13561 insertions, 7664 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fc20fdc0f7f..178084b4377 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -24,7 +24,7 @@ config X86
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_IDE
select HAVE_OPROFILE
- select HAVE_PERF_COUNTERS if (!M386 && !M486)
+ select HAVE_PERF_EVENTS if (!M386 && !M486)
select HAVE_IOREMAP_PROT
select HAVE_KPROBES
select ARCH_WANT_OPTIONAL_GPIOLIB
@@ -49,6 +49,7 @@ config X86
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_LZMA
+ select HAVE_HW_BREAKPOINT
select HAVE_ARCH_KMEMCHECK
config OUTPUT_FORMAT
@@ -86,10 +87,6 @@ config STACKTRACE_SUPPORT
config HAVE_LATENCYTOP_SUPPORT
def_bool y
-config FAST_CMPXCHG_LOCAL
- bool
- default y
-
config MMU
def_bool y
@@ -150,7 +147,10 @@ config ARCH_HAS_CACHE_LINE_SIZE
config HAVE_SETUP_PER_CPU_AREA
def_bool y
-config HAVE_DYNAMIC_PER_CPU_AREA
+config NEED_PER_CPU_EMBED_FIRST_CHUNK
+ def_bool y
+
+config NEED_PER_CPU_PAGE_FIRST_CHUNK
def_bool y
config HAVE_CPUMASK_OF_CPU_MAP
@@ -179,6 +179,10 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING
config ARCH_SUPPORTS_DEBUG_PAGEALLOC
def_bool y
+config HAVE_INTEL_TXT
+ def_bool y
+ depends on EXPERIMENTAL && DMAR && ACPI
+
# Use the generic interrupt handling code in kernel/irq/:
config GENERIC_HARDIRQS
bool
@@ -318,6 +322,7 @@ config X86_EXTENDED_PLATFORM
SGI 320/540 (Visual Workstation)
Summit/EXA (IBM x440)
Unisys ES7000 IA32 series
+ Moorestown MID devices
If you have one of these systems, or if you want to build a
generic distribution kernel, say Y here - otherwise say N.
@@ -377,6 +382,18 @@ config X86_ELAN
If unsure, choose "PC-compatible" instead.
+config X86_MRST
+ bool "Moorestown MID platform"
+ depends on X86_32
+ depends on X86_EXTENDED_PLATFORM
+ ---help---
+ Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin
+ Internet Device(MID) platform. Moorestown consists of two chips:
+ Lincroft (CPU core, graphics, and memory controller) and Langwell IOH.
+ Unlike standard x86 PCs, Moorestown does not have many legacy devices
+ nor standard legacy replacement devices/features. e.g. Moorestown does
+ not contain i8259, i8254, HPET, legacy BIOS, most of the io ports.
+
config X86_RDC321X
bool "RDC R-321x SoC"
depends on X86_32
@@ -412,6 +429,17 @@ config X86_NUMAQ
of Flat Logical. You will need a new lynxer.elf file to flash your
firmware with - send email to <Martin.Bligh@us.ibm.com>.
+config X86_SUPPORTS_MEMORY_FAILURE
+ bool
+ # MCE code calls memory_failure():
+ depends on X86_MCE
+ # On 32-bit this adds too big of NODES_SHIFT and we run out of page flags:
+ depends on !X86_NUMAQ
+ # On 32-bit SPARSEMEM adds too big of SECTIONS_WIDTH:
+ depends on X86_64 || !SPARSEMEM
+ select ARCH_SUPPORTS_MEMORY_FAILURE
+ default y
+
config X86_VISWS
bool "SGI 320/540 (Visual Workstation)"
depends on X86_32 && PCI && X86_MPPARSE && PCI_GODIRECT
@@ -464,7 +492,7 @@ if PARAVIRT_GUEST
source "arch/x86/xen/Kconfig"
config VMI
- bool "VMI Guest support"
+ bool "VMI Guest support (DEPRECATED)"
select PARAVIRT
depends on X86_32
---help---
@@ -473,6 +501,15 @@ config VMI
at the moment), by linking the kernel to a GPL-ed ROM module
provided by the hypervisor.
+ As of September 2009, VMware has started a phased retirement
+ of this feature from VMware's products. Please see
+ feature-removal-schedule.txt for details. If you are
+ planning to enable this option, please note that you cannot
+ live migrate a VMI enabled VM to a future VMware product,
+ which doesn't support VMI. So if you expect your kernel to
+ seamlessly migrate to newer VMware products, keep this
+ disabled.
+
config KVM_CLOCK
bool "KVM paravirtualized clock"
select PARAVIRT
@@ -776,41 +813,17 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
increased on these systems.
config X86_MCE
- bool "Machine Check Exception"
+ bool "Machine Check / overheating reporting"
---help---
- Machine Check Exception support allows the processor to notify the
- kernel if it detects a problem (e.g. overheating, component failure).
+ Machine Check support allows the processor to notify the
+ kernel if it detects a problem (e.g. overheating, data corruption).
The action the kernel takes depends on the severity of the problem,
- ranging from a warning message on the console, to halting the machine.
- Your processor must be a Pentium or newer to support this - check the
- flags in /proc/cpuinfo for mce. Note that some older Pentium systems
- have a design flaw which leads to false MCE events - hence MCE is
- disabled on all P5 processors, unless explicitly enabled with "mce"
- as a boot argument. Similarly, if MCE is built in and creates a
- problem on some new non-standard machine, you can boot with "nomce"
- to disable it. MCE support simply ignores non-MCE processors like
- the 386 and 486, so nearly everyone can say Y here.
-
-config X86_OLD_MCE
- depends on X86_32 && X86_MCE
- bool "Use legacy machine check code (will go away)"
- default n
- select X86_ANCIENT_MCE
- ---help---
- Use the old i386 machine check code. This is merely intended for
- testing in a transition period. Try this if you run into any machine
- check related software problems, but report the problem to
- linux-kernel. When in doubt say no.
-
-config X86_NEW_MCE
- depends on X86_MCE
- bool
- default y if (!X86_OLD_MCE && X86_32) || X86_64
+ ranging from warning messages to halting the machine.
config X86_MCE_INTEL
def_bool y
prompt "Intel MCE features"
- depends on X86_NEW_MCE && X86_LOCAL_APIC
+ depends on X86_MCE && X86_LOCAL_APIC
---help---
Additional support for intel specific MCE features such as
the thermal monitor.
@@ -818,14 +831,14 @@ config X86_MCE_INTEL
config X86_MCE_AMD
def_bool y
prompt "AMD MCE features"
- depends on X86_NEW_MCE && X86_LOCAL_APIC
+ depends on X86_MCE && X86_LOCAL_APIC
---help---
Additional support for AMD specific MCE features such as
the DRAM Error Threshold.
config X86_ANCIENT_MCE
def_bool n
- depends on X86_32
+ depends on X86_32 && X86_MCE
prompt "Support for old Pentium 5 / WinChip machine checks"
---help---
Include support for machine check handling on old Pentium 5 or WinChip
@@ -838,36 +851,16 @@ config X86_MCE_THRESHOLD
default y
config X86_MCE_INJECT
- depends on X86_NEW_MCE
+ depends on X86_MCE
tristate "Machine check injector support"
---help---
Provide support for injecting machine checks for testing purposes.
If you don't know what a machine check is and you don't do kernel
QA it is safe to say n.
-config X86_MCE_NONFATAL
- tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
- depends on X86_OLD_MCE
- ---help---
- Enabling this feature starts a timer that triggers every 5 seconds which
- will look at the machine check registers to see if anything happened.
- Non-fatal problems automatically get corrected (but still logged).
- Disable this if you don't want to see these messages.
- Seeing the messages this option prints out may be indicative of dying
- or out-of-spec (ie, overclocked) hardware.
- This option only does something on certain CPUs.
- (AMD Athlon/Duron and Intel Pentium 4)
-
-config X86_MCE_P4THERMAL
- bool "check for P4 thermal throttling interrupt."
- depends on X86_OLD_MCE && X86_MCE && (X86_UP_APIC || SMP)
- ---help---
- Enabling this feature will cause a message to be printed when the P4
- enters thermal throttling.
-
config X86_THERMAL_VECTOR
def_bool y
- depends on X86_MCE_P4THERMAL || X86_MCE_INTEL
+ depends on X86_MCE_INTEL
config VM86
bool "Enable VM86 support" if EMBEDDED
@@ -1228,6 +1221,10 @@ config ARCH_DISCONTIGMEM_DEFAULT
def_bool y
depends on NUMA && X86_32
+config ARCH_PROC_KCORE_TEXT
+ def_bool y
+ depends on X86_64 && PROC_KCORE
+
config ARCH_SPARSEMEM_DEFAULT
def_bool y
depends on X86_64
@@ -1413,6 +1410,10 @@ config X86_PAT
If unsure, say Y.
+config ARCH_USES_PG_UNCACHED
+ def_bool y
+ depends on X86_PAT
+
config EFI
bool "EFI runtime service support"
depends on ACPI
@@ -1443,12 +1444,8 @@ config SECCOMP
If unsure, say Y. Only embedded should say N here.
-config CC_STACKPROTECTOR_ALL
- bool
-
config CC_STACKPROTECTOR
bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
- select CC_STACKPROTECTOR_ALL
---help---
This option turns on the -fstack-protector GCC feature. This
feature puts, at the beginning of functions, a canary value on
@@ -1682,6 +1679,8 @@ source "kernel/power/Kconfig"
source "drivers/acpi/Kconfig"
+source "drivers/sfi/Kconfig"
+
config X86_APM_BOOT
bool
default y
@@ -1877,7 +1876,7 @@ config PCI_DIRECT
config PCI_MMCONFIG
def_bool y
- depends on X86_32 && PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY)
+ depends on X86_32 && PCI && (ACPI || SFI) && (PCI_GOMMCONFIG || PCI_GOANY)
config PCI_OLPC
def_bool y
@@ -1915,7 +1914,7 @@ config DMAR_DEFAULT_ON
config DMAR_BROKEN_GFX_WA
def_bool n
prompt "Workaround broken graphics drivers (going away soon)"
- depends on DMAR
+ depends on DMAR && BROKEN
---help---
Current Graphics drivers tend to use physical address
for DMA and avoid using DMA APIs. Setting this config
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 979de294710..5e99762eb5c 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -400,7 +400,7 @@ config X86_TSC
config X86_CMPXCHG64
def_bool y
- depends on X86_PAE || X86_64
+ depends on !M386 && !M486
# this should be set for all -march=.. options where the compiler
# generates cmov.
@@ -412,6 +412,7 @@ config X86_MINIMUM_CPU_FAMILY
int
default "64" if X86_64
default "6" if X86_32 && X86_P6_NOP
+ default "5" if X86_32 && X86_CMPXCHG64
default "4" if X86_32 && (X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK)
default "3"
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index d105f29bb6b..731318e5ac1 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -186,6 +186,15 @@ config X86_DS_SELFTEST
config HAVE_MMIOTRACE_SUPPORT
def_bool y
+config X86_DECODER_SELFTEST
+ bool "x86 instruction decoder selftest"
+ depends on DEBUG_KERNEL
+ ---help---
+ Perform x86 instruction decoder selftests at build time.
+ This option is useful for checking the sanity of x86 instruction
+ decoder code.
+ If unsure, say "N".
+
#
# IO delay types:
#
@@ -287,4 +296,18 @@ config OPTIMIZE_INLINING
If unsure, say N.
+config DEBUG_STRICT_USER_COPY_CHECKS
+ bool "Strict copy size checks"
+ depends on DEBUG_KERNEL && !TRACE_BRANCH_PROFILING
+ ---help---
+ Enabling this option turns a certain set of sanity checks for user
+ copy operations into compile time failures.
+
+ The copy_from_user() etc checks are there to help test if there
+ are sufficient security checks on the length argument of
+ the copy operation, by having gcc prove that the argument is
+ within bounds.
+
+ If unsure, or if you run an older (pre 4.4) gcc, say N.
+
endmenu
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 7983c420eaf..78b32be55e9 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -76,7 +76,6 @@ ifdef CONFIG_CC_STACKPROTECTOR
cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh
ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(biarch)),y)
stackp-y := -fstack-protector
- stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += -fstack-protector-all
KBUILD_CFLAGS += $(stackp-y)
else
$(warning stack protector enabled but no compiler support)
@@ -156,6 +155,9 @@ all: bzImage
KBUILD_IMAGE := $(boot)/bzImage
bzImage: vmlinux
+ifeq ($(CONFIG_X86_DECODER_SELFTEST),y)
+ $(Q)$(MAKE) $(build)=arch/x86/tools posttest
+endif
$(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
$(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
$(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
@@ -179,8 +181,8 @@ archclean:
define archhelp
echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)'
echo ' install - Install kernel using'
- echo ' (your) ~/bin/installkernel or'
- echo ' (distribution) /sbin/installkernel or'
+ echo ' (your) ~/bin/$(INSTALLKERNEL) or'
+ echo ' (distribution) /sbin/$(INSTALLKERNEL) or'
echo ' install to $$(INSTALL_PATH) and run lilo'
echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 75e4f001e70..f543b70ffae 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -23,13 +23,14 @@
*/
.text
+#include <linux/init.h>
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/page_types.h>
#include <asm/boot.h>
#include <asm/asm-offsets.h>
- .section ".text.head","ax",@progbits
+ __HEAD
ENTRY(startup_32)
cld
/*
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index f62c284db9e..077e1b69198 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -24,6 +24,7 @@
.code32
.text
+#include <linux/init.h>
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/pgtable_types.h>
@@ -33,7 +34,7 @@
#include <asm/processor-flags.h>
#include <asm/asm-offsets.h>
- .section ".text.head"
+ __HEAD
.code32
ENTRY(startup_32)
cld
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index cc353e1b3ff..f4193bb4878 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -1,3 +1,5 @@
+#include <asm-generic/vmlinux.lds.h>
+
OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
#undef i386
@@ -18,9 +20,9 @@ SECTIONS
* address 0.
*/
. = 0;
- .text.head : {
+ .head.text : {
_head = . ;
- *(.text.head)
+ HEAD_TEXT
_ehead = . ;
}
.rodata.compressed : {
diff --git a/arch/x86/boot/install.sh b/arch/x86/boot/install.sh
index 8d60ee15dfd..d13ec1c3864 100644
--- a/arch/x86/boot/install.sh
+++ b/arch/x86/boot/install.sh
@@ -33,8 +33,8 @@ verify "$3"
# User may have a custom install script
-if [ -x ~/bin/${CROSS_COMPILE}installkernel ]; then exec ~/bin/${CROSS_COMPILE}installkernel "$@"; fi
-if [ -x /sbin/${CROSS_COMPILE}installkernel ]; then exec /sbin/${CROSS_COMPILE}installkernel "$@"; fi
+if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi
+if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi
# Default install - same as make zlilo
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
index 0f6ec455a2b..03c0683636b 100644
--- a/arch/x86/boot/setup.ld
+++ b/arch/x86/boot/setup.ld
@@ -53,6 +53,9 @@ SECTIONS
/DISCARD/ : { *(.note*) }
+ /*
+ * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
+ */
. = ASSERT(_end <= 0x8000, "Setup too big!");
. = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!");
/* Necessary for the very-old-loader check to work... */
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 585edebe12c..49c552c060e 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -82,7 +82,7 @@ static int aes_set_key_common(struct crypto_tfm *tfm, void *raw_ctx,
return -EINVAL;
}
- if (irq_fpu_usable())
+ if (!irq_fpu_usable())
err = crypto_aes_expand_key(ctx, in_key, key_len);
else {
kernel_fpu_begin();
@@ -103,7 +103,7 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
- if (irq_fpu_usable())
+ if (!irq_fpu_usable())
crypto_aes_encrypt_x86(ctx, dst, src);
else {
kernel_fpu_begin();
@@ -116,7 +116,7 @@ static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
- if (irq_fpu_usable())
+ if (!irq_fpu_usable())
crypto_aes_decrypt_x86(ctx, dst, src);
else {
kernel_fpu_begin();
@@ -342,7 +342,7 @@ static int ablk_encrypt(struct ablkcipher_request *req)
struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
- if (irq_fpu_usable()) {
+ if (!irq_fpu_usable()) {
struct ablkcipher_request *cryptd_req =
ablkcipher_request_ctx(req);
memcpy(cryptd_req, req, sizeof(*req));
@@ -363,7 +363,7 @@ static int ablk_decrypt(struct ablkcipher_request *req)
struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
- if (irq_fpu_usable()) {
+ if (!irq_fpu_usable()) {
struct ablkcipher_request *cryptd_req =
ablkcipher_request_ctx(req);
memcpy(cryptd_req, req, sizeof(*req));
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index ba331bfd111..581b0568fe1 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -21,8 +21,8 @@
#define __AUDIT_ARCH_LE 0x40000000
#ifndef CONFIG_AUDITSYSCALL
-#define sysexit_audit int_ret_from_sys_call
-#define sysretl_audit int_ret_from_sys_call
+#define sysexit_audit ia32_ret_from_sys_call
+#define sysretl_audit ia32_ret_from_sys_call
#endif
#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
@@ -39,12 +39,12 @@
.endm
/* clobbers %eax */
- .macro CLEAR_RREGS _r9=rax
+ .macro CLEAR_RREGS offset=0, _r9=rax
xorl %eax,%eax
- movq %rax,R11(%rsp)
- movq %rax,R10(%rsp)
- movq %\_r9,R9(%rsp)
- movq %rax,R8(%rsp)
+ movq %rax,\offset+R11(%rsp)
+ movq %rax,\offset+R10(%rsp)
+ movq %\_r9,\offset+R9(%rsp)
+ movq %rax,\offset+R8(%rsp)
.endm
/*
@@ -172,6 +172,10 @@ sysexit_from_sys_call:
movl RIP-R11(%rsp),%edx /* User %eip */
CFI_REGISTER rip,rdx
RESTORE_ARGS 1,24,1,1,1,1
+ xorq %r8,%r8
+ xorq %r9,%r9
+ xorq %r10,%r10
+ xorq %r11,%r11
popfq
CFI_ADJUST_CFA_OFFSET -8
/*CFI_RESTORE rflags*/
@@ -200,9 +204,9 @@ sysexit_from_sys_call:
movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */
.endm
- .macro auditsys_exit exit,ebpsave=RBP
+ .macro auditsys_exit exit
testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
- jnz int_ret_from_sys_call
+ jnz ia32_ret_from_sys_call
TRACE_IRQS_ON
sti
movl %eax,%esi /* second arg, syscall return value */
@@ -213,13 +217,13 @@ sysexit_from_sys_call:
call audit_syscall_exit
GET_THREAD_INFO(%r10)
movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */
- movl \ebpsave-ARGOFFSET(%rsp),%ebp /* reload user register value */
movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
cli
TRACE_IRQS_OFF
testl %edi,TI_flags(%r10)
- jnz int_with_check
- jmp \exit
+ jz \exit
+ CLEAR_RREGS -ARGOFFSET
+ jmp int_with_check
.endm
sysenter_auditsys:
@@ -329,6 +333,9 @@ sysretl_from_sys_call:
CFI_REGISTER rip,rcx
movl EFLAGS-ARGOFFSET(%rsp),%r11d
/*CFI_REGISTER rflags,r11*/
+ xorq %r10,%r10
+ xorq %r9,%r9
+ xorq %r8,%r8
TRACE_IRQS_ON
movl RSP-ARGOFFSET(%rsp),%esp
CFI_RESTORE rsp
@@ -343,7 +350,7 @@ cstar_auditsys:
jmp cstar_dispatch
sysretl_audit:
- auditsys_exit sysretl_from_sys_call, RCX /* user %ebp in RCX slot */
+ auditsys_exit sysretl_from_sys_call
#endif
cstar_tracesys:
@@ -353,7 +360,7 @@ cstar_tracesys:
#endif
xchgl %r9d,%ebp
SAVE_REST
- CLEAR_RREGS r9
+ CLEAR_RREGS 0, r9
movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
movq %rsp,%rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
@@ -425,6 +432,8 @@ ia32_do_call:
call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
ia32_sysret:
movq %rax,RAX-ARGOFFSET(%rsp)
+ia32_ret_from_sys_call:
+ CLEAR_RREGS -ARGOFFSET
jmp int_ret_from_sys_call
ia32_tracesys:
@@ -442,8 +451,8 @@ END(ia32_syscall)
ia32_badsys:
movq $0,ORIG_RAX-ARGOFFSET(%rsp)
- movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
- jmp int_ret_from_sys_call
+ movq $-ENOSYS,%rax
+ jmp ia32_sysret
quiet_ni_syscall:
movq $-ENOSYS,%rax
@@ -831,5 +840,5 @@ ia32_sys_call_table:
.quad compat_sys_preadv
.quad compat_sys_pwritev
.quad compat_sys_rt_tgsigqueueinfo /* 335 */
- .quad sys_perf_counter_open
+ .quad sys_perf_event_open
ia32_syscall_end:
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 4a8e80cdcfa..9f828f87ca3 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -10,6 +10,7 @@ header-y += ptrace-abi.h
header-y += sigcontext32.h
header-y += ucontext.h
header-y += processor-flags.h
+header-y += hw_breakpoint.h
unifdef-y += e820.h
unifdef-y += ist.h
diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h
index bb70e397aa8..7a15588e45d 100644
--- a/arch/x86/include/asm/a.out-core.h
+++ b/arch/x86/include/asm/a.out-core.h
@@ -17,6 +17,7 @@
#include <linux/user.h>
#include <linux/elfcore.h>
+#include <asm/debugreg.h>
/*
* fill in the user structure for an a.out core dump
@@ -32,14 +33,7 @@ static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump)
>> PAGE_SHIFT;
dump->u_dsize -= dump->u_tsize;
dump->u_ssize = 0;
- dump->u_debugreg[0] = current->thread.debugreg0;
- dump->u_debugreg[1] = current->thread.debugreg1;
- dump->u_debugreg[2] = current->thread.debugreg2;
- dump->u_debugreg[3] = current->thread.debugreg3;
- dump->u_debugreg[4] = 0;
- dump->u_debugreg[5] = 0;
- dump->u_debugreg[6] = current->thread.debugreg6;
- dump->u_debugreg[7] = current->thread.debugreg7;
+ aout_dump_debugregs(dump);
if (dump->start_stack < TASK_SIZE)
dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack))
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 20d1465a2ab..4518dc50090 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -144,7 +144,6 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
#else /* !CONFIG_ACPI */
-#define acpi_disabled 1
#define acpi_lapic 0
#define acpi_ioapic 0
static inline void acpi_noirq_set(void) { }
diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h
index 9825cd64c9b..eec2a70d437 100644
--- a/arch/x86/include/asm/agp.h
+++ b/arch/x86/include/asm/agp.h
@@ -22,10 +22,6 @@
*/
#define flush_agp_cache() wbinvd()
-/* Convert a physical address to an address suitable for the GART. */
-#define phys_to_gart(x) (x)
-#define gart_to_phys(x) (x)
-
/* GATT allocation. Returns/accepts GATT kernel virtual address. */
#define alloc_gatt_pages(order) \
((char *)__get_free_pages(GFP_KERNEL, (order)))
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index e2077d343c3..b97f786a48d 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -1,17 +1,13 @@
#ifdef __ASSEMBLY__
-#ifdef CONFIG_X86_32
-# define X86_ALIGN .long
-#else
-# define X86_ALIGN .quad
-#endif
+#include <asm/asm.h>
#ifdef CONFIG_SMP
.macro LOCK_PREFIX
1: lock
.section .smp_locks,"a"
- .align 4
- X86_ALIGN 1b
+ _ASM_ALIGN
+ _ASM_PTR 1b
.previous
.endm
#else
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index c240efc74e0..69b74a7b877 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -84,6 +84,7 @@ static inline void alternatives_smp_switch(int smp) {}
" .byte " __stringify(feature) "\n" /* feature bit */ \
" .byte 662b-661b\n" /* sourcelen */ \
" .byte 664f-663f\n" /* replacementlen */ \
+ " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \
".previous\n" \
".section .altinstr_replacement, \"ax\"\n" \
"663:\n\t" newinstr "\n664:\n" /* replacement */ \
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index ac95995b7ba..5af2982133b 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
* Author: Joerg Roedel <joerg.roedel@amd.com>
* Leo Duran <leo.duran@amd.com>
*
@@ -23,18 +23,13 @@
#include <linux/irqreturn.h>
#ifdef CONFIG_AMD_IOMMU
-extern int amd_iommu_init(void);
-extern int amd_iommu_init_dma_ops(void);
-extern int amd_iommu_init_passthrough(void);
+
extern void amd_iommu_detect(void);
-extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
-extern void amd_iommu_flush_all_domains(void);
-extern void amd_iommu_flush_all_devices(void);
-extern void amd_iommu_shutdown(void);
+
#else
-static inline int amd_iommu_init(void) { return -ENODEV; }
+
static inline void amd_iommu_detect(void) { }
-static inline void amd_iommu_shutdown(void) { }
+
#endif
#endif /* _ASM_X86_AMD_IOMMU_H */
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h
new file mode 100644
index 00000000000..84786fb9a23
--- /dev/null
+++ b/arch/x86/include/asm/amd_iommu_proto.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2009 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _ASM_X86_AMD_IOMMU_PROTO_H
+#define _ASM_X86_AMD_IOMMU_PROTO_H
+
+struct amd_iommu;
+
+extern int amd_iommu_init_dma_ops(void);
+extern int amd_iommu_init_passthrough(void);
+extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
+extern void amd_iommu_flush_all_domains(void);
+extern void amd_iommu_flush_all_devices(void);
+extern void amd_iommu_apply_erratum_63(u16 devid);
+extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
+
+#ifndef CONFIG_AMD_IOMMU_STATS
+
+static inline void amd_iommu_stats_init(void) { }
+
+#endif /* !CONFIG_AMD_IOMMU_STATS */
+
+#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 2a2cc7a78a8..ba19ad4c47d 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
* Author: Joerg Roedel <joerg.roedel@amd.com>
* Leo Duran <leo.duran@amd.com>
*
@@ -25,6 +25,11 @@
#include <linux/spinlock.h>
/*
+ * Maximum number of IOMMUs supported
+ */
+#define MAX_IOMMUS 32
+
+/*
* some size calculation constants
*/
#define DEV_TABLE_ENTRY_SIZE 32
@@ -206,6 +211,9 @@ extern bool amd_iommu_dump;
printk(KERN_INFO "AMD-Vi: " format, ## arg); \
} while(0);
+/* global flag if IOMMUs cache non-present entries */
+extern bool amd_iommu_np_cache;
+
/*
* Make iterating over all IOMMUs easier
*/
@@ -226,6 +234,8 @@ extern bool amd_iommu_dump;
* independent of their use.
*/
struct protection_domain {
+ struct list_head list; /* for list of all protection domains */
+ struct list_head dev_list; /* List of all devices in this domain */
spinlock_t lock; /* mostly used to lock the page table*/
u16 id; /* the domain id written to the device table */
int mode; /* paging mode (0-6 levels) */
@@ -233,7 +243,20 @@ struct protection_domain {
unsigned long flags; /* flags to find out type of domain */
bool updated; /* complete domain flush required */
unsigned dev_cnt; /* devices assigned to this domain */
+ unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
void *priv; /* private data */
+
+};
+
+/*
+ * This struct contains device specific data for the IOMMU
+ */
+struct iommu_dev_data {
+ struct list_head list; /* For domain->dev_list */
+ struct device *dev; /* Device this data belong to */
+ struct device *alias; /* The Alias Device */
+ struct protection_domain *domain; /* Domain the device is bound to */
+ atomic_t bind; /* Domain attach reverent count */
};
/*
@@ -291,6 +314,9 @@ struct dma_ops_domain {
struct amd_iommu {
struct list_head list;
+ /* Index within the IOMMU array */
+ int index;
+
/* locks the accesses to the hardware */
spinlock_t lock;
@@ -357,6 +383,21 @@ struct amd_iommu {
extern struct list_head amd_iommu_list;
/*
+ * Array with pointers to each IOMMU struct
+ * The indices are referenced in the protection domains
+ */
+extern struct amd_iommu *amd_iommus[MAX_IOMMUS];
+
+/* Number of IOMMUs present in the system */
+extern int amd_iommus_present;
+
+/*
+ * Declarations for the global list of all protection domains
+ */
+extern spinlock_t amd_iommu_pd_lock;
+extern struct list_head amd_iommu_pd_list;
+
+/*
* Structure defining one entry in the device table
*/
struct dev_table_entry {
@@ -416,15 +457,9 @@ extern unsigned amd_iommu_aperture_order;
/* largest PCI device id we expect translation requests for */
extern u16 amd_iommu_last_bdf;
-/* data structures for protection domain handling */
-extern struct protection_domain **amd_iommu_pd_table;
-
/* allocation bitmap for domain ids */
extern unsigned long *amd_iommu_pd_alloc_bitmap;
-/* will be 1 if device isolation is enabled */
-extern bool amd_iommu_isolate;
-
/*
* If true, the addresses will be flushed on unmap time, not when
* they are reused
@@ -462,11 +497,6 @@ struct __iommu_counter {
#define ADD_STATS_COUNTER(name, x)
#define SUB_STATS_COUNTER(name, x)
-static inline void amd_iommu_stats_init(void) { }
-
#endif /* CONFIG_AMD_IOMMU_STATS */
-/* some function prototypes */
-extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
-
#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 586b7adb8e5..b4ac2cdcb64 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -66,13 +66,23 @@ static inline void default_inquire_remote_apic(int apicid)
}
/*
+ * With 82489DX we can't rely on apic feature bit
+ * retrieved via cpuid but still have to deal with
+ * such an apic chip so we assume that SMP configuration
+ * is found from MP table (64bit case uses ACPI mostly
+ * which set smp presence flag as well so we are safe
+ * to use this helper too).
+ */
+static inline bool apic_from_smp_config(void)
+{
+ return smp_found_config && !disable_apic;
+}
+
+/*
* Basic functions accessing APICs.
*/
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
-#else
-#define setup_boot_clock setup_boot_APIC_clock
-#define setup_secondary_clock setup_secondary_APIC_clock
#endif
#ifdef CONFIG_X86_64
@@ -252,6 +262,8 @@ static inline void lapic_shutdown(void) { }
static inline void init_apic_mappings(void) { }
static inline void disable_local_APIC(void) { }
static inline void apic_disable(void) { }
+# define setup_boot_APIC_clock x86_init_noop
+# define setup_secondary_APIC_clock x86_init_noop
#endif /* !CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_64
@@ -285,22 +297,22 @@ struct apic {
int disable_esr;
int dest_logical;
- unsigned long (*check_apicid_used)(physid_mask_t bitmap, int apicid);
+ unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid);
unsigned long (*check_apicid_present)(int apicid);
void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
void (*init_apic_ldr)(void);
- physid_mask_t (*ioapic_phys_id_map)(physid_mask_t map);
+ void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
void (*setup_apic_routing)(void);
int (*multi_timer_check)(int apic, int irq);
int (*apicid_to_node)(int logical_apicid);
int (*cpu_to_logical_apicid)(int cpu);
int (*cpu_present_to_apicid)(int mps_cpu);
- physid_mask_t (*apicid_to_cpu_present)(int phys_apicid);
+ void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap);
void (*setup_portio_remap)(void);
- int (*check_phys_apicid_present)(int boot_cpu_physical_apicid);
+ int (*check_phys_apicid_present)(int phys_apicid);
void (*enable_apic_mode)(void);
int (*phys_pkg_id)(int cpuid_apic, int index_msb);
@@ -434,7 +446,7 @@ extern struct apic apic_x2apic_uv_x;
DECLARE_PER_CPU(int, x2apic_extra_bits);
extern int default_cpu_present_to_apicid(int mps_cpu);
-extern int default_check_phys_apicid_present(int boot_cpu_physical_apicid);
+extern int default_check_phys_apicid_present(int phys_apicid);
#endif
static inline void default_wait_for_init_deassert(atomic_t *deassert)
@@ -476,6 +488,8 @@ static inline unsigned int read_apic_id(void)
extern void default_setup_apic_routing(void);
+extern struct apic apic_noop;
+
#ifdef CONFIG_X86_32
extern struct apic apic_default;
@@ -520,9 +534,9 @@ default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
return (unsigned int)(mask1 & mask2 & mask3);
}
-static inline unsigned long default_check_apicid_used(physid_mask_t bitmap, int apicid)
+static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid)
{
- return physid_isset(apicid, bitmap);
+ return physid_isset(apicid, *map);
}
static inline unsigned long default_check_apicid_present(int bit)
@@ -530,9 +544,9 @@ static inline unsigned long default_check_apicid_present(int bit)
return physid_isset(bit, phys_cpu_present_map);
}
-static inline physid_mask_t default_ioapic_phys_id_map(physid_mask_t phys_map)
+static inline void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
{
- return phys_map;
+ *retmap = *phys_map;
}
/* Mapping from cpu number to logical apicid */
@@ -550,9 +564,9 @@ static inline int __default_cpu_present_to_apicid(int mps_cpu)
}
static inline int
-__default_check_phys_apicid_present(int boot_cpu_physical_apicid)
+__default_check_phys_apicid_present(int phys_apicid)
{
- return physid_isset(boot_cpu_physical_apicid, phys_cpu_present_map);
+ return physid_isset(phys_apicid, phys_cpu_present_map);
}
#ifdef CONFIG_X86_32
@@ -562,20 +576,15 @@ static inline int default_cpu_present_to_apicid(int mps_cpu)
}
static inline int
-default_check_phys_apicid_present(int boot_cpu_physical_apicid)
+default_check_phys_apicid_present(int phys_apicid)
{
- return __default_check_phys_apicid_present(boot_cpu_physical_apicid);
+ return __default_check_phys_apicid_present(phys_apicid);
}
#else
extern int default_cpu_present_to_apicid(int mps_cpu);
-extern int default_check_phys_apicid_present(int boot_cpu_physical_apicid);
+extern int default_check_phys_apicid_present(int phys_apicid);
#endif
-static inline physid_mask_t default_apicid_to_cpu_present(int phys_apicid)
-{
- return physid_mask_of_physid(phys_apicid);
-}
-
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 7386bfa4f4b..7fe3b3060f0 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -11,10 +11,17 @@
#define IO_APIC_DEFAULT_PHYS_BASE 0xfec00000
#define APIC_DEFAULT_PHYS_BASE 0xfee00000
+/*
+ * This is the IO-APIC register space as specified
+ * by Intel docs:
+ */
+#define IO_APIC_SLOT_SIZE 1024
+
#define APIC_ID 0x20
#define APIC_LVR 0x30
#define APIC_LVR_MASK 0xFF00FF
+#define APIC_LVR_DIRECTED_EOI (1 << 24)
#define GET_APIC_VERSION(x) ((x) & 0xFFu)
#define GET_APIC_MAXLVT(x) (((x) >> 16) & 0xFFu)
#ifdef CONFIG_X86_32
@@ -41,6 +48,7 @@
#define APIC_DFR_CLUSTER 0x0FFFFFFFul
#define APIC_DFR_FLAT 0xFFFFFFFFul
#define APIC_SPIV 0xF0
+#define APIC_SPIV_DIRECTED_EOI (1 << 12)
#define APIC_SPIV_FOCUS_DISABLED (1 << 9)
#define APIC_SPIV_APIC_ENABLED (1 << 8)
#define APIC_ISR 0x100
diff --git a/arch/x86/include/asm/apicnum.h b/arch/x86/include/asm/apicnum.h
deleted file mode 100644
index 82f613c607c..00000000000
--- a/arch/x86/include/asm/apicnum.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _ASM_X86_APICNUM_H
-#define _ASM_X86_APICNUM_H
-
-/* define MAX_IO_APICS */
-#ifdef CONFIG_X86_32
-# define MAX_IO_APICS 64
-#else
-# define MAX_IO_APICS 128
-# define MAX_LOCAL_APIC 32768
-#endif
-
-#endif /* _ASM_X86_APICNUM_H */
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index 1724e8de317..6be33d83c71 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -85,7 +85,8 @@ struct efi_info {
struct boot_params {
struct screen_info screen_info; /* 0x000 */
struct apm_bios_info apm_bios_info; /* 0x040 */
- __u8 _pad2[12]; /* 0x054 */
+ __u8 _pad2[4]; /* 0x054 */
+ __u64 tboot_addr; /* 0x058 */
struct ist_info ist_info; /* 0x060 */
__u8 _pad3[16]; /* 0x070 */
__u8 hd0_info[16]; /* obsolete! */ /* 0x080 */
@@ -109,4 +110,14 @@ struct boot_params {
__u8 _pad9[276]; /* 0xeec */
} __attribute__((packed));
+enum {
+ X86_SUBARCH_PC = 0,
+ X86_SUBARCH_LGUEST,
+ X86_SUBARCH_XEN,
+ X86_SUBARCH_MRST,
+ X86_NR_SUBARCHS,
+};
+
+
+
#endif /* _ASM_X86_BOOTPARAM_H */
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index d9cf1cd156d..f654d1bb17f 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -22,14 +22,14 @@ do { \
".popsection" \
: : "i" (__FILE__), "i" (__LINE__), \
"i" (sizeof(struct bug_entry))); \
- for (;;) ; \
+ unreachable(); \
} while (0)
#else
#define BUG() \
do { \
asm volatile("ud2"); \
- for (;;) ; \
+ unreachable(); \
} while (0)
#endif
diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h
index 5d367caa0e3..549860d3be8 100644
--- a/arch/x86/include/asm/cache.h
+++ b/arch/x86/include/asm/cache.h
@@ -1,6 +1,8 @@
#ifndef _ASM_X86_CACHE_H
#define _ASM_X86_CACHE_H
+#include <linux/linkage.h>
+
/* L1 cache line size */
#define L1_CACHE_SHIFT (CONFIG_X86_L1_CACHE_SHIFT)
#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT)
@@ -13,7 +15,7 @@
#ifdef CONFIG_SMP
#define __cacheline_aligned_in_smp \
__attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \
- __attribute__((__section__(".data.page_aligned")))
+ __page_aligned_data
#endif
#endif
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index e55dfc1ad45..b54f6afe7ec 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -43,8 +43,58 @@ static inline void copy_from_user_page(struct vm_area_struct *vma,
memcpy(dst, src, len);
}
-#define PG_non_WB PG_arch_1
-PAGEFLAG(NonWB, non_WB)
+#define PG_WC PG_arch_1
+PAGEFLAG(WC, WC)
+
+#ifdef CONFIG_X86_PAT
+/*
+ * X86 PAT uses page flags WC and Uncached together to keep track of
+ * memory type of pages that have backing page struct. X86 PAT supports 3
+ * different memory types, _PAGE_CACHE_WB, _PAGE_CACHE_WC and
+ * _PAGE_CACHE_UC_MINUS and fourth state where page's memory type has not
+ * been changed from its default (value of -1 used to denote this).
+ * Note we do not support _PAGE_CACHE_UC here.
+ *
+ * Caller must hold memtype_lock for atomicity.
+ */
+static inline unsigned long get_page_memtype(struct page *pg)
+{
+ if (!PageUncached(pg) && !PageWC(pg))
+ return -1;
+ else if (!PageUncached(pg) && PageWC(pg))
+ return _PAGE_CACHE_WC;
+ else if (PageUncached(pg) && !PageWC(pg))
+ return _PAGE_CACHE_UC_MINUS;
+ else
+ return _PAGE_CACHE_WB;
+}
+
+static inline void set_page_memtype(struct page *pg, unsigned long memtype)
+{
+ switch (memtype) {
+ case _PAGE_CACHE_WC:
+ ClearPageUncached(pg);
+ SetPageWC(pg);
+ break;
+ case _PAGE_CACHE_UC_MINUS:
+ SetPageUncached(pg);
+ ClearPageWC(pg);
+ break;
+ case _PAGE_CACHE_WB:
+ SetPageUncached(pg);
+ SetPageWC(pg);
+ break;
+ default:
+ case -1:
+ ClearPageUncached(pg);
+ ClearPageWC(pg);
+ break;
+ }
+}
+#else
+static inline unsigned long get_page_memtype(struct page *pg) { return -1; }
+static inline void set_page_memtype(struct page *pg, unsigned long memtype) { }
+#endif
/*
* The set_memory_* API can be used to change various attributes of a virtual
diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h
index b03bedb62aa..0918654305a 100644
--- a/arch/x86/include/asm/calgary.h
+++ b/arch/x86/include/asm/calgary.h
@@ -62,10 +62,8 @@ struct cal_chipset_ops {
extern int use_calgary;
#ifdef CONFIG_CALGARY_IOMMU
-extern int calgary_iommu_init(void);
extern void detect_calgary(void);
#else
-static inline int calgary_iommu_init(void) { return 1; }
static inline void detect_calgary(void) { return; }
#endif
diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h
index 7c5ef8b14d9..46fc474fd81 100644
--- a/arch/x86/include/asm/checksum_32.h
+++ b/arch/x86/include/asm/checksum_32.h
@@ -161,7 +161,8 @@ static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
"adcl $0, %0 ;\n"
: "=&r" (sum)
: "r" (saddr), "r" (daddr),
- "r" (htonl(len)), "r" (htonl(proto)), "0" (sum));
+ "r" (htonl(len)), "r" (htonl(proto)), "0" (sum)
+ : "memory");
return csum_fold(sum);
}
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index 82ceb788a98..ffb9bb6b6c3 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -8,14 +8,50 @@
* you need to test for the feature in boot_cpu_data.
*/
-#define xchg(ptr, v) \
- ((__typeof__(*(ptr)))__xchg((unsigned long)(v), (ptr), sizeof(*(ptr))))
+extern void __xchg_wrong_size(void);
+
+/*
+ * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
+ * Note 2: xchg has side effect, so that attribute volatile is necessary,
+ * but generally the primitive is invalid, *ptr is output argument. --ANK
+ */
struct __xchg_dummy {
unsigned long a[100];
};
#define __xg(x) ((struct __xchg_dummy *)(x))
+#define __xchg(x, ptr, size) \
+({ \
+ __typeof(*(ptr)) __x = (x); \
+ switch (size) { \
+ case 1: \
+ asm volatile("xchgb %b0,%1" \
+ : "=q" (__x) \
+ : "m" (*__xg(ptr)), "0" (__x) \
+ : "memory"); \
+ break; \
+ case 2: \
+ asm volatile("xchgw %w0,%1" \
+ : "=r" (__x) \
+ : "m" (*__xg(ptr)), "0" (__x) \
+ : "memory"); \
+ break; \
+ case 4: \
+ asm volatile("xchgl %0,%1" \
+ : "=r" (__x) \
+ : "m" (*__xg(ptr)), "0" (__x) \
+ : "memory"); \
+ break; \
+ default: \
+ __xchg_wrong_size(); \
+ } \
+ __x; \
+})
+
+#define xchg(ptr, v) \
+ __xchg((v), (ptr), sizeof(*ptr))
+
/*
* The semantics of XCHGCMP8B are a bit strange, this is why
* there is a loop and the loading of %%eax and %%edx has to
@@ -71,57 +107,63 @@ static inline void __set_64bit_var(unsigned long long *ptr,
(unsigned int)((value) >> 32)) \
: __set_64bit(ptr, ll_low((value)), ll_high((value))))
-/*
- * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
- * Note 2: xchg has side effect, so that attribute volatile is necessary,
- * but generally the primitive is invalid, *ptr is output argument. --ANK
- */
-static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
- int size)
-{
- switch (size) {
- case 1:
- asm volatile("xchgb %b0,%1"
- : "=q" (x)
- : "m" (*__xg(ptr)), "0" (x)
- : "memory");
- break;
- case 2:
- asm volatile("xchgw %w0,%1"
- : "=r" (x)
- : "m" (*__xg(ptr)), "0" (x)
- : "memory");
- break;
- case 4:
- asm volatile("xchgl %0,%1"
- : "=r" (x)
- : "m" (*__xg(ptr)), "0" (x)
- : "memory");
- break;
- }
- return x;
-}
+extern void __cmpxchg_wrong_size(void);
/*
* Atomic compare and exchange. Compare OLD with MEM, if identical,
* store NEW in MEM. Return the initial value in MEM. Success is
* indicated by comparing RETURN with OLD.
*/
+#define __raw_cmpxchg(ptr, old, new, size, lock) \
+({ \
+ __typeof__(*(ptr)) __ret; \
+ __typeof__(*(ptr)) __old = (old); \
+ __typeof__(*(ptr)) __new = (new); \
+ switch (size) { \
+ case 1: \
+ asm volatile(lock "cmpxchgb %b1,%2" \
+ : "=a"(__ret) \
+ : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ : "memory"); \
+ break; \
+ case 2: \
+ asm volatile(lock "cmpxchgw %w1,%2" \
+ : "=a"(__ret) \
+ : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ : "memory"); \
+ break; \
+ case 4: \
+ asm volatile(lock "cmpxchgl %1,%2" \
+ : "=a"(__ret) \
+ : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ : "memory"); \
+ break; \
+ default: \
+ __cmpxchg_wrong_size(); \
+ } \
+ __ret; \
+})
+
+#define __cmpxchg(ptr, old, new, size) \
+ __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
+
+#define __sync_cmpxchg(ptr, old, new, size) \
+ __raw_cmpxchg((ptr), (old), (new), (size), "lock; ")
+
+#define __cmpxchg_local(ptr, old, new, size) \
+ __raw_cmpxchg((ptr), (old), (new), (size), "")
#ifdef CONFIG_X86_CMPXCHG
#define __HAVE_ARCH_CMPXCHG 1
-#define cmpxchg(ptr, o, n) \
- ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \
- (unsigned long)(n), \
- sizeof(*(ptr))))
-#define sync_cmpxchg(ptr, o, n) \
- ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \
- (unsigned long)(n), \
- sizeof(*(ptr))))
-#define cmpxchg_local(ptr, o, n) \
- ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \
- (unsigned long)(n), \
- sizeof(*(ptr))))
+
+#define cmpxchg(ptr, old, new) \
+ __cmpxchg((ptr), (old), (new), sizeof(*ptr))
+
+#define sync_cmpxchg(ptr, old, new) \
+ __sync_cmpxchg((ptr), (old), (new), sizeof(*ptr))
+
+#define cmpxchg_local(ptr, old, new) \
+ __cmpxchg_local((ptr), (old), (new), sizeof(*ptr))
#endif
#ifdef CONFIG_X86_CMPXCHG64
@@ -133,94 +175,6 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
(unsigned long long)(n)))
#endif
-static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
- unsigned long new, int size)
-{
- unsigned long prev;
- switch (size) {
- case 1:
- asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2"
- : "=a"(prev)
- : "q"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 2:
- asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 4:
- asm volatile(LOCK_PREFIX "cmpxchgl %1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- }
- return old;
-}
-
-/*
- * Always use locked operations when touching memory shared with a
- * hypervisor, since the system may be SMP even if the guest kernel
- * isn't.
- */
-static inline unsigned long __sync_cmpxchg(volatile void *ptr,
- unsigned long old,
- unsigned long new, int size)
-{
- unsigned long prev;
- switch (size) {
- case 1:
- asm volatile("lock; cmpxchgb %b1,%2"
- : "=a"(prev)
- : "q"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 2:
- asm volatile("lock; cmpxchgw %w1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 4:
- asm volatile("lock; cmpxchgl %1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- }
- return old;
-}
-
-static inline unsigned long __cmpxchg_local(volatile void *ptr,
- unsigned long old,
- unsigned long new, int size)
-{
- unsigned long prev;
- switch (size) {
- case 1:
- asm volatile("cmpxchgb %b1,%2"
- : "=a"(prev)
- : "q"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 2:
- asm volatile("cmpxchgw %w1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 4:
- asm volatile("cmpxchgl %1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- }
- return old;
-}
-
static inline unsigned long long __cmpxchg64(volatile void *ptr,
unsigned long long old,
unsigned long long new)
@@ -312,19 +266,23 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
-#define cmpxchg64(ptr, o, n) \
-({ \
- __typeof__(*(ptr)) __ret; \
- if (likely(boot_cpu_data.x86 > 4)) \
- __ret = (__typeof__(*(ptr)))__cmpxchg64((ptr), \
- (unsigned long long)(o), \
- (unsigned long long)(n)); \
- else \
- __ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr), \
- (unsigned long long)(o), \
- (unsigned long long)(n)); \
- __ret; \
-})
+#define cmpxchg64(ptr, o, n) \
+({ \
+ __typeof__(*(ptr)) __ret; \
+ __typeof__(*(ptr)) __old = (o); \
+ __typeof__(*(ptr)) __new = (n); \
+ alternative_io("call cmpxchg8b_emu", \
+ "lock; cmpxchg8b (%%esi)" , \
+ X86_FEATURE_CX8, \
+ "=A" (__ret), \
+ "S" ((ptr)), "0" (__old), \
+ "b" ((unsigned int)__new), \
+ "c" ((unsigned int)(__new>>32)) \
+ : "memory"); \
+ __ret; })
+
+
+
#define cmpxchg64_local(ptr, o, n) \
({ \
__typeof__(*(ptr)) __ret; \
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 52de72e0de8..485ae415fae 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -3,9 +3,6 @@
#include <asm/alternative.h> /* Provides LOCK_PREFIX */
-#define xchg(ptr, v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v), \
- (ptr), sizeof(*(ptr))))
-
#define __xg(x) ((volatile long *)(x))
static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
@@ -15,167 +12,118 @@ static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
#define _set_64bit set_64bit
+extern void __xchg_wrong_size(void);
+extern void __cmpxchg_wrong_size(void);
+
/*
* Note: no "lock" prefix even on SMP: xchg always implies lock anyway
* Note 2: xchg has side effect, so that attribute volatile is necessary,
* but generally the primitive is invalid, *ptr is output argument. --ANK
*/
-static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
- int size)
-{
- switch (size) {
- case 1:
- asm volatile("xchgb %b0,%1"
- : "=q" (x)
- : "m" (*__xg(ptr)), "0" (x)
- : "memory");
- break;
- case 2:
- asm volatile("xchgw %w0,%1"
- : "=r" (x)
- : "m" (*__xg(ptr)), "0" (x)
- : "memory");
- break;
- case 4:
- asm volatile("xchgl %k0,%1"
- : "=r" (x)
- : "m" (*__xg(ptr)), "0" (x)
- : "memory");
- break;
- case 8:
- asm volatile("xchgq %0,%1"
- : "=r" (x)
- : "m" (*__xg(ptr)), "0" (x)
- : "memory");
- break;
- }
- return x;
-}
+#define __xchg(x, ptr, size) \
+({ \
+ __typeof(*(ptr)) __x = (x); \
+ switch (size) { \
+ case 1: \
+ asm volatile("xchgb %b0,%1" \
+ : "=q" (__x) \
+ : "m" (*__xg(ptr)), "0" (__x) \
+ : "memory"); \
+ break; \
+ case 2: \
+ asm volatile("xchgw %w0,%1" \
+ : "=r" (__x) \
+ : "m" (*__xg(ptr)), "0" (__x) \
+ : "memory"); \
+ break; \
+ case 4: \
+ asm volatile("xchgl %k0,%1" \
+ : "=r" (__x) \
+ : "m" (*__xg(ptr)), "0" (__x) \
+ : "memory"); \
+ break; \
+ case 8: \
+ asm volatile("xchgq %0,%1" \
+ : "=r" (__x) \
+ : "m" (*__xg(ptr)), "0" (__x) \
+ : "memory"); \
+ break; \
+ default: \
+ __xchg_wrong_size(); \
+ } \
+ __x; \
+})
+
+#define xchg(ptr, v) \
+ __xchg((v), (ptr), sizeof(*ptr))
+
+#define __HAVE_ARCH_CMPXCHG 1
/*
* Atomic compare and exchange. Compare OLD with MEM, if identical,
* store NEW in MEM. Return the initial value in MEM. Success is
* indicated by comparing RETURN with OLD.
*/
+#define __raw_cmpxchg(ptr, old, new, size, lock) \
+({ \
+ __typeof__(*(ptr)) __ret; \
+ __typeof__(*(ptr)) __old = (old); \
+ __typeof__(*(ptr)) __new = (new); \
+ switch (size) { \
+ case 1: \
+ asm volatile(lock "cmpxchgb %b1,%2" \
+ : "=a"(__ret) \
+ : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ : "memory"); \
+ break; \
+ case 2: \
+ asm volatile(lock "cmpxchgw %w1,%2" \
+ : "=a"(__ret) \
+ : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ : "memory"); \
+ break; \
+ case 4: \
+ asm volatile(lock "cmpxchgl %k1,%2" \
+ : "=a"(__ret) \
+ : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ : "memory"); \
+ break; \
+ case 8: \
+ asm volatile(lock "cmpxchgq %1,%2" \
+ : "=a"(__ret) \
+ : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ : "memory"); \
+ break; \
+ default: \
+ __cmpxchg_wrong_size(); \
+ } \
+ __ret; \
+})
-#define __HAVE_ARCH_CMPXCHG 1
+#define __cmpxchg(ptr, old, new, size) \
+ __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
-static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
- unsigned long new, int size)
-{
- unsigned long prev;
- switch (size) {
- case 1:
- asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2"
- : "=a"(prev)
- : "q"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 2:
- asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 4:
- asm volatile(LOCK_PREFIX "cmpxchgl %k1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 8:
- asm volatile(LOCK_PREFIX "cmpxchgq %1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- }
- return old;
-}
+#define __sync_cmpxchg(ptr, old, new, size) \
+ __raw_cmpxchg((ptr), (old), (new), (size), "lock; ")
-/*
- * Always use locked operations when touching memory shared with a
- * hypervisor, since the system may be SMP even if the guest kernel
- * isn't.
- */
-static inline unsigned long __sync_cmpxchg(volatile void *ptr,
- unsigned long old,
- unsigned long new, int size)
-{
- unsigned long prev;
- switch (size) {
- case 1:
- asm volatile("lock; cmpxchgb %b1,%2"
- : "=a"(prev)
- : "q"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 2:
- asm volatile("lock; cmpxchgw %w1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 4:
- asm volatile("lock; cmpxchgl %1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- }
- return old;
-}
+#define __cmpxchg_local(ptr, old, new, size) \
+ __raw_cmpxchg((ptr), (old), (new), (size), "")
-static inline unsigned long __cmpxchg_local(volatile void *ptr,
- unsigned long old,
- unsigned long new, int size)
-{
- unsigned long prev;
- switch (size) {
- case 1:
- asm volatile("cmpxchgb %b1,%2"
- : "=a"(prev)
- : "q"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 2:
- asm volatile("cmpxchgw %w1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 4:
- asm volatile("cmpxchgl %k1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 8:
- asm volatile("cmpxchgq %1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- }
- return old;
-}
+#define cmpxchg(ptr, old, new) \
+ __cmpxchg((ptr), (old), (new), sizeof(*ptr))
+
+#define sync_cmpxchg(ptr, old, new) \
+ __sync_cmpxchg((ptr), (old), (new), sizeof(*ptr))
+
+#define cmpxchg_local(ptr, old, new) \
+ __cmpxchg_local((ptr), (old), (new), sizeof(*ptr))
-#define cmpxchg(ptr, o, n) \
- ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \
- (unsigned long)(n), sizeof(*(ptr))))
#define cmpxchg64(ptr, o, n) \
({ \
BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
cmpxchg((ptr), (o), (n)); \
})
-#define cmpxchg_local(ptr, o, n) \
- ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \
- (unsigned long)(n), \
- sizeof(*(ptr))))
-#define sync_cmpxchg(ptr, o, n) \
- ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \
- (unsigned long)(n), \
- sizeof(*(ptr))))
+
#define cmpxchg64_local(ptr, o, n) \
({ \
BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 847fee6493a..9cfc88b9774 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -96,6 +96,7 @@
#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */
#define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */
#define X86_FEATURE_AMD_DCM (3*32+27) /* multi-node processor */
+#define X86_FEATURE_APERFMPERF (3*32+28) /* APERFMPERF */
/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 3ea6f37be9e..8240f76b531 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -18,6 +18,7 @@
#define DR_TRAP1 (0x2) /* db1 */
#define DR_TRAP2 (0x4) /* db2 */
#define DR_TRAP3 (0x8) /* db3 */
+#define DR_TRAP_BITS (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)
#define DR_STEP (0x4000) /* single-step */
#define DR_SWITCH (0x8000) /* task switch */
@@ -49,6 +50,8 @@
#define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit */
#define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit */
+#define DR_LOCAL_ENABLE (0x1) /* Local enable for reg 0 */
+#define DR_GLOBAL_ENABLE (0x2) /* Global enable for reg 0 */
#define DR_ENABLE_SIZE 2 /* 2 enable bits per register */
#define DR_LOCAL_ENABLE_MASK (0x55) /* Set local bits for all 4 regs */
@@ -67,4 +70,34 @@
#define DR_LOCAL_SLOWDOWN (0x100) /* Local slow the pipeline */
#define DR_GLOBAL_SLOWDOWN (0x200) /* Global slow the pipeline */
+/*
+ * HW breakpoint additions
+ */
+#ifdef __KERNEL__
+
+DECLARE_PER_CPU(unsigned long, cpu_dr7);
+
+static inline void hw_breakpoint_disable(void)
+{
+ /* Zero the control register for HW Breakpoint */
+ set_debugreg(0UL, 7);
+
+ /* Zero-out the individual HW breakpoint address registers */
+ set_debugreg(0UL, 0);
+ set_debugreg(0UL, 1);
+ set_debugreg(0UL, 2);
+ set_debugreg(0UL, 3);
+}
+
+static inline int hw_breakpoint_active(void)
+{
+ return __get_cpu_var(cpu_dr7) & DR_GLOBAL_ENABLE_MASK;
+}
+
+extern void aout_dump_debugregs(struct user *dump);
+
+extern void hw_breakpoint_restore(void);
+
+#endif /* __KERNEL__ */
+
#endif /* _ASM_X86_DEBUGREG_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index e8de2f6f5ca..617bd56b307 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -288,7 +288,7 @@ static inline void load_LDT(mm_context_t *pc)
static inline unsigned long get_desc_base(const struct desc_struct *desc)
{
- return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24);
+ return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24));
}
static inline void set_desc_base(struct desc_struct *desc, unsigned long base)
diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h
index 4994a20acbc..029f230ab63 100644
--- a/arch/x86/include/asm/device.h
+++ b/arch/x86/include/asm/device.h
@@ -8,9 +8,12 @@ struct dev_archdata {
#ifdef CONFIG_X86_64
struct dma_map_ops *dma_ops;
#endif
-#ifdef CONFIG_DMAR
+#if defined(CONFIG_DMAR) || defined(CONFIG_AMD_IOMMU)
void *iommu; /* hook for IOMMU specific extension */
#endif
};
+struct pdev_archdata {
+};
+
#endif /* _ASM_X86_DEVICE_H */
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 0ee770d23d0..0f6c02f3b7d 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -14,7 +14,14 @@
#include <asm/swiotlb.h>
#include <asm-generic/dma-coherent.h>
-extern dma_addr_t bad_dma_address;
+#ifdef CONFIG_ISA
+# define ISA_DMA_BIT_MASK DMA_BIT_MASK(24)
+#else
+# define ISA_DMA_BIT_MASK DMA_BIT_MASK(32)
+#endif
+
+#define DMA_ERROR_CODE 0
+
extern int iommu_merge;
extern struct device x86_dma_fallback_dev;
extern int panic_on_overflow;
@@ -42,7 +49,7 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
if (ops->mapping_error)
return ops->mapping_error(dev, dma_addr);
- return (dma_addr == bad_dma_address);
+ return (dma_addr == DMA_ERROR_CODE);
}
#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
@@ -124,10 +131,8 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
return memory;
- if (!dev) {
+ if (!dev)
dev = &x86_dma_fallback_dev;
- gfp |= GFP_DMA;
- }
if (!is_device_dma_capable(dev))
return NULL;
diff --git a/arch/x86/include/asm/do_timer.h b/arch/x86/include/asm/do_timer.h
deleted file mode 100644
index 23ecda0b28a..00000000000
--- a/arch/x86/include/asm/do_timer.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* defines for inline arch setup functions */
-#include <linux/clockchips.h>
-
-#include <asm/i8259.h>
-#include <asm/i8253.h>
-
-/**
- * do_timer_interrupt_hook - hook into timer tick
- *
- * Call the pit clock event handler. see asm/i8253.h
- **/
-
-static inline void do_timer_interrupt_hook(void)
-{
- global_clock_event->event_handler(global_clock_event);
-}
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 7ecba4d8508..40b4e614fe7 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -126,8 +126,6 @@ extern void e820_reserve_resources(void);
extern void e820_reserve_resources_late(void);
extern void setup_memory_map(void);
extern char *default_machine_specific_memory_setup(void);
-extern char *machine_specific_memory_setup(void);
-extern char *memory_setup(void);
#endif /* __KERNEL__ */
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 83c1bc8d2e8..456a304b817 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -299,6 +299,8 @@ do { \
#ifdef CONFIG_X86_32
+#define STACK_RND_MASK (0x7ff)
+
#define VDSO_HIGH_BASE (__fix_to_virt(FIX_VDSO))
#define ARCH_DLINFO ARCH_DLINFO_IA32(vdso_enabled)
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index ff8cbfa0785..f5693c81a1d 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -49,7 +49,7 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
-#ifdef CONFIG_PERF_COUNTERS
+#ifdef CONFIG_PERF_EVENTS
BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
#endif
@@ -61,7 +61,7 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
#endif
-#ifdef CONFIG_X86_NEW_MCE
+#ifdef CONFIG_X86_MCE
BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR)
#endif
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 7b2d71df39a..14f9890eb49 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -132,6 +132,9 @@ enum fixed_addresses {
#ifdef CONFIG_X86_32
FIX_WP_TEST,
#endif
+#ifdef CONFIG_INTEL_TXT
+ FIX_TBOOT_BASE,
+#endif
__end_of_fixed_addresses
};
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 6cfdafa409d..4ac5b0f33fc 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -35,8 +35,7 @@ extern int gart_iommu_aperture_allowed;
extern int gart_iommu_aperture_disabled;
extern void early_gart_iommu_check(void);
-extern void gart_iommu_init(void);
-extern void gart_iommu_shutdown(void);
+extern int gart_iommu_init(void);
extern void __init gart_parse_options(char *);
extern void gart_iommu_hole_init(void);
@@ -48,12 +47,6 @@ extern void gart_iommu_hole_init(void);
static inline void early_gart_iommu_check(void)
{
}
-static inline void gart_iommu_init(void)
-{
-}
-static inline void gart_iommu_shutdown(void)
-{
-}
static inline void gart_parse_options(char *options)
{
}
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 82e3e8f0104..108eb6fd1ae 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -20,11 +20,11 @@ typedef struct {
unsigned int irq_call_count;
unsigned int irq_tlb_count;
#endif
-#ifdef CONFIG_X86_MCE
+#ifdef CONFIG_X86_THERMAL_VECTOR
unsigned int irq_thermal_count;
-# ifdef CONFIG_X86_MCE_THRESHOLD
+#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
unsigned int irq_threshold_count;
-# endif
#endif
} ____cacheline_aligned irq_cpustat_t;
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
new file mode 100644
index 00000000000..0675a7c4c20
--- /dev/null
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -0,0 +1,73 @@
+#ifndef _I386_HW_BREAKPOINT_H
+#define _I386_HW_BREAKPOINT_H
+
+#ifdef __KERNEL__
+#define __ARCH_HW_BREAKPOINT_H
+
+/*
+ * The name should probably be something dealt in
+ * a higher level. While dealing with the user
+ * (display/resolving)
+ */
+struct arch_hw_breakpoint {
+ char *name; /* Contains name of the symbol to set bkpt */
+ unsigned long address;
+ u8 len;
+ u8 type;
+};
+
+#include <linux/kdebug.h>
+#include <linux/percpu.h>
+#include <linux/list.h>
+
+/* Available HW breakpoint length encodings */
+#define X86_BREAKPOINT_LEN_1 0x40
+#define X86_BREAKPOINT_LEN_2 0x44
+#define X86_BREAKPOINT_LEN_4 0x4c
+#define X86_BREAKPOINT_LEN_EXECUTE 0x40
+
+#ifdef CONFIG_X86_64
+#define X86_BREAKPOINT_LEN_8 0x48
+#endif
+
+/* Available HW breakpoint type encodings */
+
+/* trigger on instruction execute */
+#define X86_BREAKPOINT_EXECUTE 0x80
+/* trigger on memory write */
+#define X86_BREAKPOINT_WRITE 0x81
+/* trigger on memory read or write */
+#define X86_BREAKPOINT_RW 0x83
+
+/* Total number of available HW breakpoint registers */
+#define HBP_NUM 4
+
+struct perf_event;
+struct pmu;
+
+extern int arch_check_va_in_userspace(unsigned long va, u8 hbp_len);
+extern int arch_validate_hwbkpt_settings(struct perf_event *bp,
+ struct task_struct *tsk);
+extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
+ unsigned long val, void *data);
+
+
+int arch_install_hw_breakpoint(struct perf_event *bp);
+void arch_uninstall_hw_breakpoint(struct perf_event *bp);
+void hw_breakpoint_pmu_read(struct perf_event *bp);
+void hw_breakpoint_pmu_unthrottle(struct perf_event *bp);
+
+extern void
+arch_fill_perf_breakpoint(struct perf_event *bp);
+
+unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type);
+int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type);
+
+extern int arch_bp_generic_fields(int x86_len, int x86_type,
+ int *gen_len, int *gen_type);
+
+extern struct pmu perf_ops_bp;
+
+#endif /* __KERNEL__ */
+#endif /* _I386_HW_BREAKPOINT_H */
+
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index ba180d93b08..6e124269fd4 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -79,14 +79,32 @@ static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
int ioapic, int ioapic_pin,
int trigger, int polarity)
{
- irq_attr->ioapic = ioapic;
- irq_attr->ioapic_pin = ioapic_pin;
- irq_attr->trigger = trigger;
- irq_attr->polarity = polarity;
+ irq_attr->ioapic = ioapic;
+ irq_attr->ioapic_pin = ioapic_pin;
+ irq_attr->trigger = trigger;
+ irq_attr->polarity = polarity;
}
-extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin,
- struct io_apic_irq_attr *irq_attr);
+/*
+ * This is performance-critical, we want to do it O(1)
+ *
+ * Most irqs are mapped 1:1 with pins.
+ */
+struct irq_cfg {
+ struct irq_pin_list *irq_2_pin;
+ cpumask_var_t domain;
+ cpumask_var_t old_domain;
+ u8 vector;
+ u8 move_in_progress : 1;
+};
+
+extern struct irq_cfg *irq_cfg(unsigned int);
+extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *);
+extern void send_cleanup_vector(struct irq_cfg *);
+
+struct irq_desc;
+extern unsigned int set_desc_affinity(struct irq_desc *, const struct cpumask *);
+extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, struct io_apic_irq_attr *irq_attr);
extern void setup_ioapic_dest(void);
extern void enable_IO_APIC(void);
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 369f5c5d09a..b78c0941e42 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -20,7 +20,7 @@
#ifndef ASM_X86__HYPERVISOR_H
#define ASM_X86__HYPERVISOR_H
-extern unsigned long get_hypervisor_tsc_freq(void);
extern void init_hypervisor(struct cpuinfo_x86 *c);
+extern void init_hypervisor_platform(void);
#endif
diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h
new file mode 100644
index 00000000000..205b063e3e3
--- /dev/null
+++ b/arch/x86/include/asm/inat.h
@@ -0,0 +1,220 @@
+#ifndef _ASM_X86_INAT_H
+#define _ASM_X86_INAT_H
+/*
+ * x86 instruction attributes
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+#include <asm/inat_types.h>
+
+/*
+ * Internal bits. Don't use bitmasks directly, because these bits are
+ * unstable. You should use checking functions.
+ */
+
+#define INAT_OPCODE_TABLE_SIZE 256
+#define INAT_GROUP_TABLE_SIZE 8
+
+/* Legacy last prefixes */
+#define INAT_PFX_OPNDSZ 1 /* 0x66 */ /* LPFX1 */
+#define INAT_PFX_REPE 2 /* 0xF3 */ /* LPFX2 */
+#define INAT_PFX_REPNE 3 /* 0xF2 */ /* LPFX3 */
+/* Other Legacy prefixes */
+#define INAT_PFX_LOCK 4 /* 0xF0 */
+#define INAT_PFX_CS 5 /* 0x2E */
+#define INAT_PFX_DS 6 /* 0x3E */
+#define INAT_PFX_ES 7 /* 0x26 */
+#define INAT_PFX_FS 8 /* 0x64 */
+#define INAT_PFX_GS 9 /* 0x65 */
+#define INAT_PFX_SS 10 /* 0x36 */
+#define INAT_PFX_ADDRSZ 11 /* 0x67 */
+/* x86-64 REX prefix */
+#define INAT_PFX_REX 12 /* 0x4X */
+/* AVX VEX prefixes */
+#define INAT_PFX_VEX2 13 /* 2-bytes VEX prefix */
+#define INAT_PFX_VEX3 14 /* 3-bytes VEX prefix */
+
+#define INAT_LSTPFX_MAX 3
+#define INAT_LGCPFX_MAX 11
+
+/* Immediate size */
+#define INAT_IMM_BYTE 1
+#define INAT_IMM_WORD 2
+#define INAT_IMM_DWORD 3
+#define INAT_IMM_QWORD 4
+#define INAT_IMM_PTR 5
+#define INAT_IMM_VWORD32 6
+#define INAT_IMM_VWORD 7
+
+/* Legacy prefix */
+#define INAT_PFX_OFFS 0
+#define INAT_PFX_BITS 4
+#define INAT_PFX_MAX ((1 << INAT_PFX_BITS) - 1)
+#define INAT_PFX_MASK (INAT_PFX_MAX << INAT_PFX_OFFS)
+/* Escape opcodes */
+#define INAT_ESC_OFFS (INAT_PFX_OFFS + INAT_PFX_BITS)
+#define INAT_ESC_BITS 2
+#define INAT_ESC_MAX ((1 << INAT_ESC_BITS) - 1)
+#define INAT_ESC_MASK (INAT_ESC_MAX << INAT_ESC_OFFS)
+/* Group opcodes (1-16) */
+#define INAT_GRP_OFFS (INAT_ESC_OFFS + INAT_ESC_BITS)
+#define INAT_GRP_BITS 5
+#define INAT_GRP_MAX ((1 << INAT_GRP_BITS) - 1)
+#define INAT_GRP_MASK (INAT_GRP_MAX << INAT_GRP_OFFS)
+/* Immediates */
+#define INAT_IMM_OFFS (INAT_GRP_OFFS + INAT_GRP_BITS)
+#define INAT_IMM_BITS 3
+#define INAT_IMM_MASK (((1 << INAT_IMM_BITS) - 1) << INAT_IMM_OFFS)
+/* Flags */
+#define INAT_FLAG_OFFS (INAT_IMM_OFFS + INAT_IMM_BITS)
+#define INAT_MODRM (1 << (INAT_FLAG_OFFS))
+#define INAT_FORCE64 (1 << (INAT_FLAG_OFFS + 1))
+#define INAT_SCNDIMM (1 << (INAT_FLAG_OFFS + 2))
+#define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 3))
+#define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 4))
+#define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5))
+#define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6))
+/* Attribute making macros for attribute tables */
+#define INAT_MAKE_PREFIX(pfx) (pfx << INAT_PFX_OFFS)
+#define INAT_MAKE_ESCAPE(esc) (esc << INAT_ESC_OFFS)
+#define INAT_MAKE_GROUP(grp) ((grp << INAT_GRP_OFFS) | INAT_MODRM)
+#define INAT_MAKE_IMM(imm) (imm << INAT_IMM_OFFS)
+
+/* Attribute search APIs */
+extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode);
+extern insn_attr_t inat_get_escape_attribute(insn_byte_t opcode,
+ insn_byte_t last_pfx,
+ insn_attr_t esc_attr);
+extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm,
+ insn_byte_t last_pfx,
+ insn_attr_t esc_attr);
+extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode,
+ insn_byte_t vex_m,
+ insn_byte_t vex_pp);
+
+/* Attribute checking functions */
+static inline int inat_is_legacy_prefix(insn_attr_t attr)
+{
+ attr &= INAT_PFX_MASK;
+ return attr && attr <= INAT_LGCPFX_MAX;
+}
+
+static inline int inat_is_address_size_prefix(insn_attr_t attr)
+{
+ return (attr & INAT_PFX_MASK) == INAT_PFX_ADDRSZ;
+}
+
+static inline int inat_is_operand_size_prefix(insn_attr_t attr)
+{
+ return (attr & INAT_PFX_MASK) == INAT_PFX_OPNDSZ;
+}
+
+static inline int inat_is_rex_prefix(insn_attr_t attr)
+{
+ return (attr & INAT_PFX_MASK) == INAT_PFX_REX;
+}
+
+static inline int inat_last_prefix_id(insn_attr_t attr)
+{
+ if ((attr & INAT_PFX_MASK) > INAT_LSTPFX_MAX)
+ return 0;
+ else
+ return attr & INAT_PFX_MASK;
+}
+
+static inline int inat_is_vex_prefix(insn_attr_t attr)
+{
+ attr &= INAT_PFX_MASK;
+ return attr == INAT_PFX_VEX2 || attr == INAT_PFX_VEX3;
+}
+
+static inline int inat_is_vex3_prefix(insn_attr_t attr)
+{
+ return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3;
+}
+
+static inline int inat_is_escape(insn_attr_t attr)
+{
+ return attr & INAT_ESC_MASK;
+}
+
+static inline int inat_escape_id(insn_attr_t attr)
+{
+ return (attr & INAT_ESC_MASK) >> INAT_ESC_OFFS;
+}
+
+static inline int inat_is_group(insn_attr_t attr)
+{
+ return attr & INAT_GRP_MASK;
+}
+
+static inline int inat_group_id(insn_attr_t attr)
+{
+ return (attr & INAT_GRP_MASK) >> INAT_GRP_OFFS;
+}
+
+static inline int inat_group_common_attribute(insn_attr_t attr)
+{
+ return attr & ~INAT_GRP_MASK;
+}
+
+static inline int inat_has_immediate(insn_attr_t attr)
+{
+ return attr & INAT_IMM_MASK;
+}
+
+static inline int inat_immediate_size(insn_attr_t attr)
+{
+ return (attr & INAT_IMM_MASK) >> INAT_IMM_OFFS;
+}
+
+static inline int inat_has_modrm(insn_attr_t attr)
+{
+ return attr & INAT_MODRM;
+}
+
+static inline int inat_is_force64(insn_attr_t attr)
+{
+ return attr & INAT_FORCE64;
+}
+
+static inline int inat_has_second_immediate(insn_attr_t attr)
+{
+ return attr & INAT_SCNDIMM;
+}
+
+static inline int inat_has_moffset(insn_attr_t attr)
+{
+ return attr & INAT_MOFFSET;
+}
+
+static inline int inat_has_variant(insn_attr_t attr)
+{
+ return attr & INAT_VARIANT;
+}
+
+static inline int inat_accept_vex(insn_attr_t attr)
+{
+ return attr & INAT_VEXOK;
+}
+
+static inline int inat_must_vex(insn_attr_t attr)
+{
+ return attr & INAT_VEXONLY;
+}
+#endif
diff --git a/arch/x86/include/asm/inat_types.h b/arch/x86/include/asm/inat_types.h
new file mode 100644
index 00000000000..cb3c20ce39c
--- /dev/null
+++ b/arch/x86/include/asm/inat_types.h
@@ -0,0 +1,29 @@
+#ifndef _ASM_X86_INAT_TYPES_H
+#define _ASM_X86_INAT_TYPES_H
+/*
+ * x86 instruction attributes
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+
+/* Instruction attributes */
+typedef unsigned int insn_attr_t;
+typedef unsigned char insn_byte_t;
+typedef signed int insn_value_t;
+
+#endif
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
new file mode 100644
index 00000000000..96c2e0ad04c
--- /dev/null
+++ b/arch/x86/include/asm/insn.h
@@ -0,0 +1,184 @@
+#ifndef _ASM_X86_INSN_H
+#define _ASM_X86_INSN_H
+/*
+ * x86 instruction analysis
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+/* insn_attr_t is defined in inat.h */
+#include <asm/inat.h>
+
+struct insn_field {
+ union {
+ insn_value_t value;
+ insn_byte_t bytes[4];
+ };
+ /* !0 if we've run insn_get_xxx() for this field */
+ unsigned char got;
+ unsigned char nbytes;
+};
+
+struct insn {
+ struct insn_field prefixes; /*
+ * Prefixes
+ * prefixes.bytes[3]: last prefix
+ */
+ struct insn_field rex_prefix; /* REX prefix */
+ struct insn_field vex_prefix; /* VEX prefix */
+ struct insn_field opcode; /*
+ * opcode.bytes[0]: opcode1
+ * opcode.bytes[1]: opcode2
+ * opcode.bytes[2]: opcode3
+ */
+ struct insn_field modrm;
+ struct insn_field sib;
+ struct insn_field displacement;
+ union {
+ struct insn_field immediate;
+ struct insn_field moffset1; /* for 64bit MOV */
+ struct insn_field immediate1; /* for 64bit imm or off16/32 */
+ };
+ union {
+ struct insn_field moffset2; /* for 64bit MOV */
+ struct insn_field immediate2; /* for 64bit imm or seg16 */
+ };
+
+ insn_attr_t attr;
+ unsigned char opnd_bytes;
+ unsigned char addr_bytes;
+ unsigned char length;
+ unsigned char x86_64;
+
+ const insn_byte_t *kaddr; /* kernel address of insn to analyze */
+ const insn_byte_t *next_byte;
+};
+
+#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
+#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
+#define X86_MODRM_RM(modrm) ((modrm) & 0x07)
+
+#define X86_SIB_SCALE(sib) (((sib) & 0xc0) >> 6)
+#define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3)
+#define X86_SIB_BASE(sib) ((sib) & 0x07)
+
+#define X86_REX_W(rex) ((rex) & 8)
+#define X86_REX_R(rex) ((rex) & 4)
+#define X86_REX_X(rex) ((rex) & 2)
+#define X86_REX_B(rex) ((rex) & 1)
+
+/* VEX bit flags */
+#define X86_VEX_W(vex) ((vex) & 0x80) /* VEX3 Byte2 */
+#define X86_VEX_R(vex) ((vex) & 0x80) /* VEX2/3 Byte1 */
+#define X86_VEX_X(vex) ((vex) & 0x40) /* VEX3 Byte1 */
+#define X86_VEX_B(vex) ((vex) & 0x20) /* VEX3 Byte1 */
+#define X86_VEX_L(vex) ((vex) & 0x04) /* VEX3 Byte2, VEX2 Byte1 */
+/* VEX bit fields */
+#define X86_VEX3_M(vex) ((vex) & 0x1f) /* VEX3 Byte1 */
+#define X86_VEX2_M 1 /* VEX2.M always 1 */
+#define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */
+#define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */
+#define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */
+
+/* The last prefix is needed for two-byte and three-byte opcodes */
+static inline insn_byte_t insn_last_prefix(struct insn *insn)
+{
+ return insn->prefixes.bytes[3];
+}
+
+extern void insn_init(struct insn *insn, const void *kaddr, int x86_64);
+extern void insn_get_prefixes(struct insn *insn);
+extern void insn_get_opcode(struct insn *insn);
+extern void insn_get_modrm(struct insn *insn);
+extern void insn_get_sib(struct insn *insn);
+extern void insn_get_displacement(struct insn *insn);
+extern void insn_get_immediate(struct insn *insn);
+extern void insn_get_length(struct insn *insn);
+
+/* Attribute will be determined after getting ModRM (for opcode groups) */
+static inline void insn_get_attribute(struct insn *insn)
+{
+ insn_get_modrm(insn);
+}
+
+/* Instruction uses RIP-relative addressing */
+extern int insn_rip_relative(struct insn *insn);
+
+/* Init insn for kernel text */
+static inline void kernel_insn_init(struct insn *insn, const void *kaddr)
+{
+#ifdef CONFIG_X86_64
+ insn_init(insn, kaddr, 1);
+#else /* CONFIG_X86_32 */
+ insn_init(insn, kaddr, 0);
+#endif
+}
+
+static inline int insn_is_avx(struct insn *insn)
+{
+ if (!insn->prefixes.got)
+ insn_get_prefixes(insn);
+ return (insn->vex_prefix.value != 0);
+}
+
+static inline insn_byte_t insn_vex_m_bits(struct insn *insn)
+{
+ if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */
+ return X86_VEX2_M;
+ else
+ return X86_VEX3_M(insn->vex_prefix.bytes[1]);
+}
+
+static inline insn_byte_t insn_vex_p_bits(struct insn *insn)
+{
+ if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */
+ return X86_VEX_P(insn->vex_prefix.bytes[1]);
+ else
+ return X86_VEX_P(insn->vex_prefix.bytes[2]);
+}
+
+/* Offset of each field from kaddr */
+static inline int insn_offset_rex_prefix(struct insn *insn)
+{
+ return insn->prefixes.nbytes;
+}
+static inline int insn_offset_vex_prefix(struct insn *insn)
+{
+ return insn_offset_rex_prefix(insn) + insn->rex_prefix.nbytes;
+}
+static inline int insn_offset_opcode(struct insn *insn)
+{
+ return insn_offset_vex_prefix(insn) + insn->vex_prefix.nbytes;
+}
+static inline int insn_offset_modrm(struct insn *insn)
+{
+ return insn_offset_opcode(insn) + insn->opcode.nbytes;
+}
+static inline int insn_offset_sib(struct insn *insn)
+{
+ return insn_offset_modrm(insn) + insn->modrm.nbytes;
+}
+static inline int insn_offset_displacement(struct insn *insn)
+{
+ return insn_offset_sib(insn) + insn->sib.nbytes;
+}
+static inline int insn_offset_immediate(struct insn *insn)
+{
+ return insn_offset_displacement(insn) + insn->displacement.nbytes;
+}
+
+#endif /* _ASM_X86_INSN_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 85232d32fcb..7c7c16cde1f 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -143,6 +143,8 @@ extern int noioapicreroute;
/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */
extern int timer_through_8259;
+extern void io_apic_disable_legacy(void);
+
/*
* If we use the IO-APIC for IRQ routing, disable automatic
* assignment of PCI IRQ's.
@@ -176,6 +178,7 @@ extern int setup_ioapic_entry(int apic, int irq,
int polarity, int vector, int pin);
extern void ioapic_write_entry(int apic, int pin,
struct IO_APIC_route_entry e);
+extern void setup_ioapic_ids_from_mpc(void);
struct mp_ioapic_gsi{
int gsi_base;
@@ -187,12 +190,14 @@ int mp_find_ioapic_pin(int ioapic, int gsi);
void __init mp_register_ioapic(int id, u32 address, u32 gsi_base);
#else /* !CONFIG_X86_IO_APIC */
+
#define io_apic_assign_pci_irqs 0
+#define setup_ioapic_ids_from_mpc x86_init_noop
static const int timer_through_8259 = 0;
static inline void ioapic_init_mappings(void) { }
static inline void ioapic_insert_resources(void) { }
-
static inline void probe_nr_irqs_gsi(void) { }
+
#endif
#endif /* _ASM_X86_IO_APIC_H */
diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h
index 0e9fe1d9d97..f35eb45d657 100644
--- a/arch/x86/include/asm/iomap.h
+++ b/arch/x86/include/asm/iomap.h
@@ -26,13 +26,16 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
-int
-is_io_mapping_possible(resource_size_t base, unsigned long size);
-
void *
iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
void
iounmap_atomic(void *kvaddr, enum km_type type);
+int
+iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot);
+
+void
+iomap_free(resource_size_t base, unsigned long size);
+
#endif /* _ASM_X86_IOMAP_H */
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index fd6d21bbee6..345c99cef15 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -1,8 +1,6 @@
#ifndef _ASM_X86_IOMMU_H
#define _ASM_X86_IOMMU_H
-extern void pci_iommu_shutdown(void);
-extern void no_iommu_init(void);
extern struct dma_map_ops nommu_dma_ops;
extern int force_iommu, no_iommu;
extern int iommu_detected;
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index f38481bcd45..ffd700ff5dc 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -34,10 +34,10 @@ static inline int irq_canonicalize(int irq)
#ifdef CONFIG_HOTPLUG_CPU
#include <linux/cpumask.h>
extern void fixup_irqs(void);
+extern void irq_force_complete_move(int);
#endif
extern void (*generic_interrupt_extension)(void);
-extern void init_IRQ(void);
extern void native_init_IRQ(void);
extern bool handle_irq(unsigned irq, struct pt_regs *regs);
@@ -47,4 +47,6 @@ extern unsigned int do_IRQ(struct pt_regs *regs);
extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
extern int vector_used_by_percpu_irq(unsigned int vector);
+extern void init_ISA_irqs(void);
+
#endif /* _ASM_X86_IRQ_H */
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 125be8b1956..4a5fe914dc5 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -17,6 +17,8 @@
#define __KVM_HAVE_USER_NMI
#define __KVM_HAVE_GUEST_DEBUG
#define __KVM_HAVE_MSIX
+#define __KVM_HAVE_MCE
+#define __KVM_HAVE_PIT_STATE2
/* Architectural interrupt line count. */
#define KVM_NR_INTERRUPTS 256
@@ -236,6 +238,14 @@ struct kvm_pit_state {
struct kvm_pit_channel_state channels[3];
};
+#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001
+
+struct kvm_pit_state2 {
+ struct kvm_pit_channel_state channels[3];
+ __u32 flags;
+ __u32 reserved[9];
+};
+
struct kvm_reinject_control {
__u8 pit_reinject;
__u8 reserved[31];
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b7ed2c42311..b7ed2c42311 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index eabdc1cfab5..d83892226f7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -14,6 +14,7 @@
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/mmu_notifier.h>
+#include <linux/tracepoint.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
@@ -37,12 +38,14 @@
#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
0xFFFFFF0000000000ULL)
-#define KVM_GUEST_CR0_MASK \
- (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \
- | X86_CR0_NW | X86_CR0_CD)
+#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
+ (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
+#define KVM_GUEST_CR0_MASK \
+ (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
+#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
+ (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP)
#define KVM_VM_CR0_ALWAYS_ON \
- (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \
- | X86_CR0_MP)
+ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
#define KVM_GUEST_CR4_MASK \
(X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
@@ -51,12 +54,12 @@
#define INVALID_PAGE (~(hpa_t)0)
#define UNMAPPED_GVA (~(gpa_t)0)
-/* shadow tables are PAE even on non-PAE hosts */
-#define KVM_HPAGE_SHIFT 21
-#define KVM_HPAGE_SIZE (1UL << KVM_HPAGE_SHIFT)
-#define KVM_HPAGE_MASK (~(KVM_HPAGE_SIZE - 1))
-
-#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE)
+/* KVM Hugepage definitions for x86 */
+#define KVM_NR_PAGE_SIZES 3
+#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9))
+#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
+#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
+#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
#define DE_VECTOR 0
#define DB_VECTOR 1
@@ -120,6 +123,10 @@ enum kvm_reg {
NR_VCPU_REGS
};
+enum kvm_reg_ex {
+ VCPU_EXREG_PDPTR = NR_VCPU_REGS,
+};
+
enum {
VCPU_SREG_ES,
VCPU_SREG_CS,
@@ -131,7 +138,7 @@ enum {
VCPU_SREG_LDTR,
};
-#include <asm/kvm_x86_emulate.h>
+#include <asm/kvm_emulate.h>
#define KVM_NR_MEM_OBJS 40
@@ -308,7 +315,6 @@ struct kvm_vcpu_arch {
struct {
gfn_t gfn; /* presumed gfn during guest pte update */
pfn_t pfn; /* pfn corresponding to that gfn */
- int largepage;
unsigned long mmu_seq;
} update_pte;
@@ -334,16 +340,6 @@ struct kvm_vcpu_arch {
u8 nr;
} interrupt;
- struct {
- int vm86_active;
- u8 save_iopl;
- struct kvm_save_segment {
- u16 selector;
- unsigned long base;
- u32 limit;
- u32 ar;
- } tr, es, ds, fs, gs;
- } rmode;
int halt_request; /* real mode on Intel only */
int cpuid_nent;
@@ -366,13 +362,15 @@ struct kvm_vcpu_arch {
u32 pat;
int switch_db_regs;
- unsigned long host_db[KVM_NR_DB_REGS];
- unsigned long host_dr6;
- unsigned long host_dr7;
unsigned long db[KVM_NR_DB_REGS];
unsigned long dr6;
unsigned long dr7;
unsigned long eff_db[KVM_NR_DB_REGS];
+
+ u64 mcg_cap;
+ u64 mcg_status;
+ u64 mcg_ctl;
+ u64 *mce_banks;
};
struct kvm_mem_alias {
@@ -409,6 +407,7 @@ struct kvm_arch{
struct page *ept_identity_pagetable;
bool ept_identity_pagetable_done;
+ gpa_t ept_identity_map_addr;
unsigned long irq_sources_bitmap;
unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
@@ -526,6 +525,9 @@ struct kvm_x86_ops {
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*get_tdp_level)(void);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
+ bool (*gb_page_enable)(void);
+
+ const struct trace_print_flags *exit_reasons_str;
};
extern struct kvm_x86_ops *kvm_x86_ops;
@@ -618,6 +620,7 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
u32 error_code);
+bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
int kvm_pic_set_irq(void *opaque, int irq, int level);
@@ -752,8 +755,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
}
-#define MSR_IA32_TIME_STAMP_COUNTER 0x010
-
#define TSS_IOPB_BASE_OFFSET 0x66
#define TSS_BASE_SIZE 0x68
#define TSS_IOPB_SIZE (65536 / 8)
@@ -795,6 +796,10 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
#define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
+int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
+int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index b8a3305ae09..c584076a47f 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -1,6 +1,8 @@
#ifndef _ASM_X86_KVM_PARA_H
#define _ASM_X86_KVM_PARA_H
+#include <linux/types.h>
+
/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
* should be used to determine that a VM is running under KVM.
*/
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 5cdd8d100ec..858baa061cf 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -9,7 +9,7 @@
*/
#define MCG_BANKCNT_MASK 0xff /* Number of Banks */
-#define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */
+#define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */
#define MCG_EXT_P (1ULL<<9) /* Extended registers available */
#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */
#define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */
@@ -38,6 +38,14 @@
#define MCM_ADDR_MEM 3 /* memory address */
#define MCM_ADDR_GENERIC 7 /* generic */
+#define MCJ_CTX_MASK 3
+#define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK)
+#define MCJ_CTX_RANDOM 0 /* inject context: random */
+#define MCJ_CTX_PROCESS 1 /* inject context: process */
+#define MCJ_CTX_IRQ 2 /* inject context: IRQ */
+#define MCJ_NMI_BROADCAST 4 /* do NMI broadcasting */
+#define MCJ_EXCEPTION 8 /* raise as exception */
+
/* Fields are zero when not available */
struct mce {
__u64 status;
@@ -48,8 +56,8 @@ struct mce {
__u64 tsc; /* cpu time stamp counter */
__u64 time; /* wall time_t when error was detected */
__u8 cpuvendor; /* cpu vendor as encoded in system.h */
- __u8 pad1;
- __u16 pad2;
+ __u8 inject_flags; /* software inject flags */
+ __u16 pad;
__u32 cpuid; /* CPUID 1 EAX */
__u8 cs; /* code segment */
__u8 bank; /* machine check bank */
@@ -100,6 +108,8 @@ struct mce_log {
#define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9)
#define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0)
+extern struct atomic_notifier_head x86_mce_decoder_chain;
+
#ifdef __KERNEL__
#include <linux/percpu.h>
@@ -110,16 +120,11 @@ extern int mce_disabled;
extern int mce_p5_enabled;
#ifdef CONFIG_X86_MCE
-void mcheck_init(struct cpuinfo_x86 *c);
+int mcheck_init(void);
+void mcheck_cpu_init(struct cpuinfo_x86 *c);
#else
-static inline void mcheck_init(struct cpuinfo_x86 *c) {}
-#endif
-
-#ifdef CONFIG_X86_OLD_MCE
-extern int nr_mce_banks;
-void amd_mcheck_init(struct cpuinfo_x86 *c);
-void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
-void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
+static inline int mcheck_init(void) { return 0; }
+static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
#endif
#ifdef CONFIG_X86_ANCIENT_MCE
@@ -132,15 +137,18 @@ static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
static inline void enable_p5_mce(void) {}
#endif
+extern void (*x86_mce_decode_callback)(struct mce *m);
+
void mce_setup(struct mce *m);
void mce_log(struct mce *m);
DECLARE_PER_CPU(struct sys_device, mce_dev);
/*
- * To support more than 128 would need to escape the predefined
- * Linux defined extended banks first.
+ * Maximum banks number.
+ * This is the limit of the current register layout on
+ * Intel CPUs.
*/
-#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
+#define MAX_NR_BANKS 32
#ifdef CONFIG_X86_MCE_INTEL
extern int mce_cmci_disabled;
@@ -208,10 +216,12 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
void intel_init_thermal(struct cpuinfo_x86 *c);
-#ifdef CONFIG_X86_NEW_MCE
void mce_log_therm_throt_event(__u64 status);
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+extern void mcheck_intel_therm_init(void);
#else
-static inline void mce_log_therm_throt_event(__u64 status) {}
+static inline void mcheck_intel_therm_init(void) { }
#endif
#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index f923203dc39..4a2d4e0c18d 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -37,12 +37,12 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
if (likely(prev != next)) {
/* stop flush ipis for the previous mm */
- cpu_clear(cpu, prev->cpu_vm_mask);
+ cpumask_clear_cpu(cpu, mm_cpumask(prev));
#ifdef CONFIG_SMP
percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
percpu_write(cpu_tlbstate.active_mm, next);
#endif
- cpu_set(cpu, next->cpu_vm_mask);
+ cpumask_set_cpu(cpu, mm_cpumask(next));
/* Re-load page tables */
load_cr3(next->pgd);
@@ -58,7 +58,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
- if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
+ if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) {
/* We were in lazy tlb mode and leave_mm disabled
* tlb flush IPI delivery. We must reload CR3
* to make sure to use no freed page tables.
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index e2a1bb6d71e..61d90b1331c 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -4,6 +4,7 @@
#include <linux/init.h>
#include <asm/mpspec_def.h>
+#include <asm/x86_init.h>
extern int apic_version[MAX_APICS];
extern int pic_mode;
@@ -41,9 +42,6 @@ extern int quad_local_to_mp_bus_id [NR_CPUS/4][4];
#endif /* CONFIG_X86_64 */
-extern void early_find_smp_config(void);
-extern void early_get_smp_config(void);
-
#if defined(CONFIG_MCA) || defined(CONFIG_EISA)
extern int mp_bus_id_to_type[MAX_MP_BUSSES];
#endif
@@ -52,20 +50,55 @@ extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
extern unsigned int boot_cpu_physical_apicid;
extern unsigned int max_physical_apicid;
-extern int smp_found_config;
extern int mpc_default_type;
extern unsigned long mp_lapic_addr;
-extern void get_smp_config(void);
+#ifdef CONFIG_X86_LOCAL_APIC
+extern int smp_found_config;
+#else
+# define smp_found_config 0
+#endif
+
+static inline void get_smp_config(void)
+{
+ x86_init.mpparse.get_smp_config(0);
+}
+
+static inline void early_get_smp_config(void)
+{
+ x86_init.mpparse.get_smp_config(1);
+}
+
+static inline void find_smp_config(void)
+{
+ x86_init.mpparse.find_smp_config(1);
+}
+
+static inline void early_find_smp_config(void)
+{
+ x86_init.mpparse.find_smp_config(0);
+}
#ifdef CONFIG_X86_MPPARSE
-extern void find_smp_config(void);
extern void early_reserve_e820_mpc_new(void);
extern int enable_update_mptable;
+extern int default_mpc_apic_id(struct mpc_cpu *m);
+extern void default_smp_read_mpc_oem(struct mpc_table *mpc);
+# ifdef CONFIG_X86_IO_APIC
+extern void default_mpc_oem_bus_info(struct mpc_bus *m, char *str);
+# else
+# define default_mpc_oem_bus_info NULL
+# endif
+extern void default_find_smp_config(unsigned int reserve);
+extern void default_get_smp_config(unsigned int early);
#else
-static inline void find_smp_config(void) { }
static inline void early_reserve_e820_mpc_new(void) { }
#define enable_update_mptable 0
+#define default_mpc_apic_id NULL
+#define default_smp_read_mpc_oem NULL
+#define default_mpc_oem_bus_info NULL
+#define default_find_smp_config x86_init_uint_noop
+#define default_get_smp_config x86_init_uint_noop
#endif
void __cpuinit generic_processor_info(int apicid, int version);
@@ -130,14 +163,16 @@ typedef struct physid_mask physid_mask_t;
#define physids_shift_left(d, s, n) \
bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS)
-#define physids_coerce(map) ((map).mask[0])
+static inline unsigned long physids_coerce(physid_mask_t *map)
+{
+ return map->mask[0];
+}
-#define physids_promote(physids) \
- ({ \
- physid_mask_t __physid_mask = PHYSID_MASK_NONE; \
- __physid_mask.mask[0] = physids; \
- __physid_mask; \
- })
+static inline void physids_promote(unsigned long physids, physid_mask_t *map)
+{
+ physids_clear(*map);
+ map->mask[0] = physids;
+}
/* Note: will create very large stack frames if physid_mask_t is big */
#define physid_mask_of_physid(physid) \
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 6be7fc254b5..4ffe09b2ad7 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -81,8 +81,15 @@
#define MSR_IA32_MC0_ADDR 0x00000402
#define MSR_IA32_MC0_MISC 0x00000403
+#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x))
+#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x))
+#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x))
+#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x))
+
/* These are consecutive and not in the normal 4er MCE bank block */
#define MSR_IA32_MC0_CTL2 0x00000280
+#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x))
+
#define CMCI_EN (1ULL << 30)
#define CMCI_THRESHOLD_MASK 0xffffULL
@@ -215,6 +222,10 @@
#define THERM_STATUS_PROCHOT (1 << 0)
+#define MSR_THERM2_CTL 0x0000019d
+
+#define MSR_THERM2_CTL_TM_SELECT (1ULL << 16)
+
#define MSR_IA32_MISC_ENABLE 0x000001a0
/* MISC_ENABLE bits: architectural */
@@ -374,6 +385,7 @@
/* AMD-V MSRs */
#define MSR_VM_CR 0xc0010114
+#define MSR_VM_IGNNE 0xc0010115
#define MSR_VM_HSAVE_PA 0xc0010117
#endif /* _ASM_X86_MSR_INDEX_H */
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index a51ada8467d..4365ffdb461 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -121,6 +121,9 @@ extern int mtrr_del_page(int reg, unsigned long base, unsigned long size);
extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi);
extern void mtrr_ap_init(void);
extern void mtrr_bp_init(void);
+extern void set_mtrr_aps_delayed_init(void);
+extern void mtrr_aps_init(void);
+extern void mtrr_bp_restore(void);
extern int mtrr_trim_uncached_memory(unsigned long end_pfn);
extern int amd_special_default_mtrr(void);
# else
@@ -161,6 +164,9 @@ static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
#define mtrr_ap_init() do {} while (0)
#define mtrr_bp_init() do {} while (0)
+#define set_mtrr_aps_delayed_init() do {} while (0)
+#define mtrr_aps_init() do {} while (0)
+#define mtrr_bp_restore() do {} while (0)
# endif
#ifdef CONFIG_COMPAT
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index e63cf7d441e..139d4c1a33a 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -40,8 +40,7 @@ extern unsigned int nmi_watchdog;
#define NMI_INVALID 3
struct ctl_table;
-struct file;
-extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
+extern int proc_nmi_enabled(struct ctl_table *, int ,
void __user *, size_t *, loff_t *);
extern int unknown_nmi_panic;
diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h
index ad2668ee1aa..6d8723a766c 100644
--- a/arch/x86/include/asm/nops.h
+++ b/arch/x86/include/asm/nops.h
@@ -65,6 +65,8 @@
6: osp nopl 0x00(%eax,%eax,1)
7: nopl 0x00000000(%eax)
8: nopl 0x00000000(%eax,%eax,1)
+ Note: All the above are assumed to be a single instruction.
+ There is kernel code that depends on this.
*/
#define P6_NOP1 GENERIC_NOP1
#define P6_NOP2 ".byte 0x66,0x90\n"
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 40d6586af25..efb38994859 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -24,22 +24,6 @@ static inline void load_sp0(struct tss_struct *tss,
PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread);
}
-#define ARCH_SETUP pv_init_ops.arch_setup();
-static inline unsigned long get_wallclock(void)
-{
- return PVOP_CALL0(unsigned long, pv_time_ops.get_wallclock);
-}
-
-static inline int set_wallclock(unsigned long nowtime)
-{
- return PVOP_CALL1(int, pv_time_ops.set_wallclock, nowtime);
-}
-
-static inline void (*choose_time_init(void))(void)
-{
- return pv_time_ops.time_init;
-}
-
/* The paravirtualized CPUID instruction. */
static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
unsigned int *ecx, unsigned int *edx)
@@ -245,7 +229,6 @@ static inline unsigned long long paravirt_sched_clock(void)
{
return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
}
-#define calibrate_tsc() (pv_time_ops.get_tsc_khz())
static inline unsigned long long paravirt_read_pmc(int counter)
{
@@ -363,34 +346,6 @@ static inline void slow_down_io(void)
#endif
}
-#ifdef CONFIG_X86_LOCAL_APIC
-static inline void setup_boot_clock(void)
-{
- PVOP_VCALL0(pv_apic_ops.setup_boot_clock);
-}
-
-static inline void setup_secondary_clock(void)
-{
- PVOP_VCALL0(pv_apic_ops.setup_secondary_clock);
-}
-#endif
-
-static inline void paravirt_post_allocator_init(void)
-{
- if (pv_init_ops.post_allocator_init)
- (*pv_init_ops.post_allocator_init)();
-}
-
-static inline void paravirt_pagetable_setup_start(pgd_t *base)
-{
- (*pv_mmu_ops.pagetable_setup_start)(base);
-}
-
-static inline void paravirt_pagetable_setup_done(pgd_t *base)
-{
- (*pv_mmu_ops.pagetable_setup_done)(base);
-}
-
#ifdef CONFIG_SMP
static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip,
unsigned long start_esp)
@@ -885,42 +840,22 @@ static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock)
static inline unsigned long __raw_local_save_flags(void)
{
- unsigned long f;
-
- asm volatile(paravirt_alt(PARAVIRT_CALL)
- : "=a"(f)
- : paravirt_type(pv_irq_ops.save_fl),
- paravirt_clobber(CLBR_EAX)
- : "memory", "cc");
- return f;
+ return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl);
}
static inline void raw_local_irq_restore(unsigned long f)
{
- asm volatile(paravirt_alt(PARAVIRT_CALL)
- : "=a"(f)
- : PV_FLAGS_ARG(f),
- paravirt_type(pv_irq_ops.restore_fl),
- paravirt_clobber(CLBR_EAX)
- : "memory", "cc");
+ PVOP_VCALLEE1(pv_irq_ops.restore_fl, f);
}
static inline void raw_local_irq_disable(void)
{
- asm volatile(paravirt_alt(PARAVIRT_CALL)
- :
- : paravirt_type(pv_irq_ops.irq_disable),
- paravirt_clobber(CLBR_EAX)
- : "memory", "eax", "cc");
+ PVOP_VCALLEE0(pv_irq_ops.irq_disable);
}
static inline void raw_local_irq_enable(void)
{
- asm volatile(paravirt_alt(PARAVIRT_CALL)
- :
- : paravirt_type(pv_irq_ops.irq_enable),
- paravirt_clobber(CLBR_EAX)
- : "memory", "eax", "cc");
+ PVOP_VCALLEE0(pv_irq_ops.irq_enable);
}
static inline unsigned long __raw_local_irq_save(void)
@@ -948,6 +883,8 @@ static inline unsigned long __raw_local_irq_save(void)
#undef PVOP_VCALL4
#undef PVOP_CALL4
+extern void default_banner(void);
+
#else /* __ASSEMBLY__ */
#define _PVSITE(ptype, clobbers, ops, word, algn) \
@@ -1088,5 +1025,7 @@ static inline unsigned long __raw_local_irq_save(void)
#endif /* CONFIG_X86_32 */
#endif /* __ASSEMBLY__ */
-#endif /* CONFIG_PARAVIRT */
+#else /* CONFIG_PARAVIRT */
+# define default_banner x86_init_noop
+#endif /* !CONFIG_PARAVIRT */
#endif /* _ASM_X86_PARAVIRT_H */
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 25402d0006e..9357473c8da 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -78,14 +78,6 @@ struct pv_init_ops {
*/
unsigned (*patch)(u8 type, u16 clobber, void *insnbuf,
unsigned long addr, unsigned len);
-
- /* Basic arch-specific setup */
- void (*arch_setup)(void);
- char *(*memory_setup)(void);
- void (*post_allocator_init)(void);
-
- /* Print a banner to identify the environment */
- void (*banner)(void);
};
@@ -96,12 +88,6 @@ struct pv_lazy_ops {
};
struct pv_time_ops {
- void (*time_init)(void);
-
- /* Set and set time of day */
- unsigned long (*get_wallclock)(void);
- int (*set_wallclock)(unsigned long);
-
unsigned long long (*sched_clock)(void);
unsigned long (*get_tsc_khz)(void);
};
@@ -203,8 +189,6 @@ struct pv_cpu_ops {
};
struct pv_irq_ops {
- void (*init_IRQ)(void);
-
/*
* Get/set interrupt state. save_fl and restore_fl are only
* expected to use X86_EFLAGS_IF; all other bits
@@ -229,9 +213,6 @@ struct pv_irq_ops {
struct pv_apic_ops {
#ifdef CONFIG_X86_LOCAL_APIC
- void (*setup_boot_clock)(void);
- void (*setup_secondary_clock)(void);
-
void (*startup_ipi_hook)(int phys_apicid,
unsigned long start_eip,
unsigned long start_esp);
@@ -239,15 +220,6 @@ struct pv_apic_ops {
};
struct pv_mmu_ops {
- /*
- * Called before/after init_mm pagetable setup. setup_start
- * may reset %cr3, and may pre-install parts of the pagetable;
- * pagetable setup is expected to preserve any existing
- * mapping.
- */
- void (*pagetable_setup_start)(pgd_t *pgd_base);
- void (*pagetable_setup_done)(pgd_t *pgd_base);
-
unsigned long (*read_cr2)(void);
void (*write_cr2)(unsigned long);
@@ -522,10 +494,11 @@ int paravirt_disable_iospace(void);
#define EXTRA_CLOBBERS
#define VEXTRA_CLOBBERS
#else /* CONFIG_X86_64 */
+/* [re]ax isn't an arg, but the return val */
#define PVOP_VCALL_ARGS \
unsigned long __edi = __edi, __esi = __esi, \
- __edx = __edx, __ecx = __ecx
-#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax
+ __edx = __edx, __ecx = __ecx, __eax = __eax
+#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x))
#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x))
@@ -537,6 +510,7 @@ int paravirt_disable_iospace(void);
"=c" (__ecx)
#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax)
+/* void functions are still allowed [re]ax for scratch */
#define PVOP_VCALLEE_CLOBBERS "=a" (__eax)
#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS
@@ -611,8 +585,8 @@ int paravirt_disable_iospace(void);
VEXTRA_CLOBBERS, \
pre, post, ##__VA_ARGS__)
-#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...) \
- ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \
+#define __PVOP_VCALLEESAVE(op, pre, post, ...) \
+ ____PVOP_VCALL(op.func, CLBR_RET_REG, \
PVOP_VCALLEE_CLOBBERS, , \
pre, post, ##__VA_ARGS__)
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
index 7af14e512f9..e2c1668dde7 100644
--- a/arch/x86/include/asm/pat.h
+++ b/arch/x86/include/asm/pat.h
@@ -19,4 +19,9 @@ extern int free_memtype(u64 start, u64 end);
extern int kernel_map_sync_memtype(u64 base, unsigned long size,
unsigned long flag);
+int io_reserve_memtype(resource_size_t start, resource_size_t end,
+ unsigned long *type);
+
+void io_free_memtype(resource_size_t start, resource_size_t end);
+
#endif /* _ASM_X86_PAT_H */
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 1ff685ca221..ada8c201d51 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -48,7 +48,6 @@ extern unsigned int pcibios_assign_all_busses(void);
#else
#define pcibios_assign_all_busses() 0
#endif
-#define pcibios_scan_all_fns(a, b) 0
extern unsigned long pci_mem_start;
#define PCIBIOS_MIN_IO 0x1000
@@ -144,7 +143,11 @@ static inline int __pcibus_to_node(const struct pci_bus *bus)
static inline const struct cpumask *
cpumask_of_pcibus(const struct pci_bus *bus)
{
- return cpumask_of_node(__pcibus_to_node(bus));
+ int node;
+
+ node = __pcibus_to_node(bus);
+ return (node == -1) ? cpu_online_mask :
+ cpumask_of_node(node);
}
#endif
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 04eacefcfd2..b65a36defeb 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -168,15 +168,6 @@ do { \
/* We can use this directly for local CPU (faster). */
DECLARE_PER_CPU(unsigned long, this_cpu_off);
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-void *pcpu_lpage_remapped(void *kaddr);
-#else
-static inline void *pcpu_lpage_remapped(void *kaddr)
-{
- return NULL;
-}
-#endif
-
#endif /* !__ASSEMBLY__ */
#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_event.h
index e7b7c938ae2..8d9f8548a87 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -1,8 +1,8 @@
-#ifndef _ASM_X86_PERF_COUNTER_H
-#define _ASM_X86_PERF_COUNTER_H
+#ifndef _ASM_X86_PERF_EVENT_H
+#define _ASM_X86_PERF_EVENT_H
/*
- * Performance counter hw details:
+ * Performance event hw details:
*/
#define X86_PMC_MAX_GENERIC 8
@@ -28,9 +28,20 @@
*/
#define ARCH_PERFMON_EVENT_MASK 0xffff
+/*
+ * filter mask to validate fixed counter events.
+ * the following filters disqualify for fixed counters:
+ * - inv
+ * - edge
+ * - cnt-mask
+ * The other filters are supported by fixed counters.
+ * The any-thread option is supported starting with v3.
+ */
+#define ARCH_PERFMON_EVENT_FILTER_MASK 0xff840000
+
#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
(1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
@@ -43,7 +54,7 @@
union cpuid10_eax {
struct {
unsigned int version_id:8;
- unsigned int num_counters:8;
+ unsigned int num_events:8;
unsigned int bit_width:8;
unsigned int mask_length:8;
} split;
@@ -52,7 +63,7 @@ union cpuid10_eax {
union cpuid10_edx {
struct {
- unsigned int num_counters_fixed:4;
+ unsigned int num_events_fixed:4;
unsigned int reserved:28;
} split;
unsigned int full;
@@ -60,7 +71,7 @@ union cpuid10_edx {
/*
- * Fixed-purpose performance counters:
+ * Fixed-purpose performance events:
*/
/*
@@ -87,22 +98,22 @@ union cpuid10_edx {
/*
* We model BTS tracing as another fixed-mode PMC.
*
- * We choose a value in the middle of the fixed counter range, since lower
- * values are used by actual fixed counters and higher values are used
+ * We choose a value in the middle of the fixed event range, since lower
+ * values are used by actual fixed events and higher values are used
* to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr.
*/
#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16)
-#ifdef CONFIG_PERF_COUNTERS
-extern void init_hw_perf_counters(void);
-extern void perf_counters_lapic_init(void);
+#ifdef CONFIG_PERF_EVENTS
+extern void init_hw_perf_events(void);
+extern void perf_events_lapic_init(void);
-#define PERF_COUNTER_INDEX_OFFSET 0
+#define PERF_EVENT_INDEX_OFFSET 0
#else
-static inline void init_hw_perf_counters(void) { }
-static inline void perf_counters_lapic_init(void) { }
+static inline void init_hw_perf_events(void) { }
+static inline void perf_events_lapic_init(void) { }
#endif
-#endif /* _ASM_X86_PERF_COUNTER_H */
+#endif /* _ASM_X86_PERF_EVENT_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 4c5b51fdc78..af6fd360ab3 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -56,16 +56,6 @@ extern struct list_head pgd_list;
#define pte_update(mm, addr, ptep) do { } while (0)
#define pte_update_defer(mm, addr, ptep) do { } while (0)
-static inline void __init paravirt_pagetable_setup_start(pgd_t *base)
-{
- native_pagetable_setup_start(base);
-}
-
-static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
-{
- native_pagetable_setup_done(base);
-}
-
#define pgd_val(x) native_pgd_val(x)
#define __pgd(x) native_make_pgd(x)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 54cb697f490..d1f4a760be2 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -277,6 +277,7 @@ static inline pteval_t pte_flags(pte_t pte)
typedef struct page *pgtable_t;
extern pteval_t __supported_pte_mask;
+extern void set_nx(void);
extern int nx_enabled;
#define pgprot_writecombine pgprot_writecombine
@@ -299,8 +300,8 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pte);
extern void native_pagetable_setup_start(pgd_t *base);
extern void native_pagetable_setup_done(pgd_t *base);
#else
-static inline void native_pagetable_setup_start(pgd_t *base) {}
-static inline void native_pagetable_setup_done(pgd_t *base) {}
+#define native_pagetable_setup_start x86_init_pgd_noop
+#define native_pagetable_setup_done x86_init_pgd_noop
#endif
struct seq_file;
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index e08ea043e08..6f8ec1c37e0 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -27,8 +27,10 @@ struct mm_struct;
#include <linux/cpumask.h>
#include <linux/cache.h>
#include <linux/threads.h>
+#include <linux/math64.h>
#include <linux/init.h>
+#define HBP_NUM 4
/*
* Default implementation of macro that returns current
* instruction pointer ("program counter").
@@ -421,6 +423,8 @@ extern unsigned int xstate_size;
extern void free_thread_xstate(struct task_struct *);
extern struct kmem_cache *task_xstate_cachep;
+struct perf_event;
+
struct thread_struct {
/* Cached TLS descriptors: */
struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
@@ -442,13 +446,10 @@ struct thread_struct {
unsigned long fs;
#endif
unsigned long gs;
- /* Hardware debugging registers: */
- unsigned long debugreg0;
- unsigned long debugreg1;
- unsigned long debugreg2;
- unsigned long debugreg3;
- unsigned long debugreg6;
- unsigned long debugreg7;
+ /* Save middle states of ptrace breakpoints */
+ struct perf_event *ptrace_bps[HBP_NUM];
+ /* Debug status used for traps, single steps, etc... */
+ unsigned long debugreg6;
/* Fault info: */
unsigned long cr2;
unsigned long trap_no;
@@ -999,7 +1000,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
-#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
+extern unsigned long KSTK_ESP(struct task_struct *task);
#endif /* CONFIG_X86_64 */
extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
@@ -1020,4 +1021,35 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
extern int get_tsc_mode(unsigned long adr);
extern int set_tsc_mode(unsigned int val);
+extern int amd_get_nb_id(int cpu);
+
+struct aperfmperf {
+ u64 aperf, mperf;
+};
+
+static inline void get_aperfmperf(struct aperfmperf *am)
+{
+ WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF));
+
+ rdmsrl(MSR_IA32_APERF, am->aperf);
+ rdmsrl(MSR_IA32_MPERF, am->mperf);
+}
+
+#define APERFMPERF_SHIFT 10
+
+static inline
+unsigned long calc_aperfmperf_ratio(struct aperfmperf *old,
+ struct aperfmperf *new)
+{
+ u64 aperf = new->aperf - old->aperf;
+ u64 mperf = new->mperf - old->mperf;
+ unsigned long ratio = aperf;
+
+ mperf >>= APERFMPERF_SHIFT;
+ if (mperf)
+ ratio = div64_u64(aperf, mperf);
+
+ return ratio;
+}
+
#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 0f0d908349a..3d11fd0f44c 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -7,6 +7,7 @@
#ifdef __KERNEL__
#include <asm/segment.h>
+#include <asm/page_types.h>
#endif
#ifndef __ASSEMBLY__
@@ -216,6 +217,67 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs)
return regs->sp;
}
+/* Query offset/name of register from its name/offset */
+extern int regs_query_register_offset(const char *name);
+extern const char *regs_query_register_name(unsigned int offset);
+#define MAX_REG_OFFSET (offsetof(struct pt_regs, ss))
+
+/**
+ * regs_get_register() - get register value from its offset
+ * @regs: pt_regs from which register value is gotten.
+ * @offset: offset number of the register.
+ *
+ * regs_get_register returns the value of a register. The @offset is the
+ * offset of the register in struct pt_regs address which specified by @regs.
+ * If @offset is bigger than MAX_REG_OFFSET, this returns 0.
+ */
+static inline unsigned long regs_get_register(struct pt_regs *regs,
+ unsigned int offset)
+{
+ if (unlikely(offset > MAX_REG_OFFSET))
+ return 0;
+ return *(unsigned long *)((unsigned long)regs + offset);
+}
+
+/**
+ * regs_within_kernel_stack() - check the address in the stack
+ * @regs: pt_regs which contains kernel stack pointer.
+ * @addr: address which is checked.
+ *
+ * regs_within_kernel_stack() checks @addr is within the kernel stack page(s).
+ * If @addr is within the kernel stack, it returns true. If not, returns false.
+ */
+static inline int regs_within_kernel_stack(struct pt_regs *regs,
+ unsigned long addr)
+{
+ return ((addr & ~(THREAD_SIZE - 1)) ==
+ (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1)));
+}
+
+/**
+ * regs_get_kernel_stack_nth() - get Nth entry of the stack
+ * @regs: pt_regs which contains kernel stack pointer.
+ * @n: stack entry number.
+ *
+ * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which
+ * is specified by @regs. If the @n th entry is NOT in the kernel stack,
+ * this returns 0.
+ */
+static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
+ unsigned int n)
+{
+ unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs);
+ addr += n;
+ if (regs_within_kernel_stack(regs, (unsigned long)addr))
+ return *addr;
+ else
+ return 0;
+}
+
+/* Get Nth argument at function call */
+extern unsigned long regs_get_argument_nth(struct pt_regs *regs,
+ unsigned int n);
+
/*
* These are defined as per linux/ptrace.h, which see.
*/
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 4093d1ed6db..18e496c98ff 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -5,43 +5,6 @@
#define COMMAND_LINE_SIZE 2048
-#ifndef __ASSEMBLY__
-
-/*
- * Any setup quirks to be performed?
- */
-struct mpc_cpu;
-struct mpc_bus;
-struct mpc_oemtable;
-
-struct x86_quirks {
- int (*arch_pre_time_init)(void);
- int (*arch_time_init)(void);
- int (*arch_pre_intr_init)(void);
- int (*arch_intr_init)(void);
- int (*arch_trap_init)(void);
- char * (*arch_memory_setup)(void);
- int (*mach_get_smp_config)(unsigned int early);
- int (*mach_find_smp_config)(unsigned int reserve);
-
- int *mpc_record;
- int (*mpc_apic_id)(struct mpc_cpu *m);
- void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name);
- void (*mpc_oem_pci_bus)(struct mpc_bus *m);
- void (*smp_read_mpc_oem)(struct mpc_oemtable *oemtable,
- unsigned short oemsize);
- int (*setup_ioapic_ids)(void);
-};
-
-extern void x86_quirk_intr_init(void);
-
-extern void x86_quirk_trap_init(void);
-
-extern void x86_quirk_pre_time_init(void);
-extern void x86_quirk_time_init(void);
-
-#endif /* __ASSEMBLY__ */
-
#ifdef __i386__
#include <linux/pfn.h>
@@ -61,6 +24,7 @@ extern void x86_quirk_time_init(void);
#ifndef __ASSEMBLY__
#include <asm/bootparam.h>
+#include <asm/x86_init.h>
/* Interrupt control for vSMPowered x86_64 systems */
#ifdef CONFIG_X86_64
@@ -79,11 +43,16 @@ static inline void visws_early_detect(void) { }
static inline int is_visws_box(void) { return 0; }
#endif
-extern struct x86_quirks *x86_quirks;
extern unsigned long saved_video_mode;
-#ifndef CONFIG_PARAVIRT
-#define paravirt_post_allocator_init() do {} while (0)
+extern void reserve_standard_io_resources(void);
+extern void i386_reserve_resources(void);
+extern void setup_default_timer_irq(void);
+
+#ifdef CONFIG_X86_MRST
+extern void x86_mrst_early_setup(void);
+#else
+static inline void x86_mrst_early_setup(void) { }
#endif
#ifndef _SETUP
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 6a84ed166ae..1e796782cd7 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -121,7 +121,6 @@ static inline void arch_send_call_function_single_ipi(int cpu)
smp_ops.send_call_func_single_ipi(cpu);
}
-#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask
static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
{
smp_ops.send_call_func_ipi(mask);
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index c86f452256d..3d3e8353ee5 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -65,7 +65,6 @@ static __always_inline void *__constant_memcpy(void *to, const void *from,
case 4:
*(int *)to = *(int *)from;
return to;
-
case 3:
*(short *)to = *(short *)from;
*((char *)to + 2) = *((char *)from + 2);
@@ -178,10 +177,15 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len)
*/
#ifndef CONFIG_KMEMCHECK
+
+#if (__GNUC__ >= 4)
+#define memcpy(t, f, n) __builtin_memcpy(t, f, n)
+#else
#define memcpy(t, f, n) \
(__builtin_constant_p((n)) \
? __constant_memcpy((t), (f), (n)) \
: __memcpy((t), (f), (n)))
+#endif
#else
/*
* kmemcheck becomes very happy if we use the REP instructions unconditionally,
@@ -317,11 +321,15 @@ void *__constant_c_and_count_memset(void *s, unsigned long pattern,
: __memset_generic((s), (c), (count)))
#define __HAVE_ARCH_MEMSET
+#if (__GNUC__ >= 4)
+#define memset(s, c, count) __builtin_memset(s, c, count)
+#else
#define memset(s, c, count) \
(__builtin_constant_p(c) \
? __constant_c_x_memset((s), (0x01010101UL * (unsigned char)(c)), \
(count)) \
: __memset((s), (c), (count)))
+#endif
/*
* find the first occurrence of byte 'c', or 1 past the area if none
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
index b9e4e20174f..87ffcb12a1b 100644
--- a/arch/x86/include/asm/swiotlb.h
+++ b/arch/x86/include/asm/swiotlb.h
@@ -3,17 +3,14 @@
#include <linux/swiotlb.h>
-/* SWIOTLB interface */
-
-extern int swiotlb_force;
-
#ifdef CONFIG_SWIOTLB
extern int swiotlb;
-extern void pci_swiotlb_init(void);
+extern int pci_swiotlb_init(void);
#else
#define swiotlb 0
-static inline void pci_swiotlb_init(void)
+static inline int pci_swiotlb_init(void)
{
+ return 0;
}
#endif
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index d82f39bb790..8d33bc5462d 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -1,7 +1,7 @@
/*
* Access to user system call parameters and results
*
- * Copyright (C) 2008 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2008-2009 Red Hat, Inc. All rights reserved.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
@@ -16,13 +16,13 @@
#include <linux/sched.h>
#include <linux/err.h>
-static inline long syscall_get_nr(struct task_struct *task,
- struct pt_regs *regs)
+/*
+ * Only the low 32 bits of orig_ax are meaningful, so we return int.
+ * This importantly ignores the high bits on 64-bit, so comparisons
+ * sign-extend the low 32 bits.
+ */
+static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs)
{
- /*
- * We always sign-extend a -1 value being set here,
- * so this is always either -1L or a syscall number.
- */
return regs->orig_ax;
}
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index f08f9737489..022a84386de 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -128,8 +128,6 @@ do { \
"movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
"movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
"call __switch_to\n\t" \
- ".globl thread_return\n" \
- "thread_return:\n\t" \
"movq "__percpu_arg([current_task])",%%rsi\n\t" \
__switch_canary \
"movq %P[thread_info](%%rsi),%%r8\n\t" \
@@ -157,19 +155,22 @@ extern void native_load_gs_index(unsigned);
* Load a segment. Fall back on loading the zero
* segment if something goes wrong..
*/
-#define loadsegment(seg, value) \
- asm volatile("\n" \
- "1:\t" \
- "movl %k0,%%" #seg "\n" \
- "2:\n" \
- ".section .fixup,\"ax\"\n" \
- "3:\t" \
- "movl %k1, %%" #seg "\n\t" \
- "jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE(1b,3b) \
- : :"r" (value), "r" (0) : "memory")
-
+#define loadsegment(seg, value) \
+do { \
+ unsigned short __val = (value); \
+ \
+ asm volatile(" \n" \
+ "1: movl %k0,%%" #seg " \n" \
+ \
+ ".section .fixup,\"ax\" \n" \
+ "2: xorl %k0,%k0 \n" \
+ " jmp 1b \n" \
+ ".previous \n" \
+ \
+ _ASM_EXTABLE(1b, 2b) \
+ \
+ : "+r" (__val) : : "memory"); \
+} while (0)
/*
* Save a segment register away
diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h
index 50c733aac42..7bdec4e9b73 100644
--- a/arch/x86/include/asm/time.h
+++ b/arch/x86/include/asm/time.h
@@ -4,60 +4,7 @@
extern void hpet_time_init(void);
#include <asm/mc146818rtc.h>
-#ifdef CONFIG_X86_32
-#include <linux/efi.h>
-
-static inline unsigned long native_get_wallclock(void)
-{
- unsigned long retval;
-
- if (efi_enabled)
- retval = efi_get_time();
- else
- retval = mach_get_cmos_time();
-
- return retval;
-}
-
-static inline int native_set_wallclock(unsigned long nowtime)
-{
- int retval;
-
- if (efi_enabled)
- retval = efi_set_rtc_mmss(nowtime);
- else
- retval = mach_set_rtc_mmss(nowtime);
-
- return retval;
-}
-
-#else
-extern void native_time_init_hook(void);
-
-static inline unsigned long native_get_wallclock(void)
-{
- return mach_get_cmos_time();
-}
-
-static inline int native_set_wallclock(unsigned long nowtime)
-{
- return mach_set_rtc_mmss(nowtime);
-}
-
-#endif
extern void time_init(void);
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#else /* !CONFIG_PARAVIRT */
-
-#define get_wallclock() native_get_wallclock()
-#define set_wallclock(x) native_set_wallclock(x)
-#define choose_time_init() hpet_time_init
-
-#endif /* CONFIG_PARAVIRT */
-
-extern unsigned long __init calibrate_cpu(void);
-
#endif /* _ASM_X86_TIME_H */
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 20ca9c4d468..5469630b27f 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -8,20 +8,16 @@
#define TICK_SIZE (tick_nsec / 1000)
unsigned long long native_sched_clock(void);
-unsigned long native_calibrate_tsc(void);
+extern int recalibrate_cpu_khz(void);
-#ifdef CONFIG_X86_32
+#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
extern int timer_ack;
-extern irqreturn_t timer_interrupt(int irq, void *dev_id);
-#endif /* CONFIG_X86_32 */
-extern int recalibrate_cpu_khz(void);
+#else
+# define timer_ack (0)
+#endif
extern int no_timer_check;
-#ifndef CONFIG_PARAVIRT
-#define calibrate_tsc() native_calibrate_tsc()
-#endif
-
/* Accelerators for sched_clock()
* convert from cycles(64bits) => nanoseconds (64bits)
* basic equation:
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 26d06e052a1..40e37b10c6c 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -116,15 +116,11 @@ extern unsigned long node_remap_size[];
# define SD_CACHE_NICE_TRIES 1
# define SD_IDLE_IDX 1
-# define SD_NEWIDLE_IDX 2
-# define SD_FORKEXEC_IDX 0
#else
# define SD_CACHE_NICE_TRIES 2
# define SD_IDLE_IDX 2
-# define SD_NEWIDLE_IDX 2
-# define SD_FORKEXEC_IDX 1
#endif
@@ -137,22 +133,21 @@ extern unsigned long node_remap_size[];
.cache_nice_tries = SD_CACHE_NICE_TRIES, \
.busy_idx = 3, \
.idle_idx = SD_IDLE_IDX, \
- .newidle_idx = SD_NEWIDLE_IDX, \
- .wake_idx = 1, \
- .forkexec_idx = SD_FORKEXEC_IDX, \
+ .newidle_idx = 0, \
+ .wake_idx = 0, \
+ .forkexec_idx = 0, \
\
.flags = 1*SD_LOAD_BALANCE \
| 1*SD_BALANCE_NEWIDLE \
| 1*SD_BALANCE_EXEC \
| 1*SD_BALANCE_FORK \
- | 0*SD_WAKE_IDLE \
+ | 0*SD_BALANCE_WAKE \
| 1*SD_WAKE_AFFINE \
- | 1*SD_WAKE_BALANCE \
+ | 0*SD_PREFER_LOCAL \
| 0*SD_SHARE_CPUPOWER \
| 0*SD_POWERSAVINGS_BALANCE \
| 0*SD_SHARE_PKG_RESOURCES \
| 1*SD_SERIALIZE \
- | 1*SD_WAKE_IDLE_FAR \
| 0*SD_PREFER_SIBLING \
, \
.last_balance = jiffies, \
@@ -171,21 +166,11 @@ static inline int numa_node_id(void)
return 0;
}
-static inline int cpu_to_node(int cpu)
-{
- return 0;
-}
-
static inline int early_cpu_to_node(int cpu)
{
return 0;
}
-static inline const struct cpumask *cpumask_of_node(int node)
-{
- return cpu_online_mask;
-}
-
static inline void setup_node_to_cpumask_map(void) { }
#endif
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 38ae163cc91..c0427295e8f 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -48,7 +48,8 @@ static __always_inline cycles_t vget_cycles(void)
extern void tsc_init(void);
extern void mark_tsc_unstable(char *reason);
extern int unsynchronized_tsc(void);
-int check_tsc_unstable(void);
+extern int check_tsc_unstable(void);
+extern unsigned long native_calibrate_tsc(void);
/*
* Boot-time check whether the TSCs are synchronized across
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index d2c6c930b49..abd3e0ea762 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -570,7 +570,6 @@ extern struct movsl_mask {
#ifdef CONFIG_X86_32
# include "uaccess_32.h"
#else
-# define ARCH_HAS_SEARCH_EXTABLE
# include "uaccess_64.h"
#endif
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 5e06259e90e..0c9825e97f3 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -33,7 +33,7 @@ unsigned long __must_check __copy_from_user_ll_nocache_nozero
* Copy data from kernel space to user space. Caller must check
* the specified block with access_ok() before calling this function.
* The caller should also make sure he pins the user space address
- * so that the we don't result in page fault and sleep.
+ * so that we don't result in page fault and sleep.
*
* Here we special-case 1, 2 and 4-byte copy_*_user invocations. On a fault
* we return the initial request size (1, 2 or 4), as copy_*_user should do.
@@ -187,9 +187,34 @@ __copy_from_user_inatomic_nocache(void *to, const void __user *from,
unsigned long __must_check copy_to_user(void __user *to,
const void *from, unsigned long n);
-unsigned long __must_check copy_from_user(void *to,
+unsigned long __must_check _copy_from_user(void *to,
const void __user *from,
unsigned long n);
+
+
+extern void copy_from_user_overflow(void)
+#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS
+ __compiletime_error("copy_from_user() buffer size is not provably correct")
+#else
+ __compiletime_warning("copy_from_user() buffer size is not provably correct")
+#endif
+;
+
+static inline unsigned long __must_check copy_from_user(void *to,
+ const void __user *from,
+ unsigned long n)
+{
+ int sz = __compiletime_object_size(to);
+ int ret = -EFAULT;
+
+ if (likely(sz == -1 || sz >= n))
+ ret = _copy_from_user(to, from, n);
+ else
+ copy_from_user_overflow();
+
+ return ret;
+}
+
long __must_check strncpy_from_user(char *dst, const char __user *src,
long count);
long __must_check __strncpy_from_user(char *dst,
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index db24b215fc5..46324c6a4f6 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -19,12 +19,37 @@ __must_check unsigned long
copy_user_generic(void *to, const void *from, unsigned len);
__must_check unsigned long
-copy_to_user(void __user *to, const void *from, unsigned len);
+_copy_to_user(void __user *to, const void *from, unsigned len);
__must_check unsigned long
-copy_from_user(void *to, const void __user *from, unsigned len);
+_copy_from_user(void *to, const void __user *from, unsigned len);
__must_check unsigned long
copy_in_user(void __user *to, const void __user *from, unsigned len);
+static inline unsigned long __must_check copy_from_user(void *to,
+ const void __user *from,
+ unsigned long n)
+{
+ int sz = __compiletime_object_size(to);
+ int ret = -EFAULT;
+
+ might_fault();
+ if (likely(sz == -1 || sz >= n))
+ ret = _copy_from_user(to, from, n);
+#ifdef CONFIG_DEBUG_VM
+ else
+ WARN(1, "Buffer overflow detected!\n");
+#endif
+ return ret;
+}
+
+static __always_inline __must_check
+int copy_to_user(void __user *dst, const void *src, unsigned size)
+{
+ might_fault();
+
+ return _copy_to_user(dst, src, size);
+}
+
static __always_inline __must_check
int __copy_from_user(void *dst, const void __user *src, unsigned size)
{
@@ -176,8 +201,11 @@ __must_check long strlen_user(const char __user *str);
__must_check unsigned long clear_user(void __user *mem, unsigned long len);
__must_check unsigned long __clear_user(void __user *mem, unsigned long len);
-__must_check long __copy_from_user_inatomic(void *dst, const void __user *src,
- unsigned size);
+static __must_check __always_inline int
+__copy_from_user_inatomic(void *dst, const void __user *src, unsigned size)
+{
+ return copy_user_generic(dst, (__force const void *)src, size);
+}
static __must_check __always_inline int
__copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 8deaada61bc..6fb3c209a7e 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -341,7 +341,7 @@
#define __NR_preadv 333
#define __NR_pwritev 334
#define __NR_rt_tgsigqueueinfo 335
-#define __NR_perf_counter_open 336
+#define __NR_perf_event_open 336
#ifdef __KERNEL__
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index b9f3c60de5f..8d3ad0adbc6 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -659,8 +659,8 @@ __SYSCALL(__NR_preadv, sys_preadv)
__SYSCALL(__NR_pwritev, sys_pwritev)
#define __NR_rt_tgsigqueueinfo 297
__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
-#define __NR_perf_counter_open 298
-__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
+#define __NR_perf_event_open 298
+__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 77a68505419..d1414af9855 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -15,9 +15,12 @@
#include <linux/numa.h>
#include <linux/percpu.h>
#include <linux/timer.h>
+#include <linux/io.h>
#include <asm/types.h>
#include <asm/percpu.h>
#include <asm/uv/uv_mmrs.h>
+#include <asm/irq_vectors.h>
+#include <asm/io_apic.h>
/*
@@ -113,7 +116,7 @@
/*
* The largest possible NASID of a C or M brick (+ 2)
*/
-#define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_NODES * 2)
+#define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_BLADES * 2)
struct uv_scir_s {
struct timer_list timer;
@@ -229,6 +232,20 @@ static inline unsigned long uv_gpa(void *v)
return uv_soc_phys_ram_to_gpa(__pa(v));
}
+/* gnode -> pnode */
+static inline unsigned long uv_gpa_to_gnode(unsigned long gpa)
+{
+ return gpa >> uv_hub_info->m_val;
+}
+
+/* gpa -> pnode */
+static inline int uv_gpa_to_pnode(unsigned long gpa)
+{
+ unsigned long n_mask = (1UL << uv_hub_info->n_val) - 1;
+
+ return uv_gpa_to_gnode(gpa) & n_mask;
+}
+
/* pnode, offset --> socket virtual */
static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset)
{
@@ -258,13 +275,13 @@ static inline unsigned long *uv_global_mmr32_address(int pnode,
static inline void uv_write_global_mmr32(int pnode, unsigned long offset,
unsigned long val)
{
- *uv_global_mmr32_address(pnode, offset) = val;
+ writeq(val, uv_global_mmr32_address(pnode, offset));
}
static inline unsigned long uv_read_global_mmr32(int pnode,
unsigned long offset)
{
- return *uv_global_mmr32_address(pnode, offset);
+ return readq(uv_global_mmr32_address(pnode, offset));
}
/*
@@ -281,13 +298,13 @@ static inline unsigned long *uv_global_mmr64_address(int pnode,
static inline void uv_write_global_mmr64(int pnode, unsigned long offset,
unsigned long val)
{
- *uv_global_mmr64_address(pnode, offset) = val;
+ writeq(val, uv_global_mmr64_address(pnode, offset));
}
static inline unsigned long uv_read_global_mmr64(int pnode,
unsigned long offset)
{
- return *uv_global_mmr64_address(pnode, offset);
+ return readq(uv_global_mmr64_address(pnode, offset));
}
/*
@@ -301,22 +318,22 @@ static inline unsigned long *uv_local_mmr_address(unsigned long offset)
static inline unsigned long uv_read_local_mmr(unsigned long offset)
{
- return *uv_local_mmr_address(offset);
+ return readq(uv_local_mmr_address(offset));
}
static inline void uv_write_local_mmr(unsigned long offset, unsigned long val)
{
- *uv_local_mmr_address(offset) = val;
+ writeq(val, uv_local_mmr_address(offset));
}
static inline unsigned char uv_read_local_mmr8(unsigned long offset)
{
- return *((unsigned char *)uv_local_mmr_address(offset));
+ return readb(uv_local_mmr_address(offset));
}
static inline void uv_write_local_mmr8(unsigned long offset, unsigned char val)
{
- *((unsigned char *)uv_local_mmr_address(offset)) = val;
+ writeb(val, uv_local_mmr_address(offset));
}
/*
@@ -420,9 +437,14 @@ static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value)
static inline void uv_hub_send_ipi(int pnode, int apicid, int vector)
{
unsigned long val;
+ unsigned long dmode = dest_Fixed;
+
+ if (vector == NMI_VECTOR)
+ dmode = dest_NMI;
val = (1UL << UVH_IPI_INT_SEND_SHFT) |
- ((apicid & 0x3f) << UVH_IPI_INT_APIC_ID_SHFT) |
+ ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) |
+ (dmode << UVH_IPI_INT_DELIVERY_MODE_SHFT) |
(vector << UVH_IPI_INT_VECTOR_SHFT);
uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
}
diff --git a/arch/x86/include/asm/uv/uv_irq.h b/arch/x86/include/asm/uv/uv_irq.h
index 9613c8c0b64..d6b17c76062 100644
--- a/arch/x86/include/asm/uv/uv_irq.h
+++ b/arch/x86/include/asm/uv/uv_irq.h
@@ -25,12 +25,14 @@ struct uv_IO_APIC_route_entry {
dest : 32;
};
-extern struct irq_chip uv_irq_chip;
-
-extern int arch_enable_uv_irq(char *, unsigned int, int, int, unsigned long);
-extern void arch_disable_uv_irq(int, unsigned long);
+enum {
+ UV_AFFINITY_ALL,
+ UV_AFFINITY_NODE,
+ UV_AFFINITY_CPU
+};
-extern int uv_setup_irq(char *, int, int, unsigned long);
-extern void uv_teardown_irq(unsigned int, int, unsigned long);
+extern int uv_irq_2_mmr_info(int, unsigned long *, int *);
+extern int uv_setup_irq(char *, int, int, unsigned long, int);
+extern void uv_teardown_irq(unsigned int);
#endif /* _ASM_X86_UV_UV_IRQ_H */
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index dc27a69e5d2..3d61e204826 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -21,6 +21,7 @@ struct vsyscall_gtod_data {
u32 shift;
} clock;
struct timespec wall_to_monotonic;
+ struct timespec wall_time_coarse;
};
extern struct vsyscall_gtod_data __vsyscall_gtod_data
__section_vsyscall_gtod_data;
diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h
index c11b7e100d8..e49ed6d2fd4 100644
--- a/arch/x86/include/asm/vmware.h
+++ b/arch/x86/include/asm/vmware.h
@@ -20,7 +20,7 @@
#ifndef ASM_X86__VMWARE_H
#define ASM_X86__VMWARE_H
-extern unsigned long vmware_get_tsc_khz(void);
+extern void vmware_platform_setup(void);
extern int vmware_platform(void);
extern void vmware_set_feature_bits(struct cpuinfo_x86 *c);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 11be5ad2e0e..272514c2d45 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -55,6 +55,7 @@
#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
+#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
#define PIN_BASED_EXT_INTR_MASK 0x00000001
@@ -351,9 +352,16 @@ enum vmcs_field {
#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0
#define VMX_EPT_EXTENT_CONTEXT 1
#define VMX_EPT_EXTENT_GLOBAL 2
+
+#define VMX_EPT_EXECUTE_ONLY_BIT (1ull)
+#define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6)
+#define VMX_EPTP_UC_BIT (1ull << 8)
+#define VMX_EPTP_WB_BIT (1ull << 14)
+#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24)
#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
+
#define VMX_EPT_DEFAULT_GAW 3
#define VMX_EPT_MAX_GAW 0x4
#define VMX_EPT_MT_EPTE_SHIFT 3
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
new file mode 100644
index 00000000000..d8e71459f02
--- /dev/null
+++ b/arch/x86/include/asm/x86_init.h
@@ -0,0 +1,143 @@
+#ifndef _ASM_X86_PLATFORM_H
+#define _ASM_X86_PLATFORM_H
+
+#include <asm/pgtable_types.h>
+#include <asm/bootparam.h>
+
+struct mpc_bus;
+struct mpc_cpu;
+struct mpc_table;
+
+/**
+ * struct x86_init_mpparse - platform specific mpparse ops
+ * @mpc_record: platform specific mpc record accounting
+ * @setup_ioapic_ids: platform specific ioapic id override
+ * @mpc_apic_id: platform specific mpc apic id assignment
+ * @smp_read_mpc_oem: platform specific oem mpc table setup
+ * @mpc_oem_pci_bus: platform specific pci bus setup (default NULL)
+ * @mpc_oem_bus_info: platform specific mpc bus info
+ * @find_smp_config: find the smp configuration
+ * @get_smp_config: get the smp configuration
+ */
+struct x86_init_mpparse {
+ void (*mpc_record)(unsigned int mode);
+ void (*setup_ioapic_ids)(void);
+ int (*mpc_apic_id)(struct mpc_cpu *m);
+ void (*smp_read_mpc_oem)(struct mpc_table *mpc);
+ void (*mpc_oem_pci_bus)(struct mpc_bus *m);
+ void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name);
+ void (*find_smp_config)(unsigned int reserve);
+ void (*get_smp_config)(unsigned int early);
+};
+
+/**
+ * struct x86_init_resources - platform specific resource related ops
+ * @probe_roms: probe BIOS roms
+ * @reserve_resources: reserve the standard resources for the
+ * platform
+ * @memory_setup: platform specific memory setup
+ *
+ */
+struct x86_init_resources {
+ void (*probe_roms)(void);
+ void (*reserve_resources)(void);
+ char *(*memory_setup)(void);
+};
+
+/**
+ * struct x86_init_irqs - platform specific interrupt setup
+ * @pre_vector_init: init code to run before interrupt vectors
+ * are set up.
+ * @intr_init: interrupt init code
+ * @trap_init: platform specific trap setup
+ */
+struct x86_init_irqs {
+ void (*pre_vector_init)(void);
+ void (*intr_init)(void);
+ void (*trap_init)(void);
+};
+
+/**
+ * struct x86_init_oem - oem platform specific customizing functions
+ * @arch_setup: platform specific architecure setup
+ * @banner: print a platform specific banner
+ */
+struct x86_init_oem {
+ void (*arch_setup)(void);
+ void (*banner)(void);
+};
+
+/**
+ * struct x86_init_paging - platform specific paging functions
+ * @pagetable_setup_start: platform specific pre paging_init() call
+ * @pagetable_setup_done: platform specific post paging_init() call
+ */
+struct x86_init_paging {
+ void (*pagetable_setup_start)(pgd_t *base);
+ void (*pagetable_setup_done)(pgd_t *base);
+};
+
+/**
+ * struct x86_init_timers - platform specific timer setup
+ * @setup_perpcu_clockev: set up the per cpu clock event device for the
+ * boot cpu
+ * @tsc_pre_init: platform function called before TSC init
+ * @timer_init: initialize the platform timer (default PIT/HPET)
+ */
+struct x86_init_timers {
+ void (*setup_percpu_clockev)(void);
+ void (*tsc_pre_init)(void);
+ void (*timer_init)(void);
+};
+
+/**
+ * struct x86_init_iommu - platform specific iommu setup
+ * @iommu_init: platform specific iommu setup
+ */
+struct x86_init_iommu {
+ int (*iommu_init)(void);
+};
+
+/**
+ * struct x86_init_ops - functions for platform specific setup
+ *
+ */
+struct x86_init_ops {
+ struct x86_init_resources resources;
+ struct x86_init_mpparse mpparse;
+ struct x86_init_irqs irqs;
+ struct x86_init_oem oem;
+ struct x86_init_paging paging;
+ struct x86_init_timers timers;
+ struct x86_init_iommu iommu;
+};
+
+/**
+ * struct x86_cpuinit_ops - platform specific cpu hotplug setups
+ * @setup_percpu_clockev: set up the per cpu clock event device
+ */
+struct x86_cpuinit_ops {
+ void (*setup_percpu_clockev)(void);
+};
+
+/**
+ * struct x86_platform_ops - platform specific runtime functions
+ * @calibrate_tsc: calibrate TSC
+ * @get_wallclock: get time from HW clock like RTC etc.
+ * @set_wallclock: set time back to HW clock
+ */
+struct x86_platform_ops {
+ unsigned long (*calibrate_tsc)(void);
+ unsigned long (*get_wallclock)(void);
+ int (*set_wallclock)(unsigned long nowtime);
+ void (*iommu_shutdown)(void);
+};
+
+extern struct x86_init_ops x86_init;
+extern struct x86_cpuinit_ops x86_cpuinit;
+extern struct x86_platform_ops x86_platform;
+
+extern void x86_init_noop(void);
+extern void x86_init_uint_noop(unsigned int unused);
+
+#endif
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 430d5b24af7..4f2e66e29ec 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -31,8 +31,8 @@ GCOV_PROFILE_paravirt.o := n
obj-y := process_$(BITS).o signal.o entry_$(BITS).o
obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
-obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o
-obj-y += setup.o i8259.o irqinit.o
+obj-y += time.o ioport.o ldt.o dumpstack.o
+obj-y += setup.o x86_init.o i8259.o irqinit.o
obj-$(CONFIG_X86_VISWS) += visws_quirks.o
obj-$(CONFIG_X86_32) += probe_roms_32.o
obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
@@ -40,7 +40,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
obj-y += bootflag.o e820.o
obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
-obj-y += alternative.o i8253.o pci-nommu.o
+obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
obj-y += tsc.o io_delay.o rtc.o
obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
@@ -52,9 +52,11 @@ obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o
obj-$(CONFIG_X86_32) += tls.o
obj-$(CONFIG_IA32_EMULATION) += tls.o
obj-y += step.o
+obj-$(CONFIG_INTEL_TXT) += tboot.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += cpu/
obj-y += acpi/
+obj-$(CONFIG_SFI) += sfi.o
obj-y += reboot.o
obj-$(CONFIG_MCA) += mca_32.o
obj-$(CONFIG_X86_MSR) += msr.o
@@ -104,6 +106,7 @@ obj-$(CONFIG_SCx200) += scx200.o
scx200-y += scx200_32.o
obj-$(CONFIG_OLPC) += olpc.o
+obj-$(CONFIG_X86_MRST) += mrst.o
microcode-y := microcode_core.o
microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 8c44c232efc..59cdfa4686b 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -48,7 +48,7 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
* P4, Core and beyond CPUs
*/
if (c->x86_vendor == X86_VENDOR_INTEL &&
- (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 14)))
+ (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 14)))
flags->bm_control = 0;
}
EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c
index d296f4a195c..d85d1b2432b 100644
--- a/arch/x86/kernel/acpi/processor.c
+++ b/arch/x86/kernel/acpi/processor.c
@@ -79,7 +79,8 @@ void arch_acpi_processor_init_pdc(struct acpi_processor *pr)
struct cpuinfo_x86 *c = &cpu_data(pr->id);
pr->pdc = NULL;
- if (c->x86_vendor == X86_VENDOR_INTEL)
+ if (c->x86_vendor == X86_VENDOR_INTEL ||
+ c->x86_vendor == X86_VENDOR_CENTAUR)
init_intel_pdc(pr, c);
return;
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
index 7da00b799cd..060fff8f5c5 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
@@ -57,5 +57,8 @@ SECTIONS
*(.note*)
}
+ /*
+ * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
+ */
. = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!");
}
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 98f230f6a28..32fb09102a1 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
* Author: Joerg Roedel <joerg.roedel@amd.com>
* Leo Duran <leo.duran@amd.com>
*
@@ -28,6 +28,7 @@
#include <asm/proto.h>
#include <asm/iommu.h>
#include <asm/gart.h>
+#include <asm/amd_iommu_proto.h>
#include <asm/amd_iommu_types.h>
#include <asm/amd_iommu.h>
@@ -56,20 +57,115 @@ struct iommu_cmd {
u32 data[4];
};
-static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
- struct unity_map_entry *e);
-static struct dma_ops_domain *find_protection_domain(u16 devid);
-static u64 *alloc_pte(struct protection_domain *domain,
- unsigned long address, int end_lvl,
- u64 **pte_page, gfp_t gfp);
-static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
- unsigned long start_page,
- unsigned int pages);
static void reset_iommu_command_buffer(struct amd_iommu *iommu);
-static u64 *fetch_pte(struct protection_domain *domain,
- unsigned long address, int map_size);
static void update_domain(struct protection_domain *domain);
+/****************************************************************************
+ *
+ * Helper functions
+ *
+ ****************************************************************************/
+
+static inline u16 get_device_id(struct device *dev)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+
+ return calc_devid(pdev->bus->number, pdev->devfn);
+}
+
+static struct iommu_dev_data *get_dev_data(struct device *dev)
+{
+ return dev->archdata.iommu;
+}
+
+/*
+ * In this function the list of preallocated protection domains is traversed to
+ * find the domain for a specific device
+ */
+static struct dma_ops_domain *find_protection_domain(u16 devid)
+{
+ struct dma_ops_domain *entry, *ret = NULL;
+ unsigned long flags;
+ u16 alias = amd_iommu_alias_table[devid];
+
+ if (list_empty(&iommu_pd_list))
+ return NULL;
+
+ spin_lock_irqsave(&iommu_pd_list_lock, flags);
+
+ list_for_each_entry(entry, &iommu_pd_list, list) {
+ if (entry->target_dev == devid ||
+ entry->target_dev == alias) {
+ ret = entry;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
+
+ return ret;
+}
+
+/*
+ * This function checks if the driver got a valid device from the caller to
+ * avoid dereferencing invalid pointers.
+ */
+static bool check_device(struct device *dev)
+{
+ u16 devid;
+
+ if (!dev || !dev->dma_mask)
+ return false;
+
+ /* No device or no PCI device */
+ if (!dev || dev->bus != &pci_bus_type)
+ return false;
+
+ devid = get_device_id(dev);
+
+ /* Out of our scope? */
+ if (devid > amd_iommu_last_bdf)
+ return false;
+
+ if (amd_iommu_rlookup_table[devid] == NULL)
+ return false;
+
+ return true;
+}
+
+static int iommu_init_device(struct device *dev)
+{
+ struct iommu_dev_data *dev_data;
+ struct pci_dev *pdev;
+ u16 devid, alias;
+
+ if (dev->archdata.iommu)
+ return 0;
+
+ dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
+ if (!dev_data)
+ return -ENOMEM;
+
+ dev_data->dev = dev;
+
+ devid = get_device_id(dev);
+ alias = amd_iommu_alias_table[devid];
+ pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff);
+ if (pdev)
+ dev_data->alias = &pdev->dev;
+
+ atomic_set(&dev_data->bind, 0);
+
+ dev->archdata.iommu = dev_data;
+
+
+ return 0;
+}
+
+static void iommu_uninit_device(struct device *dev)
+{
+ kfree(dev->archdata.iommu);
+}
#ifdef CONFIG_AMD_IOMMU_STATS
/*
@@ -90,7 +186,6 @@ DECLARE_STATS_COUNTER(alloced_io_mem);
DECLARE_STATS_COUNTER(total_map_requests);
static struct dentry *stats_dir;
-static struct dentry *de_isolate;
static struct dentry *de_fflush;
static void amd_iommu_stats_add(struct __iommu_counter *cnt)
@@ -108,9 +203,6 @@ static void amd_iommu_stats_init(void)
if (stats_dir == NULL)
return;
- de_isolate = debugfs_create_bool("isolation", 0444, stats_dir,
- (u32 *)&amd_iommu_isolate);
-
de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir,
(u32 *)&amd_iommu_unmap_flush);
@@ -130,12 +222,6 @@ static void amd_iommu_stats_init(void)
#endif
-/* returns !0 if the IOMMU is caching non-present entries in its TLB */
-static int iommu_has_npcache(struct amd_iommu *iommu)
-{
- return iommu->cap & (1UL << IOMMU_CAP_NPCACHE);
-}
-
/****************************************************************************
*
* Interrupt handling functions
@@ -199,6 +285,7 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
break;
case EVENT_TYPE_ILL_CMD:
printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
+ iommu->reset_in_progress = true;
reset_iommu_command_buffer(iommu);
dump_command(address);
break;
@@ -321,11 +408,8 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu)
status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
- if (unlikely(i == EXIT_LOOP_COUNT)) {
- spin_unlock(&iommu->lock);
- reset_iommu_command_buffer(iommu);
- spin_lock(&iommu->lock);
- }
+ if (unlikely(i == EXIT_LOOP_COUNT))
+ iommu->reset_in_progress = true;
}
/*
@@ -372,26 +456,46 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
out:
spin_unlock_irqrestore(&iommu->lock, flags);
+ if (iommu->reset_in_progress)
+ reset_iommu_command_buffer(iommu);
+
return 0;
}
+static void iommu_flush_complete(struct protection_domain *domain)
+{
+ int i;
+
+ for (i = 0; i < amd_iommus_present; ++i) {
+ if (!domain->dev_iommu[i])
+ continue;
+
+ /*
+ * Devices of this domain are behind this IOMMU
+ * We need to wait for completion of all commands.
+ */
+ iommu_completion_wait(amd_iommus[i]);
+ }
+}
+
/*
* Command send function for invalidating a device table entry
*/
-static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
+static int iommu_flush_device(struct device *dev)
{
+ struct amd_iommu *iommu;
struct iommu_cmd cmd;
- int ret;
+ u16 devid;
- BUG_ON(iommu == NULL);
+ devid = get_device_id(dev);
+ iommu = amd_iommu_rlookup_table[devid];
+ /* Build command */
memset(&cmd, 0, sizeof(cmd));
CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
cmd.data[0] = devid;
- ret = iommu_queue_command(iommu, &cmd);
-
- return ret;
+ return iommu_queue_command(iommu, &cmd);
}
static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
@@ -430,11 +534,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
* It invalidates a single PTE if the range to flush is within a single
* page. Otherwise it flushes the whole TLB of the IOMMU.
*/
-static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
- u64 address, size_t size)
+static void __iommu_flush_pages(struct protection_domain *domain,
+ u64 address, size_t size, int pde)
{
- int s = 0;
- unsigned pages = iommu_num_pages(address, size, PAGE_SIZE);
+ int s = 0, i;
+ unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE);
address &= PAGE_MASK;
@@ -447,142 +551,212 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
s = 1;
}
- iommu_queue_inv_iommu_pages(iommu, address, domid, 0, s);
- return 0;
+ for (i = 0; i < amd_iommus_present; ++i) {
+ if (!domain->dev_iommu[i])
+ continue;
+
+ /*
+ * Devices of this domain are behind this IOMMU
+ * We need a TLB flush
+ */
+ iommu_queue_inv_iommu_pages(amd_iommus[i], address,
+ domain->id, pde, s);
+ }
+
+ return;
}
-/* Flush the whole IO/TLB for a given protection domain */
-static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
+static void iommu_flush_pages(struct protection_domain *domain,
+ u64 address, size_t size)
{
- u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
-
- INC_STATS_COUNTER(domain_flush_single);
+ __iommu_flush_pages(domain, address, size, 0);
+}
- iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1);
+/* Flush the whole IO/TLB for a given protection domain */
+static void iommu_flush_tlb(struct protection_domain *domain)
+{
+ __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
}
/* Flush the whole IO/TLB for a given protection domain - including PDE */
-static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid)
+static void iommu_flush_tlb_pde(struct protection_domain *domain)
{
- u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
-
- INC_STATS_COUNTER(domain_flush_single);
-
- iommu_queue_inv_iommu_pages(iommu, address, domid, 1, 1);
+ __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
}
+
/*
- * This function flushes one domain on one IOMMU
+ * This function flushes the DTEs for all devices in domain
*/
-static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid)
+static void iommu_flush_domain_devices(struct protection_domain *domain)
{
- struct iommu_cmd cmd;
+ struct iommu_dev_data *dev_data;
unsigned long flags;
- __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
- domid, 1, 1);
+ spin_lock_irqsave(&domain->lock, flags);
- spin_lock_irqsave(&iommu->lock, flags);
- __iommu_queue_command(iommu, &cmd);
- __iommu_completion_wait(iommu);
- __iommu_wait_for_completion(iommu);
- spin_unlock_irqrestore(&iommu->lock, flags);
+ list_for_each_entry(dev_data, &domain->dev_list, list)
+ iommu_flush_device(dev_data->dev);
+
+ spin_unlock_irqrestore(&domain->lock, flags);
}
-static void flush_all_domains_on_iommu(struct amd_iommu *iommu)
+static void iommu_flush_all_domain_devices(void)
{
- int i;
+ struct protection_domain *domain;
+ unsigned long flags;
- for (i = 1; i < MAX_DOMAIN_ID; ++i) {
- if (!test_bit(i, amd_iommu_pd_alloc_bitmap))
- continue;
- flush_domain_on_iommu(iommu, i);
+ spin_lock_irqsave(&amd_iommu_pd_lock, flags);
+
+ list_for_each_entry(domain, &amd_iommu_pd_list, list) {
+ iommu_flush_domain_devices(domain);
+ iommu_flush_complete(domain);
}
+ spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
+}
+
+void amd_iommu_flush_all_devices(void)
+{
+ iommu_flush_all_domain_devices();
}
/*
- * This function is used to flush the IO/TLB for a given protection domain
- * on every IOMMU in the system
+ * This function uses heavy locking and may disable irqs for some time. But
+ * this is no issue because it is only called during resume.
*/
-static void iommu_flush_domain(u16 domid)
+void amd_iommu_flush_all_domains(void)
{
- struct amd_iommu *iommu;
+ struct protection_domain *domain;
+ unsigned long flags;
- INC_STATS_COUNTER(domain_flush_all);
+ spin_lock_irqsave(&amd_iommu_pd_lock, flags);
- for_each_iommu(iommu)
- flush_domain_on_iommu(iommu, domid);
+ list_for_each_entry(domain, &amd_iommu_pd_list, list) {
+ spin_lock(&domain->lock);
+ iommu_flush_tlb_pde(domain);
+ iommu_flush_complete(domain);
+ spin_unlock(&domain->lock);
+ }
+
+ spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
}
-void amd_iommu_flush_all_domains(void)
+static void reset_iommu_command_buffer(struct amd_iommu *iommu)
{
- struct amd_iommu *iommu;
+ pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
- for_each_iommu(iommu)
- flush_all_domains_on_iommu(iommu);
+ if (iommu->reset_in_progress)
+ panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
+
+ amd_iommu_reset_cmd_buffer(iommu);
+ amd_iommu_flush_all_devices();
+ amd_iommu_flush_all_domains();
+
+ iommu->reset_in_progress = false;
}
-static void flush_all_devices_for_iommu(struct amd_iommu *iommu)
+/****************************************************************************
+ *
+ * The functions below are used the create the page table mappings for
+ * unity mapped regions.
+ *
+ ****************************************************************************/
+
+/*
+ * This function is used to add another level to an IO page table. Adding
+ * another level increases the size of the address space by 9 bits to a size up
+ * to 64 bits.
+ */
+static bool increase_address_space(struct protection_domain *domain,
+ gfp_t gfp)
{
- int i;
+ u64 *pte;
- for (i = 0; i <= amd_iommu_last_bdf; ++i) {
- if (iommu != amd_iommu_rlookup_table[i])
- continue;
+ if (domain->mode == PAGE_MODE_6_LEVEL)
+ /* address space already 64 bit large */
+ return false;
- iommu_queue_inv_dev_entry(iommu, i);
- iommu_completion_wait(iommu);
- }
+ pte = (void *)get_zeroed_page(gfp);
+ if (!pte)
+ return false;
+
+ *pte = PM_LEVEL_PDE(domain->mode,
+ virt_to_phys(domain->pt_root));
+ domain->pt_root = pte;
+ domain->mode += 1;
+ domain->updated = true;
+
+ return true;
}
-static void flush_devices_by_domain(struct protection_domain *domain)
+static u64 *alloc_pte(struct protection_domain *domain,
+ unsigned long address,
+ int end_lvl,
+ u64 **pte_page,
+ gfp_t gfp)
{
- struct amd_iommu *iommu;
- int i;
+ u64 *pte, *page;
+ int level;
- for (i = 0; i <= amd_iommu_last_bdf; ++i) {
- if ((domain == NULL && amd_iommu_pd_table[i] == NULL) ||
- (amd_iommu_pd_table[i] != domain))
- continue;
+ while (address > PM_LEVEL_SIZE(domain->mode))
+ increase_address_space(domain, gfp);
- iommu = amd_iommu_rlookup_table[i];
- if (!iommu)
- continue;
+ level = domain->mode - 1;
+ pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+
+ while (level > end_lvl) {
+ if (!IOMMU_PTE_PRESENT(*pte)) {
+ page = (u64 *)get_zeroed_page(gfp);
+ if (!page)
+ return NULL;
+ *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
+ }
- iommu_queue_inv_dev_entry(iommu, i);
- iommu_completion_wait(iommu);
+ level -= 1;
+
+ pte = IOMMU_PTE_PAGE(*pte);
+
+ if (pte_page && level == end_lvl)
+ *pte_page = pte;
+
+ pte = &pte[PM_LEVEL_INDEX(level, address)];
}
+
+ return pte;
}
-static void reset_iommu_command_buffer(struct amd_iommu *iommu)
+/*
+ * This function checks if there is a PTE for a given dma address. If
+ * there is one, it returns the pointer to it.
+ */
+static u64 *fetch_pte(struct protection_domain *domain,
+ unsigned long address, int map_size)
{
- pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
+ int level;
+ u64 *pte;
- if (iommu->reset_in_progress)
- panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
+ level = domain->mode - 1;
+ pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
- iommu->reset_in_progress = true;
+ while (level > map_size) {
+ if (!IOMMU_PTE_PRESENT(*pte))
+ return NULL;
- amd_iommu_reset_cmd_buffer(iommu);
- flush_all_devices_for_iommu(iommu);
- flush_all_domains_on_iommu(iommu);
+ level -= 1;
- iommu->reset_in_progress = false;
-}
+ pte = IOMMU_PTE_PAGE(*pte);
+ pte = &pte[PM_LEVEL_INDEX(level, address)];
-void amd_iommu_flush_all_devices(void)
-{
- flush_devices_by_domain(NULL);
-}
+ if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
+ pte = NULL;
+ break;
+ }
+ }
-/****************************************************************************
- *
- * The functions below are used the create the page table mappings for
- * unity mapped regions.
- *
- ****************************************************************************/
+ return pte;
+}
/*
* Generic mapping functions. It maps a physical address into a DMA
@@ -654,28 +828,6 @@ static int iommu_for_unity_map(struct amd_iommu *iommu,
}
/*
- * Init the unity mappings for a specific IOMMU in the system
- *
- * Basically iterates over all unity mapping entries and applies them to
- * the default domain DMA of that IOMMU if necessary.
- */
-static int iommu_init_unity_mappings(struct amd_iommu *iommu)
-{
- struct unity_map_entry *entry;
- int ret;
-
- list_for_each_entry(entry, &amd_iommu_unity_map, list) {
- if (!iommu_for_unity_map(iommu, entry))
- continue;
- ret = dma_ops_unity_map(iommu->default_dom, entry);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/*
* This function actually applies the mapping to the page table of the
* dma_ops domain.
*/
@@ -704,6 +856,28 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
}
/*
+ * Init the unity mappings for a specific IOMMU in the system
+ *
+ * Basically iterates over all unity mapping entries and applies them to
+ * the default domain DMA of that IOMMU if necessary.
+ */
+static int iommu_init_unity_mappings(struct amd_iommu *iommu)
+{
+ struct unity_map_entry *entry;
+ int ret;
+
+ list_for_each_entry(entry, &amd_iommu_unity_map, list) {
+ if (!iommu_for_unity_map(iommu, entry))
+ continue;
+ ret = dma_ops_unity_map(iommu->default_dom, entry);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
* Inits the unity mappings required for a specific device
*/
static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
@@ -740,34 +914,23 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
*/
/*
- * This function checks if there is a PTE for a given dma address. If
- * there is one, it returns the pointer to it.
+ * Used to reserve address ranges in the aperture (e.g. for exclusion
+ * ranges.
*/
-static u64 *fetch_pte(struct protection_domain *domain,
- unsigned long address, int map_size)
+static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
+ unsigned long start_page,
+ unsigned int pages)
{
- int level;
- u64 *pte;
-
- level = domain->mode - 1;
- pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
-
- while (level > map_size) {
- if (!IOMMU_PTE_PRESENT(*pte))
- return NULL;
-
- level -= 1;
+ unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[PM_LEVEL_INDEX(level, address)];
+ if (start_page + pages > last_page)
+ pages = last_page - start_page;
- if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
- pte = NULL;
- break;
- }
+ for (i = start_page; i < start_page + pages; ++i) {
+ int index = i / APERTURE_RANGE_PAGES;
+ int page = i % APERTURE_RANGE_PAGES;
+ __set_bit(page, dom->aperture[index]->bitmap);
}
-
- return pte;
}
/*
@@ -775,11 +938,11 @@ static u64 *fetch_pte(struct protection_domain *domain,
* aperture in case of dma_ops domain allocation or address allocation
* failure.
*/
-static int alloc_new_range(struct amd_iommu *iommu,
- struct dma_ops_domain *dma_dom,
+static int alloc_new_range(struct dma_ops_domain *dma_dom,
bool populate, gfp_t gfp)
{
int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
+ struct amd_iommu *iommu;
int i;
#ifdef CONFIG_IOMMU_STRESS
@@ -819,14 +982,17 @@ static int alloc_new_range(struct amd_iommu *iommu,
dma_dom->aperture_size += APERTURE_RANGE_SIZE;
/* Intialize the exclusion range if necessary */
- if (iommu->exclusion_start &&
- iommu->exclusion_start >= dma_dom->aperture[index]->offset &&
- iommu->exclusion_start < dma_dom->aperture_size) {
- unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
- int pages = iommu_num_pages(iommu->exclusion_start,
- iommu->exclusion_length,
- PAGE_SIZE);
- dma_ops_reserve_addresses(dma_dom, startpage, pages);
+ for_each_iommu(iommu) {
+ if (iommu->exclusion_start &&
+ iommu->exclusion_start >= dma_dom->aperture[index]->offset
+ && iommu->exclusion_start < dma_dom->aperture_size) {
+ unsigned long startpage;
+ int pages = iommu_num_pages(iommu->exclusion_start,
+ iommu->exclusion_length,
+ PAGE_SIZE);
+ startpage = iommu->exclusion_start >> PAGE_SHIFT;
+ dma_ops_reserve_addresses(dma_dom, startpage, pages);
+ }
}
/*
@@ -928,7 +1094,7 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
}
if (unlikely(address == -1))
- address = bad_dma_address;
+ address = DMA_ERROR_CODE;
WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
@@ -973,6 +1139,31 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
*
****************************************************************************/
+/*
+ * This function adds a protection domain to the global protection domain list
+ */
+static void add_domain_to_list(struct protection_domain *domain)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&amd_iommu_pd_lock, flags);
+ list_add(&domain->list, &amd_iommu_pd_list);
+ spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
+}
+
+/*
+ * This function removes a protection domain to the global
+ * protection domain list
+ */
+static void del_domain_from_list(struct protection_domain *domain)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&amd_iommu_pd_lock, flags);
+ list_del(&domain->list);
+ spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
+}
+
static u16 domain_id_alloc(void)
{
unsigned long flags;
@@ -1000,26 +1191,6 @@ static void domain_id_free(int id)
write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
}
-/*
- * Used to reserve address ranges in the aperture (e.g. for exclusion
- * ranges.
- */
-static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
- unsigned long start_page,
- unsigned int pages)
-{
- unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
-
- if (start_page + pages > last_page)
- pages = last_page - start_page;
-
- for (i = start_page; i < start_page + pages; ++i) {
- int index = i / APERTURE_RANGE_PAGES;
- int page = i % APERTURE_RANGE_PAGES;
- __set_bit(page, dom->aperture[index]->bitmap);
- }
-}
-
static void free_pagetable(struct protection_domain *domain)
{
int i, j;
@@ -1061,6 +1232,8 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
if (!dom)
return;
+ del_domain_from_list(&dom->domain);
+
free_pagetable(&dom->domain);
for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
@@ -1078,7 +1251,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
* It also intializes the page table and the address allocator data
* structures required for the dma_ops interface
*/
-static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
+static struct dma_ops_domain *dma_ops_domain_alloc(void)
{
struct dma_ops_domain *dma_dom;
@@ -1091,6 +1264,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
dma_dom->domain.id = domain_id_alloc();
if (dma_dom->domain.id == 0)
goto free_dma_dom;
+ INIT_LIST_HEAD(&dma_dom->domain.dev_list);
dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
dma_dom->domain.flags = PD_DMA_OPS_MASK;
@@ -1101,7 +1275,9 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
dma_dom->need_flush = false;
dma_dom->target_dev = 0xffff;
- if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL))
+ add_domain_to_list(&dma_dom->domain);
+
+ if (alloc_new_range(dma_dom, true, GFP_KERNEL))
goto free_dma_dom;
/*
@@ -1129,22 +1305,6 @@ static bool dma_ops_domain(struct protection_domain *domain)
return domain->flags & PD_DMA_OPS_MASK;
}
-/*
- * Find out the protection domain structure for a given PCI device. This
- * will give us the pointer to the page table root for example.
- */
-static struct protection_domain *domain_for_device(u16 devid)
-{
- struct protection_domain *dom;
- unsigned long flags;
-
- read_lock_irqsave(&amd_iommu_devtable_lock, flags);
- dom = amd_iommu_pd_table[devid];
- read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-
- return dom;
-}
-
static void set_dte_entry(u16 devid, struct protection_domain *domain)
{
u64 pte_root = virt_to_phys(domain->pt_root);
@@ -1156,42 +1316,123 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain)
amd_iommu_dev_table[devid].data[2] = domain->id;
amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
+}
+
+static void clear_dte_entry(u16 devid)
+{
+ /* remove entry from the device table seen by the hardware */
+ amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
+ amd_iommu_dev_table[devid].data[1] = 0;
+ amd_iommu_dev_table[devid].data[2] = 0;
- amd_iommu_pd_table[devid] = domain;
+ amd_iommu_apply_erratum_63(devid);
+}
+
+static void do_attach(struct device *dev, struct protection_domain *domain)
+{
+ struct iommu_dev_data *dev_data;
+ struct amd_iommu *iommu;
+ u16 devid;
+
+ devid = get_device_id(dev);
+ iommu = amd_iommu_rlookup_table[devid];
+ dev_data = get_dev_data(dev);
+
+ /* Update data structures */
+ dev_data->domain = domain;
+ list_add(&dev_data->list, &domain->dev_list);
+ set_dte_entry(devid, domain);
+
+ /* Do reference counting */
+ domain->dev_iommu[iommu->index] += 1;
+ domain->dev_cnt += 1;
+
+ /* Flush the DTE entry */
+ iommu_flush_device(dev);
+}
+
+static void do_detach(struct device *dev)
+{
+ struct iommu_dev_data *dev_data;
+ struct amd_iommu *iommu;
+ u16 devid;
+
+ devid = get_device_id(dev);
+ iommu = amd_iommu_rlookup_table[devid];
+ dev_data = get_dev_data(dev);
+
+ /* decrease reference counters */
+ dev_data->domain->dev_iommu[iommu->index] -= 1;
+ dev_data->domain->dev_cnt -= 1;
+
+ /* Update data structures */
+ dev_data->domain = NULL;
+ list_del(&dev_data->list);
+ clear_dte_entry(devid);
+
+ /* Flush the DTE entry */
+ iommu_flush_device(dev);
}
/*
* If a device is not yet associated with a domain, this function does
* assigns it visible for the hardware
*/
-static void __attach_device(struct amd_iommu *iommu,
- struct protection_domain *domain,
- u16 devid)
+static int __attach_device(struct device *dev,
+ struct protection_domain *domain)
{
+ struct iommu_dev_data *dev_data, *alias_data;
+
+ dev_data = get_dev_data(dev);
+ alias_data = get_dev_data(dev_data->alias);
+
+ if (!alias_data)
+ return -EINVAL;
+
/* lock domain */
spin_lock(&domain->lock);
- /* update DTE entry */
- set_dte_entry(devid, domain);
+ /* Some sanity checks */
+ if (alias_data->domain != NULL &&
+ alias_data->domain != domain)
+ return -EBUSY;
+
+ if (dev_data->domain != NULL &&
+ dev_data->domain != domain)
+ return -EBUSY;
- domain->dev_cnt += 1;
+ /* Do real assignment */
+ if (dev_data->alias != dev) {
+ alias_data = get_dev_data(dev_data->alias);
+ if (alias_data->domain == NULL)
+ do_attach(dev_data->alias, domain);
+
+ atomic_inc(&alias_data->bind);
+ }
+
+ if (dev_data->domain == NULL)
+ do_attach(dev, domain);
+
+ atomic_inc(&dev_data->bind);
/* ready */
spin_unlock(&domain->lock);
+
+ return 0;
}
/*
* If a device is not yet associated with a domain, this function does
* assigns it visible for the hardware
*/
-static void attach_device(struct amd_iommu *iommu,
- struct protection_domain *domain,
- u16 devid)
+static int attach_device(struct device *dev,
+ struct protection_domain *domain)
{
unsigned long flags;
+ int ret;
write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- __attach_device(iommu, domain, devid);
+ ret = __attach_device(dev, domain);
write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
/*
@@ -1199,96 +1440,125 @@ static void attach_device(struct amd_iommu *iommu,
* left the caches in the IOMMU dirty. So we have to flush
* here to evict all dirty stuff.
*/
- iommu_queue_inv_dev_entry(iommu, devid);
- iommu_flush_tlb_pde(iommu, domain->id);
+ iommu_flush_tlb_pde(domain);
+
+ return ret;
}
/*
* Removes a device from a protection domain (unlocked)
*/
-static void __detach_device(struct protection_domain *domain, u16 devid)
+static void __detach_device(struct device *dev)
{
+ struct iommu_dev_data *dev_data = get_dev_data(dev);
+ struct iommu_dev_data *alias_data;
+ unsigned long flags;
- /* lock domain */
- spin_lock(&domain->lock);
+ BUG_ON(!dev_data->domain);
- /* remove domain from the lookup table */
- amd_iommu_pd_table[devid] = NULL;
+ spin_lock_irqsave(&dev_data->domain->lock, flags);
- /* remove entry from the device table seen by the hardware */
- amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
- amd_iommu_dev_table[devid].data[1] = 0;
- amd_iommu_dev_table[devid].data[2] = 0;
+ if (dev_data->alias != dev) {
+ alias_data = get_dev_data(dev_data->alias);
+ if (atomic_dec_and_test(&alias_data->bind))
+ do_detach(dev_data->alias);
+ }
- /* decrease reference counter */
- domain->dev_cnt -= 1;
+ if (atomic_dec_and_test(&dev_data->bind))
+ do_detach(dev);
- /* ready */
- spin_unlock(&domain->lock);
+ spin_unlock_irqrestore(&dev_data->domain->lock, flags);
/*
* If we run in passthrough mode the device must be assigned to the
* passthrough domain if it is detached from any other domain
*/
- if (iommu_pass_through) {
- struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
- __attach_device(iommu, pt_domain, devid);
- }
+ if (iommu_pass_through && dev_data->domain == NULL)
+ __attach_device(dev, pt_domain);
}
/*
* Removes a device from a protection domain (with devtable_lock held)
*/
-static void detach_device(struct protection_domain *domain, u16 devid)
+static void detach_device(struct device *dev)
{
unsigned long flags;
/* lock device table */
write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- __detach_device(domain, devid);
+ __detach_device(dev);
write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
}
+/*
+ * Find out the protection domain structure for a given PCI device. This
+ * will give us the pointer to the page table root for example.
+ */
+static struct protection_domain *domain_for_device(struct device *dev)
+{
+ struct protection_domain *dom;
+ struct iommu_dev_data *dev_data, *alias_data;
+ unsigned long flags;
+ u16 devid, alias;
+
+ devid = get_device_id(dev);
+ alias = amd_iommu_alias_table[devid];
+ dev_data = get_dev_data(dev);
+ alias_data = get_dev_data(dev_data->alias);
+ if (!alias_data)
+ return NULL;
+
+ read_lock_irqsave(&amd_iommu_devtable_lock, flags);
+ dom = dev_data->domain;
+ if (dom == NULL &&
+ alias_data->domain != NULL) {
+ __attach_device(dev, alias_data->domain);
+ dom = alias_data->domain;
+ }
+
+ read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+
+ return dom;
+}
+
static int device_change_notifier(struct notifier_block *nb,
unsigned long action, void *data)
{
struct device *dev = data;
- struct pci_dev *pdev = to_pci_dev(dev);
- u16 devid = calc_devid(pdev->bus->number, pdev->devfn);
+ u16 devid;
struct protection_domain *domain;
struct dma_ops_domain *dma_domain;
struct amd_iommu *iommu;
unsigned long flags;
- if (devid > amd_iommu_last_bdf)
- goto out;
-
- devid = amd_iommu_alias_table[devid];
-
- iommu = amd_iommu_rlookup_table[devid];
- if (iommu == NULL)
- goto out;
-
- domain = domain_for_device(devid);
+ if (!check_device(dev))
+ return 0;
- if (domain && !dma_ops_domain(domain))
- WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound "
- "to a non-dma-ops domain\n", dev_name(dev));
+ devid = get_device_id(dev);
+ iommu = amd_iommu_rlookup_table[devid];
switch (action) {
case BUS_NOTIFY_UNBOUND_DRIVER:
+
+ domain = domain_for_device(dev);
+
if (!domain)
goto out;
if (iommu_pass_through)
break;
- detach_device(domain, devid);
+ detach_device(dev);
break;
case BUS_NOTIFY_ADD_DEVICE:
+
+ iommu_init_device(dev);
+
+ domain = domain_for_device(dev);
+
/* allocate a protection domain if a device is added */
dma_domain = find_protection_domain(devid);
if (dma_domain)
goto out;
- dma_domain = dma_ops_domain_alloc(iommu);
+ dma_domain = dma_ops_domain_alloc();
if (!dma_domain)
goto out;
dma_domain->target_dev = devid;
@@ -1298,11 +1568,15 @@ static int device_change_notifier(struct notifier_block *nb,
spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
break;
+ case BUS_NOTIFY_DEL_DEVICE:
+
+ iommu_uninit_device(dev);
+
default:
goto out;
}
- iommu_queue_inv_dev_entry(iommu, devid);
+ iommu_flush_device(dev);
iommu_completion_wait(iommu);
out:
@@ -1320,106 +1594,46 @@ static struct notifier_block device_nb = {
*****************************************************************************/
/*
- * This function checks if the driver got a valid device from the caller to
- * avoid dereferencing invalid pointers.
- */
-static bool check_device(struct device *dev)
-{
- if (!dev || !dev->dma_mask)
- return false;
-
- return true;
-}
-
-/*
- * In this function the list of preallocated protection domains is traversed to
- * find the domain for a specific device
- */
-static struct dma_ops_domain *find_protection_domain(u16 devid)
-{
- struct dma_ops_domain *entry, *ret = NULL;
- unsigned long flags;
-
- if (list_empty(&iommu_pd_list))
- return NULL;
-
- spin_lock_irqsave(&iommu_pd_list_lock, flags);
-
- list_for_each_entry(entry, &iommu_pd_list, list) {
- if (entry->target_dev == devid) {
- ret = entry;
- break;
- }
- }
-
- spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
-
- return ret;
-}
-
-/*
* In the dma_ops path we only have the struct device. This function
* finds the corresponding IOMMU, the protection domain and the
* requestor id for a given device.
* If the device is not yet associated with a domain this is also done
* in this function.
*/
-static int get_device_resources(struct device *dev,
- struct amd_iommu **iommu,
- struct protection_domain **domain,
- u16 *bdf)
+static struct protection_domain *get_domain(struct device *dev)
{
+ struct protection_domain *domain;
struct dma_ops_domain *dma_dom;
- struct pci_dev *pcidev;
- u16 _bdf;
-
- *iommu = NULL;
- *domain = NULL;
- *bdf = 0xffff;
-
- if (dev->bus != &pci_bus_type)
- return 0;
+ u16 devid = get_device_id(dev);
- pcidev = to_pci_dev(dev);
- _bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
-
- /* device not translated by any IOMMU in the system? */
- if (_bdf > amd_iommu_last_bdf)
- return 0;
+ if (!check_device(dev))
+ return ERR_PTR(-EINVAL);
- *bdf = amd_iommu_alias_table[_bdf];
+ domain = domain_for_device(dev);
+ if (domain != NULL && !dma_ops_domain(domain))
+ return ERR_PTR(-EBUSY);
- *iommu = amd_iommu_rlookup_table[*bdf];
- if (*iommu == NULL)
- return 0;
- *domain = domain_for_device(*bdf);
- if (*domain == NULL) {
- dma_dom = find_protection_domain(*bdf);
- if (!dma_dom)
- dma_dom = (*iommu)->default_dom;
- *domain = &dma_dom->domain;
- attach_device(*iommu, *domain, *bdf);
- DUMP_printk("Using protection domain %d for device %s\n",
- (*domain)->id, dev_name(dev));
- }
+ if (domain != NULL)
+ return domain;
- if (domain_for_device(_bdf) == NULL)
- attach_device(*iommu, *domain, _bdf);
+ /* Device not bount yet - bind it */
+ dma_dom = find_protection_domain(devid);
+ if (!dma_dom)
+ dma_dom = amd_iommu_rlookup_table[devid]->default_dom;
+ attach_device(dev, &dma_dom->domain);
+ DUMP_printk("Using protection domain %d for device %s\n",
+ dma_dom->domain.id, dev_name(dev));
- return 1;
+ return &dma_dom->domain;
}
static void update_device_table(struct protection_domain *domain)
{
- unsigned long flags;
- int i;
+ struct iommu_dev_data *dev_data;
- for (i = 0; i <= amd_iommu_last_bdf; ++i) {
- if (amd_iommu_pd_table[i] != domain)
- continue;
- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- set_dte_entry(i, domain);
- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+ list_for_each_entry(dev_data, &domain->dev_list, list) {
+ u16 devid = get_device_id(dev_data->dev);
+ set_dte_entry(devid, domain);
}
}
@@ -1429,76 +1643,13 @@ static void update_domain(struct protection_domain *domain)
return;
update_device_table(domain);
- flush_devices_by_domain(domain);
- iommu_flush_domain(domain->id);
+ iommu_flush_domain_devices(domain);
+ iommu_flush_tlb_pde(domain);
domain->updated = false;
}
/*
- * This function is used to add another level to an IO page table. Adding
- * another level increases the size of the address space by 9 bits to a size up
- * to 64 bits.
- */
-static bool increase_address_space(struct protection_domain *domain,
- gfp_t gfp)
-{
- u64 *pte;
-
- if (domain->mode == PAGE_MODE_6_LEVEL)
- /* address space already 64 bit large */
- return false;
-
- pte = (void *)get_zeroed_page(gfp);
- if (!pte)
- return false;
-
- *pte = PM_LEVEL_PDE(domain->mode,
- virt_to_phys(domain->pt_root));
- domain->pt_root = pte;
- domain->mode += 1;
- domain->updated = true;
-
- return true;
-}
-
-static u64 *alloc_pte(struct protection_domain *domain,
- unsigned long address,
- int end_lvl,
- u64 **pte_page,
- gfp_t gfp)
-{
- u64 *pte, *page;
- int level;
-
- while (address > PM_LEVEL_SIZE(domain->mode))
- increase_address_space(domain, gfp);
-
- level = domain->mode - 1;
- pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
-
- while (level > end_lvl) {
- if (!IOMMU_PTE_PRESENT(*pte)) {
- page = (u64 *)get_zeroed_page(gfp);
- if (!page)
- return NULL;
- *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
- }
-
- level -= 1;
-
- pte = IOMMU_PTE_PAGE(*pte);
-
- if (pte_page && level == end_lvl)
- *pte_page = pte;
-
- pte = &pte[PM_LEVEL_INDEX(level, address)];
- }
-
- return pte;
-}
-
-/*
* This function fetches the PTE for a given address in the aperture
*/
static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
@@ -1528,8 +1679,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
* This is the generic map function. It maps one 4kb page at paddr to
* the given address in the DMA address space for the domain.
*/
-static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
- struct dma_ops_domain *dom,
+static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom,
unsigned long address,
phys_addr_t paddr,
int direction)
@@ -1542,7 +1692,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
pte = dma_ops_get_pte(dom, address);
if (!pte)
- return bad_dma_address;
+ return DMA_ERROR_CODE;
__pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
@@ -1563,8 +1713,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
/*
* The generic unmapping function for on page in the DMA address space.
*/
-static void dma_ops_domain_unmap(struct amd_iommu *iommu,
- struct dma_ops_domain *dom,
+static void dma_ops_domain_unmap(struct dma_ops_domain *dom,
unsigned long address)
{
struct aperture_range *aperture;
@@ -1595,7 +1744,6 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
* Must be called with the domain lock held.
*/
static dma_addr_t __map_single(struct device *dev,
- struct amd_iommu *iommu,
struct dma_ops_domain *dma_dom,
phys_addr_t paddr,
size_t size,
@@ -1623,7 +1771,7 @@ static dma_addr_t __map_single(struct device *dev,
retry:
address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
dma_mask);
- if (unlikely(address == bad_dma_address)) {
+ if (unlikely(address == DMA_ERROR_CODE)) {
/*
* setting next_address here will let the address
* allocator only scan the new allocated range in the
@@ -1631,7 +1779,7 @@ retry:
*/
dma_dom->next_address = dma_dom->aperture_size;
- if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC))
+ if (alloc_new_range(dma_dom, false, GFP_ATOMIC))
goto out;
/*
@@ -1643,8 +1791,8 @@ retry:
start = address;
for (i = 0; i < pages; ++i) {
- ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
- if (ret == bad_dma_address)
+ ret = dma_ops_domain_map(dma_dom, start, paddr, dir);
+ if (ret == DMA_ERROR_CODE)
goto out_unmap;
paddr += PAGE_SIZE;
@@ -1655,10 +1803,10 @@ retry:
ADD_STATS_COUNTER(alloced_io_mem, size);
if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
- iommu_flush_tlb(iommu, dma_dom->domain.id);
+ iommu_flush_tlb(&dma_dom->domain);
dma_dom->need_flush = false;
- } else if (unlikely(iommu_has_npcache(iommu)))
- iommu_flush_pages(iommu, dma_dom->domain.id, address, size);
+ } else if (unlikely(amd_iommu_np_cache))
+ iommu_flush_pages(&dma_dom->domain, address, size);
out:
return address;
@@ -1667,20 +1815,19 @@ out_unmap:
for (--i; i >= 0; --i) {
start -= PAGE_SIZE;
- dma_ops_domain_unmap(iommu, dma_dom, start);
+ dma_ops_domain_unmap(dma_dom, start);
}
dma_ops_free_addresses(dma_dom, address, pages);
- return bad_dma_address;
+ return DMA_ERROR_CODE;
}
/*
* Does the reverse of the __map_single function. Must be called with
* the domain lock held too
*/
-static void __unmap_single(struct amd_iommu *iommu,
- struct dma_ops_domain *dma_dom,
+static void __unmap_single(struct dma_ops_domain *dma_dom,
dma_addr_t dma_addr,
size_t size,
int dir)
@@ -1688,7 +1835,7 @@ static void __unmap_single(struct amd_iommu *iommu,
dma_addr_t i, start;
unsigned int pages;
- if ((dma_addr == bad_dma_address) ||
+ if ((dma_addr == DMA_ERROR_CODE) ||
(dma_addr + size > dma_dom->aperture_size))
return;
@@ -1697,7 +1844,7 @@ static void __unmap_single(struct amd_iommu *iommu,
start = dma_addr;
for (i = 0; i < pages; ++i) {
- dma_ops_domain_unmap(iommu, dma_dom, start);
+ dma_ops_domain_unmap(dma_dom, start);
start += PAGE_SIZE;
}
@@ -1706,7 +1853,7 @@ static void __unmap_single(struct amd_iommu *iommu,
dma_ops_free_addresses(dma_dom, dma_addr, pages);
if (amd_iommu_unmap_flush || dma_dom->need_flush) {
- iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size);
+ iommu_flush_pages(&dma_dom->domain, dma_addr, size);
dma_dom->need_flush = false;
}
}
@@ -1720,36 +1867,29 @@ static dma_addr_t map_page(struct device *dev, struct page *page,
struct dma_attrs *attrs)
{
unsigned long flags;
- struct amd_iommu *iommu;
struct protection_domain *domain;
- u16 devid;
dma_addr_t addr;
u64 dma_mask;
phys_addr_t paddr = page_to_phys(page) + offset;
INC_STATS_COUNTER(cnt_map_single);
- if (!check_device(dev))
- return bad_dma_address;
-
- dma_mask = *dev->dma_mask;
-
- get_device_resources(dev, &iommu, &domain, &devid);
-
- if (iommu == NULL || domain == NULL)
- /* device not handled by any AMD IOMMU */
+ domain = get_domain(dev);
+ if (PTR_ERR(domain) == -EINVAL)
return (dma_addr_t)paddr;
+ else if (IS_ERR(domain))
+ return DMA_ERROR_CODE;
- if (!dma_ops_domain(domain))
- return bad_dma_address;
+ dma_mask = *dev->dma_mask;
spin_lock_irqsave(&domain->lock, flags);
- addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false,
+
+ addr = __map_single(dev, domain->priv, paddr, size, dir, false,
dma_mask);
- if (addr == bad_dma_address)
+ if (addr == DMA_ERROR_CODE)
goto out;
- iommu_completion_wait(iommu);
+ iommu_flush_complete(domain);
out:
spin_unlock_irqrestore(&domain->lock, flags);
@@ -1764,25 +1904,19 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
enum dma_data_direction dir, struct dma_attrs *attrs)
{
unsigned long flags;
- struct amd_iommu *iommu;
struct protection_domain *domain;
- u16 devid;
INC_STATS_COUNTER(cnt_unmap_single);
- if (!check_device(dev) ||
- !get_device_resources(dev, &iommu, &domain, &devid))
- /* device not handled by any AMD IOMMU */
- return;
-
- if (!dma_ops_domain(domain))
+ domain = get_domain(dev);
+ if (IS_ERR(domain))
return;
spin_lock_irqsave(&domain->lock, flags);
- __unmap_single(iommu, domain->priv, dma_addr, size, dir);
+ __unmap_single(domain->priv, dma_addr, size, dir);
- iommu_completion_wait(iommu);
+ iommu_flush_complete(domain);
spin_unlock_irqrestore(&domain->lock, flags);
}
@@ -1814,9 +1948,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
struct dma_attrs *attrs)
{
unsigned long flags;
- struct amd_iommu *iommu;
struct protection_domain *domain;
- u16 devid;
int i;
struct scatterlist *s;
phys_addr_t paddr;
@@ -1825,25 +1957,20 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
INC_STATS_COUNTER(cnt_map_sg);
- if (!check_device(dev))
+ domain = get_domain(dev);
+ if (PTR_ERR(domain) == -EINVAL)
+ return map_sg_no_iommu(dev, sglist, nelems, dir);
+ else if (IS_ERR(domain))
return 0;
dma_mask = *dev->dma_mask;
- get_device_resources(dev, &iommu, &domain, &devid);
-
- if (!iommu || !domain)
- return map_sg_no_iommu(dev, sglist, nelems, dir);
-
- if (!dma_ops_domain(domain))
- return 0;
-
spin_lock_irqsave(&domain->lock, flags);
for_each_sg(sglist, s, nelems, i) {
paddr = sg_phys(s);
- s->dma_address = __map_single(dev, iommu, domain->priv,
+ s->dma_address = __map_single(dev, domain->priv,
paddr, s->length, dir, false,
dma_mask);
@@ -1854,7 +1981,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
goto unmap;
}
- iommu_completion_wait(iommu);
+ iommu_flush_complete(domain);
out:
spin_unlock_irqrestore(&domain->lock, flags);
@@ -1863,7 +1990,7 @@ out:
unmap:
for_each_sg(sglist, s, mapped_elems, i) {
if (s->dma_address)
- __unmap_single(iommu, domain->priv, s->dma_address,
+ __unmap_single(domain->priv, s->dma_address,
s->dma_length, dir);
s->dma_address = s->dma_length = 0;
}
@@ -1882,30 +2009,25 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
struct dma_attrs *attrs)
{
unsigned long flags;
- struct amd_iommu *iommu;
struct protection_domain *domain;
struct scatterlist *s;
- u16 devid;
int i;
INC_STATS_COUNTER(cnt_unmap_sg);
- if (!check_device(dev) ||
- !get_device_resources(dev, &iommu, &domain, &devid))
- return;
-
- if (!dma_ops_domain(domain))
+ domain = get_domain(dev);
+ if (IS_ERR(domain))
return;
spin_lock_irqsave(&domain->lock, flags);
for_each_sg(sglist, s, nelems, i) {
- __unmap_single(iommu, domain->priv, s->dma_address,
+ __unmap_single(domain->priv, s->dma_address,
s->dma_length, dir);
s->dma_address = s->dma_length = 0;
}
- iommu_completion_wait(iommu);
+ iommu_flush_complete(domain);
spin_unlock_irqrestore(&domain->lock, flags);
}
@@ -1918,49 +2040,44 @@ static void *alloc_coherent(struct device *dev, size_t size,
{
unsigned long flags;
void *virt_addr;
- struct amd_iommu *iommu;
struct protection_domain *domain;
- u16 devid;
phys_addr_t paddr;
u64 dma_mask = dev->coherent_dma_mask;
INC_STATS_COUNTER(cnt_alloc_coherent);
- if (!check_device(dev))
+ domain = get_domain(dev);
+ if (PTR_ERR(domain) == -EINVAL) {
+ virt_addr = (void *)__get_free_pages(flag, get_order(size));
+ *dma_addr = __pa(virt_addr);
+ return virt_addr;
+ } else if (IS_ERR(domain))
return NULL;
- if (!get_device_resources(dev, &iommu, &domain, &devid))
- flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+ dma_mask = dev->coherent_dma_mask;
+ flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+ flag |= __GFP_ZERO;
- flag |= __GFP_ZERO;
virt_addr = (void *)__get_free_pages(flag, get_order(size));
if (!virt_addr)
return NULL;
paddr = virt_to_phys(virt_addr);
- if (!iommu || !domain) {
- *dma_addr = (dma_addr_t)paddr;
- return virt_addr;
- }
-
- if (!dma_ops_domain(domain))
- goto out_free;
-
if (!dma_mask)
dma_mask = *dev->dma_mask;
spin_lock_irqsave(&domain->lock, flags);
- *dma_addr = __map_single(dev, iommu, domain->priv, paddr,
+ *dma_addr = __map_single(dev, domain->priv, paddr,
size, DMA_BIDIRECTIONAL, true, dma_mask);
- if (*dma_addr == bad_dma_address) {
+ if (*dma_addr == DMA_ERROR_CODE) {
spin_unlock_irqrestore(&domain->lock, flags);
goto out_free;
}
- iommu_completion_wait(iommu);
+ iommu_flush_complete(domain);
spin_unlock_irqrestore(&domain->lock, flags);
@@ -1980,28 +2097,19 @@ static void free_coherent(struct device *dev, size_t size,
void *virt_addr, dma_addr_t dma_addr)
{
unsigned long flags;
- struct amd_iommu *iommu;
struct protection_domain *domain;
- u16 devid;
INC_STATS_COUNTER(cnt_free_coherent);
- if (!check_device(dev))
- return;
-
- get_device_resources(dev, &iommu, &domain, &devid);
-
- if (!iommu || !domain)
- goto free_mem;
-
- if (!dma_ops_domain(domain))
+ domain = get_domain(dev);
+ if (IS_ERR(domain))
goto free_mem;
spin_lock_irqsave(&domain->lock, flags);
- __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
+ __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
- iommu_completion_wait(iommu);
+ iommu_flush_complete(domain);
spin_unlock_irqrestore(&domain->lock, flags);
@@ -2015,22 +2123,7 @@ free_mem:
*/
static int amd_iommu_dma_supported(struct device *dev, u64 mask)
{
- u16 bdf;
- struct pci_dev *pcidev;
-
- /* No device or no PCI device */
- if (!dev || dev->bus != &pci_bus_type)
- return 0;
-
- pcidev = to_pci_dev(dev);
-
- bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
-
- /* Out of our scope? */
- if (bdf > amd_iommu_last_bdf)
- return 0;
-
- return 1;
+ return check_device(dev);
}
/*
@@ -2044,25 +2137,30 @@ static void prealloc_protection_domains(void)
{
struct pci_dev *dev = NULL;
struct dma_ops_domain *dma_dom;
- struct amd_iommu *iommu;
u16 devid;
while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
- devid = calc_devid(dev->bus->number, dev->devfn);
- if (devid > amd_iommu_last_bdf)
- continue;
- devid = amd_iommu_alias_table[devid];
- if (domain_for_device(devid))
+
+ /* Do we handle this device? */
+ if (!check_device(&dev->dev))
continue;
- iommu = amd_iommu_rlookup_table[devid];
- if (!iommu)
+
+ iommu_init_device(&dev->dev);
+
+ /* Is there already any domain for it? */
+ if (domain_for_device(&dev->dev))
continue;
- dma_dom = dma_ops_domain_alloc(iommu);
+
+ devid = get_device_id(&dev->dev);
+
+ dma_dom = dma_ops_domain_alloc();
if (!dma_dom)
continue;
init_unity_mappings_for_device(dma_dom, devid);
dma_dom->target_dev = devid;
+ attach_device(&dev->dev, &dma_dom->domain);
+
list_add_tail(&dma_dom->list, &iommu_pd_list);
}
}
@@ -2091,7 +2189,7 @@ int __init amd_iommu_init_dma_ops(void)
* protection domain will be assigned to the default one.
*/
for_each_iommu(iommu) {
- iommu->default_dom = dma_ops_domain_alloc(iommu);
+ iommu->default_dom = dma_ops_domain_alloc();
if (iommu->default_dom == NULL)
return -ENOMEM;
iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
@@ -2101,15 +2199,12 @@ int __init amd_iommu_init_dma_ops(void)
}
/*
- * If device isolation is enabled, pre-allocate the protection
- * domains for each device.
+ * Pre-allocate the protection domains for each device.
*/
- if (amd_iommu_isolate)
- prealloc_protection_domains();
+ prealloc_protection_domains();
iommu_detected = 1;
- force_iommu = 1;
- bad_dma_address = 0;
+ swiotlb = 0;
#ifdef CONFIG_GART_IOMMU
gart_iommu_aperture_disabled = 1;
gart_iommu_aperture = 0;
@@ -2148,14 +2243,17 @@ free_domains:
static void cleanup_domain(struct protection_domain *domain)
{
+ struct iommu_dev_data *dev_data, *next;
unsigned long flags;
- u16 devid;
write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- for (devid = 0; devid <= amd_iommu_last_bdf; ++devid)
- if (amd_iommu_pd_table[devid] == domain)
- __detach_device(domain, devid);
+ list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
+ struct device *dev = dev_data->dev;
+
+ do_detach(dev);
+ atomic_set(&dev_data->bind, 0);
+ }
write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
}
@@ -2165,6 +2263,8 @@ static void protection_domain_free(struct protection_domain *domain)
if (!domain)
return;
+ del_domain_from_list(domain);
+
if (domain->id)
domain_id_free(domain->id);
@@ -2183,6 +2283,9 @@ static struct protection_domain *protection_domain_alloc(void)
domain->id = domain_id_alloc();
if (!domain->id)
goto out_err;
+ INIT_LIST_HEAD(&domain->dev_list);
+
+ add_domain_to_list(domain);
return domain;
@@ -2239,26 +2342,23 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom)
static void amd_iommu_detach_device(struct iommu_domain *dom,
struct device *dev)
{
- struct protection_domain *domain = dom->priv;
+ struct iommu_dev_data *dev_data = dev->archdata.iommu;
struct amd_iommu *iommu;
- struct pci_dev *pdev;
u16 devid;
- if (dev->bus != &pci_bus_type)
+ if (!check_device(dev))
return;
- pdev = to_pci_dev(dev);
-
- devid = calc_devid(pdev->bus->number, pdev->devfn);
+ devid = get_device_id(dev);
- if (devid > 0)
- detach_device(domain, devid);
+ if (dev_data->domain != NULL)
+ detach_device(dev);
iommu = amd_iommu_rlookup_table[devid];
if (!iommu)
return;
- iommu_queue_inv_dev_entry(iommu, devid);
+ iommu_flush_device(dev);
iommu_completion_wait(iommu);
}
@@ -2266,35 +2366,30 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
struct device *dev)
{
struct protection_domain *domain = dom->priv;
- struct protection_domain *old_domain;
+ struct iommu_dev_data *dev_data;
struct amd_iommu *iommu;
- struct pci_dev *pdev;
+ int ret;
u16 devid;
- if (dev->bus != &pci_bus_type)
+ if (!check_device(dev))
return -EINVAL;
- pdev = to_pci_dev(dev);
+ dev_data = dev->archdata.iommu;
- devid = calc_devid(pdev->bus->number, pdev->devfn);
-
- if (devid >= amd_iommu_last_bdf ||
- devid != amd_iommu_alias_table[devid])
- return -EINVAL;
+ devid = get_device_id(dev);
iommu = amd_iommu_rlookup_table[devid];
if (!iommu)
return -EINVAL;
- old_domain = domain_for_device(devid);
- if (old_domain)
- detach_device(old_domain, devid);
+ if (dev_data->domain)
+ detach_device(dev);
- attach_device(iommu, domain, devid);
+ ret = attach_device(dev, domain);
iommu_completion_wait(iommu);
- return 0;
+ return ret;
}
static int amd_iommu_map_range(struct iommu_domain *dom,
@@ -2340,7 +2435,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
iova += PAGE_SIZE;
}
- iommu_flush_domain(domain->id);
+ iommu_flush_tlb_pde(domain);
}
static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
@@ -2391,8 +2486,9 @@ static struct iommu_ops amd_iommu_ops = {
int __init amd_iommu_init_passthrough(void)
{
+ struct amd_iommu *iommu;
struct pci_dev *dev = NULL;
- u16 devid, devid2;
+ u16 devid;
/* allocate passthroug domain */
pt_domain = protection_domain_alloc();
@@ -2402,20 +2498,17 @@ int __init amd_iommu_init_passthrough(void)
pt_domain->mode |= PAGE_MODE_NONE;
while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
- struct amd_iommu *iommu;
- devid = calc_devid(dev->bus->number, dev->devfn);
- if (devid > amd_iommu_last_bdf)
+ if (!check_device(&dev->dev))
continue;
- devid2 = amd_iommu_alias_table[devid];
+ devid = get_device_id(&dev->dev);
- iommu = amd_iommu_rlookup_table[devid2];
+ iommu = amd_iommu_rlookup_table[devid];
if (!iommu)
continue;
- __attach_device(iommu, pt_domain, devid);
- __attach_device(iommu, pt_domain, devid2);
+ attach_device(&dev->dev, pt_domain);
}
pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index b4b61d462dc..7ffc3996523 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
* Author: Joerg Roedel <joerg.roedel@amd.com>
* Leo Duran <leo.duran@amd.com>
*
@@ -25,10 +25,12 @@
#include <linux/interrupt.h>
#include <linux/msi.h>
#include <asm/pci-direct.h>
+#include <asm/amd_iommu_proto.h>
#include <asm/amd_iommu_types.h>
#include <asm/amd_iommu.h>
#include <asm/iommu.h>
#include <asm/gart.h>
+#include <asm/x86_init.h>
/*
* definitions for the ACPI scanning code
@@ -123,18 +125,24 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have
to handle */
LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
we find in ACPI */
-#ifdef CONFIG_IOMMU_STRESS
-bool amd_iommu_isolate = false;
-#else
-bool amd_iommu_isolate = true; /* if true, device isolation is
- enabled */
-#endif
-
bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
system */
+/* Array to assign indices to IOMMUs*/
+struct amd_iommu *amd_iommus[MAX_IOMMUS];
+int amd_iommus_present;
+
+/* IOMMUs have a non-present cache? */
+bool amd_iommu_np_cache __read_mostly;
+
+/*
+ * List of protection domains - used during resume
+ */
+LIST_HEAD(amd_iommu_pd_list);
+spinlock_t amd_iommu_pd_lock;
+
/*
* Pointer to the device table which is shared by all AMD IOMMUs
* it is indexed by the PCI device id or the HT unit id and contains
@@ -157,12 +165,6 @@ u16 *amd_iommu_alias_table;
struct amd_iommu **amd_iommu_rlookup_table;
/*
- * The pd table (protection domain table) is used to find the protection domain
- * data structure a device belongs to. Indexed with the PCI device id too.
- */
-struct protection_domain **amd_iommu_pd_table;
-
-/*
* AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap
* to know which ones are already in use.
*/
@@ -240,7 +242,7 @@ static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
}
-static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
+static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
{
u32 ctrl;
@@ -519,6 +521,26 @@ static void set_dev_entry_bit(u16 devid, u8 bit)
amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
}
+static int get_dev_entry_bit(u16 devid, u8 bit)
+{
+ int i = (bit >> 5) & 0x07;
+ int _bit = bit & 0x1f;
+
+ return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit;
+}
+
+
+void amd_iommu_apply_erratum_63(u16 devid)
+{
+ int sysmgt;
+
+ sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) |
+ (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1);
+
+ if (sysmgt == 0x01)
+ set_dev_entry_bit(devid, DEV_ENTRY_IW);
+}
+
/* Writes the specific IOMMU for a device into the rlookup table */
static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
{
@@ -547,6 +569,8 @@ static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
if (flags & ACPI_DEVFLAG_LINT1)
set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
+ amd_iommu_apply_erratum_63(devid);
+
set_iommu_for_device(iommu, devid);
}
@@ -816,7 +840,18 @@ static void __init free_iommu_all(void)
static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
{
spin_lock_init(&iommu->lock);
+
+ /* Add IOMMU to internal data structures */
list_add_tail(&iommu->list, &amd_iommu_list);
+ iommu->index = amd_iommus_present++;
+
+ if (unlikely(iommu->index >= MAX_IOMMUS)) {
+ WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n");
+ return -ENOSYS;
+ }
+
+ /* Index is fine - add IOMMU to the array */
+ amd_iommus[iommu->index] = iommu;
/*
* Copy data from ACPI table entry to the iommu struct
@@ -846,6 +881,9 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
init_iommu_from_acpi(iommu, h);
init_iommu_devices(iommu);
+ if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
+ amd_iommu_np_cache = true;
+
return pci_enable_device(iommu->dev);
}
@@ -903,7 +941,7 @@ static int __init init_iommu_all(struct acpi_table_header *table)
*
****************************************************************************/
-static int __init iommu_setup_msi(struct amd_iommu *iommu)
+static int iommu_setup_msi(struct amd_iommu *iommu)
{
int r;
@@ -1154,19 +1192,10 @@ static struct sys_device device_amd_iommu = {
* functions. Finally it prints some information about AMD IOMMUs and
* the driver state and enables the hardware.
*/
-int __init amd_iommu_init(void)
+static int __init amd_iommu_init(void)
{
int i, ret = 0;
-
- if (no_iommu) {
- printk(KERN_INFO "AMD-Vi disabled by kernel command line\n");
- return 0;
- }
-
- if (!amd_iommu_detected)
- return -ENODEV;
-
/*
* First parse ACPI tables to find the largest Bus/Dev/Func
* we need to handle. Upon this information the shared data
@@ -1203,15 +1232,6 @@ int __init amd_iommu_init(void)
if (amd_iommu_rlookup_table == NULL)
goto free;
- /*
- * Protection Domain table - maps devices to protection domains
- * This table has the same size as the rlookup_table
- */
- amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
- get_order(rlookup_table_size));
- if (amd_iommu_pd_table == NULL)
- goto free;
-
amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
GFP_KERNEL | __GFP_ZERO,
get_order(MAX_DOMAIN_ID/8));
@@ -1233,6 +1253,8 @@ int __init amd_iommu_init(void)
*/
amd_iommu_pd_alloc_bitmap[0] = 1;
+ spin_lock_init(&amd_iommu_pd_lock);
+
/*
* now the data structures are allocated and basically initialized
* start the real acpi table scan
@@ -1264,17 +1286,12 @@ int __init amd_iommu_init(void)
if (iommu_pass_through)
goto out;
- printk(KERN_INFO "AMD-Vi: device isolation ");
- if (amd_iommu_isolate)
- printk("enabled\n");
- else
- printk("disabled\n");
-
if (amd_iommu_unmap_flush)
printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
else
printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
+ x86_platform.iommu_shutdown = disable_iommus;
out:
return ret;
@@ -1282,9 +1299,6 @@ free:
free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
get_order(MAX_DOMAIN_ID/8));
- free_pages((unsigned long)amd_iommu_pd_table,
- get_order(rlookup_table_size));
-
free_pages((unsigned long)amd_iommu_rlookup_table,
get_order(rlookup_table_size));
@@ -1301,11 +1315,6 @@ free:
goto out;
}
-void amd_iommu_shutdown(void)
-{
- disable_iommus();
-}
-
/****************************************************************************
*
* Early detect code. This code runs at IOMMU detection time in the DMA
@@ -1320,16 +1329,13 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
void __init amd_iommu_detect(void)
{
- if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture))
+ if (no_iommu || (iommu_detected && !gart_iommu_aperture))
return;
if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
iommu_detected = 1;
amd_iommu_detected = 1;
-#ifdef CONFIG_GART_IOMMU
- gart_iommu_aperture_disabled = 1;
- gart_iommu_aperture = 0;
-#endif
+ x86_init.iommu.iommu_init = amd_iommu_init;
}
}
@@ -1350,10 +1356,6 @@ static int __init parse_amd_iommu_dump(char *str)
static int __init parse_amd_iommu_options(char *str)
{
for (; *str; ++str) {
- if (strncmp(str, "isolate", 7) == 0)
- amd_iommu_isolate = true;
- if (strncmp(str, "share", 5) == 0)
- amd_iommu_isolate = false;
if (strncmp(str, "fullflush", 9) == 0)
amd_iommu_unmap_flush = true;
}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 128111d8ffe..e0dfb6856aa 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -28,6 +28,7 @@
#include <asm/pci-direct.h>
#include <asm/dma.h>
#include <asm/k8.h>
+#include <asm/x86_init.h>
int gart_iommu_aperture;
int gart_iommu_aperture_disabled __initdata;
@@ -400,6 +401,7 @@ void __init gart_iommu_hole_init(void)
iommu_detected = 1;
gart_iommu_aperture = 1;
+ x86_init.iommu.iommu_init = gart_iommu_init;
aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7;
aper_size = (32 * 1024 * 1024) << aper_order;
@@ -456,7 +458,7 @@ out:
if (aper_alloc) {
/* Got the aperture from the AGP bridge */
- } else if (swiotlb && !valid_agp) {
+ } else if (!valid_agp) {
/* Do nothing */
} else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) ||
force_iommu ||
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index da7b7b9f8bd..565c1bfc507 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,7 +2,7 @@
# Makefile for local APIC drivers and for the IO-APIC code
#
-obj-$(CONFIG_X86_LOCAL_APIC) += apic.o probe_$(BITS).o ipi.o nmi.o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
obj-$(CONFIG_SMP) += ipi.o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 159740decc4..ad8c75b9e45 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -14,7 +14,7 @@
* Mikael Pettersson : PM converted to driver model.
*/
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
#include <linux/kernel_stat.h>
#include <linux/mc146818rtc.h>
#include <linux/acpi_pmtmr.h>
@@ -35,7 +35,8 @@
#include <linux/smp.h>
#include <linux/mm.h>
-#include <asm/perf_counter.h>
+#include <asm/perf_event.h>
+#include <asm/x86_init.h>
#include <asm/pgalloc.h>
#include <asm/atomic.h>
#include <asm/mpspec.h>
@@ -61,7 +62,7 @@ unsigned int boot_cpu_physical_apicid = -1U;
/*
* The highest APIC ID seen during enumeration.
*
- * This determines the messaging protocol we can use: if all APIC IDs
+ * On AMD, this determines the messaging protocol we can use: if all APIC IDs
* are in the 0 ... 7 range, then we can use logical addressing which
* has some performance advantages (better broadcasting).
*
@@ -240,28 +241,13 @@ static int modern_apic(void)
}
/*
- * bare function to substitute write operation
- * and it's _that_ fast :)
- */
-static void native_apic_write_dummy(u32 reg, u32 v)
-{
- WARN_ON_ONCE((cpu_has_apic || !disable_apic));
-}
-
-static u32 native_apic_read_dummy(u32 reg)
-{
- WARN_ON_ONCE((cpu_has_apic && !disable_apic));
- return 0;
-}
-
-/*
- * right after this call apic->write/read doesn't do anything
- * note that there is no restore operation it works one way
+ * right after this call apic become NOOP driven
+ * so apic->write/read doesn't do anything
*/
void apic_disable(void)
{
- apic->read = native_apic_read_dummy;
- apic->write = native_apic_write_dummy;
+ pr_info("APIC: switched to apic NOOP\n");
+ apic = &apic_noop;
}
void native_apic_wait_icr_idle(void)
@@ -458,7 +444,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
v = apic_read(APIC_LVTT);
v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
apic_write(APIC_LVTT, v);
- apic_write(APIC_TMICT, 0xffffffff);
+ apic_write(APIC_TMICT, 0);
break;
case CLOCK_EVT_MODE_RESUME:
/* Nothing to do here */
@@ -978,7 +964,7 @@ void lapic_shutdown(void)
{
unsigned long flags;
- if (!cpu_has_apic)
+ if (!cpu_has_apic && !apic_from_smp_config())
return;
local_irq_save(flags);
@@ -1188,7 +1174,7 @@ void __cpuinit setup_local_APIC(void)
apic_write(APIC_ESR, 0);
}
#endif
- perf_counters_lapic_init();
+ perf_events_lapic_init();
preempt_disable();
@@ -1196,8 +1182,7 @@ void __cpuinit setup_local_APIC(void)
* Double-check whether this APIC is really registered.
* This is meaningless in clustered apic mode, so we skip it.
*/
- if (!apic->apic_id_registered())
- BUG();
+ BUG_ON(!apic->apic_id_registered());
/*
* Intel recommends to set DFR, LDR and TPR before enabling
@@ -1392,14 +1377,11 @@ void __init enable_IR_x2apic(void)
unsigned long flags;
struct IO_APIC_route_entry **ioapic_entries = NULL;
int ret, x2apic_enabled = 0;
- int dmar_table_init_ret = 0;
+ int dmar_table_init_ret;
-#ifdef CONFIG_INTR_REMAP
dmar_table_init_ret = dmar_table_init();
- if (dmar_table_init_ret)
- pr_debug("dmar_table_init() failed with %d:\n",
- dmar_table_init_ret);
-#endif
+ if (dmar_table_init_ret && !x2apic_supported())
+ return;
ioapic_entries = alloc_ioapic_entries();
if (!ioapic_entries) {
@@ -1709,7 +1691,7 @@ int __init APIC_init_uniprocessor(void)
localise_nmi_watchdog();
#endif
- setup_boot_clock();
+ x86_init.timers.setup_percpu_clockev();
#ifdef CONFIG_X86_64
check_nmi_watchdog();
#endif
@@ -1916,24 +1898,14 @@ void __cpuinit generic_processor_info(int apicid, int version)
max_physical_apicid = apicid;
#ifdef CONFIG_X86_32
- /*
- * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
- * but we need to work other dependencies like SMP_SUSPEND etc
- * before this can be done without some confusion.
- * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
- * - Ashok Raj <ashok.raj@intel.com>
- */
- if (max_physical_apicid >= 8) {
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_INTEL:
- if (!APIC_XAPIC(version)) {
- def_to_bigsmp = 0;
- break;
- }
- /* If P4 and above fall through */
- case X86_VENDOR_AMD:
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ if (num_processors > 8)
+ def_to_bigsmp = 1;
+ break;
+ case X86_VENDOR_AMD:
+ if (max_physical_apicid >= 8)
def_to_bigsmp = 1;
- }
}
#endif
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
new file mode 100644
index 00000000000..d9acc3bee0f
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -0,0 +1,200 @@
+/*
+ * NOOP APIC driver.
+ *
+ * Does almost nothing and should be substituted by a real apic driver via
+ * probe routine.
+ *
+ * Though in case if apic is disabled (for some reason) we try
+ * to not uglify the caller's code and allow to call (some) apic routines
+ * like self-ipi, etc...
+ */
+
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <asm/fixmap.h>
+#include <asm/mpspec.h>
+#include <asm/apicdef.h>
+#include <asm/apic.h>
+#include <asm/setup.h>
+
+#include <linux/smp.h>
+#include <asm/ipi.h>
+
+#include <linux/interrupt.h>
+#include <asm/acpi.h>
+#include <asm/e820.h>
+
+static void noop_init_apic_ldr(void) { }
+static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { }
+static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { }
+static void noop_send_IPI_allbutself(int vector) { }
+static void noop_send_IPI_all(int vector) { }
+static void noop_send_IPI_self(int vector) { }
+static void noop_apic_wait_icr_idle(void) { }
+static void noop_apic_icr_write(u32 low, u32 id) { }
+
+static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip)
+{
+ return -1;
+}
+
+static u32 noop_safe_apic_wait_icr_idle(void)
+{
+ return 0;
+}
+
+static u64 noop_apic_icr_read(void)
+{
+ return 0;
+}
+
+static int noop_cpu_to_logical_apicid(int cpu)
+{
+ return 0;
+}
+
+static int noop_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+ return 0;
+}
+
+static unsigned int noop_get_apic_id(unsigned long x)
+{
+ return 0;
+}
+
+static int noop_probe(void)
+{
+ /*
+ * NOOP apic should not ever be
+ * enabled via probe routine
+ */
+ return 0;
+}
+
+static int noop_apic_id_registered(void)
+{
+ /*
+ * if we would be really "pedantic"
+ * we should pass read_apic_id() here
+ * but since NOOP suppose APIC ID = 0
+ * lets save a few cycles
+ */
+ return physid_isset(0, phys_cpu_present_map);
+}
+
+static const struct cpumask *noop_target_cpus(void)
+{
+ /* only BSP here */
+ return cpumask_of(0);
+}
+
+static unsigned long noop_check_apicid_used(physid_mask_t *map, int apicid)
+{
+ return physid_isset(apicid, *map);
+}
+
+static unsigned long noop_check_apicid_present(int bit)
+{
+ return physid_isset(bit, phys_cpu_present_map);
+}
+
+static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+ if (cpu != 0)
+ pr_warning("APIC: Vector allocated for non-BSP cpu\n");
+ cpumask_clear(retmask);
+ cpumask_set_cpu(cpu, retmask);
+}
+
+int noop_apicid_to_node(int logical_apicid)
+{
+ /* we're always on node 0 */
+ return 0;
+}
+
+static u32 noop_apic_read(u32 reg)
+{
+ WARN_ON_ONCE((cpu_has_apic && !disable_apic));
+ return 0;
+}
+
+static void noop_apic_write(u32 reg, u32 v)
+{
+ WARN_ON_ONCE((cpu_has_apic || !disable_apic));
+}
+
+struct apic apic_noop = {
+ .name = "noop",
+ .probe = noop_probe,
+ .acpi_madt_oem_check = NULL,
+
+ .apic_id_registered = noop_apic_id_registered,
+
+ .irq_delivery_mode = dest_LowestPrio,
+ /* logical delivery broadcast to all CPUs: */
+ .irq_dest_mode = 1,
+
+ .target_cpus = noop_target_cpus,
+ .disable_esr = 0,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = noop_check_apicid_used,
+ .check_apicid_present = noop_check_apicid_present,
+
+ .vector_allocation_domain = noop_vector_allocation_domain,
+ .init_apic_ldr = noop_init_apic_ldr,
+
+ .ioapic_phys_id_map = default_ioapic_phys_id_map,
+ .setup_apic_routing = NULL,
+ .multi_timer_check = NULL,
+ .apicid_to_node = noop_apicid_to_node,
+
+ .cpu_to_logical_apicid = noop_cpu_to_logical_apicid,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = physid_set_mask_of_physid,
+
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+
+ .phys_pkg_id = noop_phys_pkg_id,
+
+ .mps_oem_check = NULL,
+
+ .get_apic_id = noop_get_apic_id,
+ .set_apic_id = NULL,
+ .apic_id_mask = 0x0F << 24,
+
+ .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = noop_send_IPI_mask,
+ .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = noop_send_IPI_allbutself,
+ .send_IPI_all = noop_send_IPI_all,
+ .send_IPI_self = noop_send_IPI_self,
+
+ .wakeup_secondary_cpu = noop_wakeup_secondary_cpu,
+
+ /* should be safe */
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+
+ .wait_for_init_deassert = NULL,
+
+ .smp_callin_clear_local_apic = NULL,
+ .inquire_remote_apic = NULL,
+
+ .read = noop_apic_read,
+ .write = noop_apic_write,
+ .icr_read = noop_apic_icr_read,
+ .icr_write = noop_apic_icr_write,
+ .wait_icr_idle = noop_apic_wait_icr_idle,
+ .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle,
+};
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 676cdac385c..38dcecfa581 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -35,7 +35,7 @@ static const struct cpumask *bigsmp_target_cpus(void)
#endif
}
-static unsigned long bigsmp_check_apicid_used(physid_mask_t bitmap, int apicid)
+static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
{
return 0;
}
@@ -93,11 +93,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu)
return BAD_APICID;
}
-static physid_mask_t bigsmp_apicid_to_cpu_present(int phys_apicid)
-{
- return physid_mask_of_physid(phys_apicid);
-}
-
/* Mapping from cpu number to logical apicid */
static inline int bigsmp_cpu_to_logical_apicid(int cpu)
{
@@ -106,13 +101,13 @@ static inline int bigsmp_cpu_to_logical_apicid(int cpu)
return cpu_physical_id(cpu);
}
-static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map)
+static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
{
/* For clustered we don't have a good way to do this yet - hack */
- return physids_promote(0xFFL);
+ physids_promote(0xFFL, retmap);
}
-static int bigsmp_check_phys_apicid_present(int boot_cpu_physical_apicid)
+static int bigsmp_check_phys_apicid_present(int phys_apicid)
{
return 1;
}
@@ -230,7 +225,7 @@ struct apic apic_bigsmp = {
.apicid_to_node = bigsmp_apicid_to_node,
.cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid,
.cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
- .apicid_to_cpu_present = bigsmp_apicid_to_cpu_present,
+ .apicid_to_cpu_present = physid_set_mask_of_physid,
.setup_portio_remap = NULL,
.check_phys_apicid_present = bigsmp_check_phys_apicid_present,
.enable_apic_mode = NULL,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 89174f847b4..e85f8fb7f8e 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -466,11 +466,11 @@ static const struct cpumask *es7000_target_cpus(void)
return cpumask_of(smp_processor_id());
}
-static unsigned long
-es7000_check_apicid_used(physid_mask_t bitmap, int apicid)
+static unsigned long es7000_check_apicid_used(physid_mask_t *map, int apicid)
{
return 0;
}
+
static unsigned long es7000_check_apicid_present(int bit)
{
return physid_isset(bit, phys_cpu_present_map);
@@ -539,14 +539,10 @@ static int es7000_cpu_present_to_apicid(int mps_cpu)
static int cpu_id;
-static physid_mask_t es7000_apicid_to_cpu_present(int phys_apicid)
+static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap)
{
- physid_mask_t mask;
-
- mask = physid_mask_of_physid(cpu_id);
+ physid_set_mask_of_physid(cpu_id, retmap);
++cpu_id;
-
- return mask;
}
/* Mapping from cpu number to logical apicid */
@@ -561,10 +557,10 @@ static int es7000_cpu_to_logical_apicid(int cpu)
#endif
}
-static physid_mask_t es7000_ioapic_phys_id_map(physid_mask_t phys_map)
+static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
{
/* For clustered we don't have a good way to do this yet - hack */
- return physids_promote(0xff);
+ physids_promote(0xFFL, retmap);
}
static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 3c8f9e75d03..c0b4468683f 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -60,8 +60,6 @@
#include <asm/irq_remapping.h>
#include <asm/hpet.h>
#include <asm/hw_irq.h>
-#include <asm/uv/uv_hub.h>
-#include <asm/uv/uv_irq.h>
#include <asm/apic.h>
@@ -96,6 +94,11 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
/* # of MP IRQ source entries */
int mp_irq_entries;
+/* Number of legacy interrupts */
+static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY;
+/* GSI interrupts */
+static int nr_irqs_gsi = NR_IRQS_LEGACY;
+
#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
int mp_bus_id_to_type[MAX_MP_BUSSES];
#endif
@@ -135,20 +138,6 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node)
return pin;
}
-/*
- * This is performance-critical, we want to do it O(1)
- *
- * Most irqs are mapped 1:1 with pins.
- */
-struct irq_cfg {
- struct irq_pin_list *irq_2_pin;
- cpumask_var_t domain;
- cpumask_var_t old_domain;
- unsigned move_cleanup_count;
- u8 vector;
- u8 move_in_progress : 1;
-};
-
/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
#ifdef CONFIG_SPARSE_IRQ
static struct irq_cfg irq_cfgx[] = {
@@ -173,6 +162,12 @@ static struct irq_cfg irq_cfgx[NR_IRQS] = {
[15] = { .vector = IRQ15_VECTOR, },
};
+void __init io_apic_disable_legacy(void)
+{
+ nr_legacy_irqs = 0;
+ nr_irqs_gsi = 0;
+}
+
int __init arch_early_irq_init(void)
{
struct irq_cfg *cfg;
@@ -190,7 +185,7 @@ int __init arch_early_irq_init(void)
desc->chip_data = &cfg[i];
zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
- if (i < NR_IRQS_LEGACY)
+ if (i < nr_legacy_irqs)
cpumask_setall(cfg[i].domain);
}
@@ -198,7 +193,7 @@ int __init arch_early_irq_init(void)
}
#ifdef CONFIG_SPARSE_IRQ
-static struct irq_cfg *irq_cfg(unsigned int irq)
+struct irq_cfg *irq_cfg(unsigned int irq)
{
struct irq_cfg *cfg = NULL;
struct irq_desc *desc;
@@ -216,17 +211,14 @@ static struct irq_cfg *get_one_free_irq_cfg(int node)
cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
if (cfg) {
- if (!alloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) {
+ if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) {
kfree(cfg);
cfg = NULL;
- } else if (!alloc_cpumask_var_node(&cfg->old_domain,
+ } else if (!zalloc_cpumask_var_node(&cfg->old_domain,
GFP_ATOMIC, node)) {
free_cpumask_var(cfg->domain);
kfree(cfg);
cfg = NULL;
- } else {
- cpumask_clear(cfg->domain);
- cpumask_clear(cfg->old_domain);
}
}
@@ -353,7 +345,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
/* end for move_irq_desc */
#else
-static struct irq_cfg *irq_cfg(unsigned int irq)
+struct irq_cfg *irq_cfg(unsigned int irq)
{
return irq < nr_irqs ? irq_cfgx + irq : NULL;
}
@@ -547,23 +539,41 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
add_pin_to_irq_node(cfg, node, newapic, newpin);
}
+static void __io_apic_modify_irq(struct irq_pin_list *entry,
+ int mask_and, int mask_or,
+ void (*final)(struct irq_pin_list *entry))
+{
+ unsigned int reg, pin;
+
+ pin = entry->pin;
+ reg = io_apic_read(entry->apic, 0x10 + pin * 2);
+ reg &= mask_and;
+ reg |= mask_or;
+ io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
+ if (final)
+ final(entry);
+}
+
static void io_apic_modify_irq(struct irq_cfg *cfg,
int mask_and, int mask_or,
void (*final)(struct irq_pin_list *entry))
{
- int pin;
struct irq_pin_list *entry;
- for_each_irq_pin(entry, cfg->irq_2_pin) {
- unsigned int reg;
- pin = entry->pin;
- reg = io_apic_read(entry->apic, 0x10 + pin * 2);
- reg &= mask_and;
- reg |= mask_or;
- io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
- if (final)
- final(entry);
- }
+ for_each_irq_pin(entry, cfg->irq_2_pin)
+ __io_apic_modify_irq(entry, mask_and, mask_or, final);
+}
+
+static void __mask_and_edge_IO_APIC_irq(struct irq_pin_list *entry)
+{
+ __io_apic_modify_irq(entry, ~IO_APIC_REDIR_LEVEL_TRIGGER,
+ IO_APIC_REDIR_MASKED, NULL);
+}
+
+static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry)
+{
+ __io_apic_modify_irq(entry, ~IO_APIC_REDIR_MASKED,
+ IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
}
static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
@@ -587,18 +597,6 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
}
-static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
-{
- io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
- IO_APIC_REDIR_MASKED, NULL);
-}
-
-static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
-{
- io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
- IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
-}
-
static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
{
struct irq_cfg *cfg = desc->chip_data;
@@ -867,7 +865,7 @@ static int __init find_isa_irq_apic(int irq, int type)
*/
static int EISA_ELCR(unsigned int irq)
{
- if (irq < NR_IRQS_LEGACY) {
+ if (irq < nr_legacy_irqs) {
unsigned int port = 0x4d0 + (irq >> 3);
return (inb(port) >> (irq & 7)) & 1;
}
@@ -1169,7 +1167,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
int cpu, err;
cpumask_var_t tmp_mask;
- if ((cfg->move_in_progress) || cfg->move_cleanup_count)
+ if (cfg->move_in_progress)
return -EBUSY;
if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
@@ -1229,8 +1227,7 @@ next:
return err;
}
-static int
-assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
{
int err;
unsigned long flags;
@@ -1464,7 +1461,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
}
ioapic_register_intr(irq, desc, trigger);
- if (irq < NR_IRQS_LEGACY)
+ if (irq < nr_legacy_irqs)
disable_8259A_irq(irq);
ioapic_write_entry(apic_id, pin, entry);
@@ -1591,9 +1588,6 @@ __apicdebuginit(void) print_IO_APIC(void)
struct irq_desc *desc;
unsigned int irq;
- if (apic_verbosity == APIC_QUIET)
- return;
-
printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
for (i = 0; i < nr_ioapics; i++)
printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
@@ -1700,9 +1694,6 @@ __apicdebuginit(void) print_APIC_field(int base)
{
int i;
- if (apic_verbosity == APIC_QUIET)
- return;
-
printk(KERN_DEBUG);
for (i = 0; i < 8; i++)
@@ -1716,9 +1707,6 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
unsigned int i, v, ver, maxlvt;
u64 icr;
- if (apic_verbosity == APIC_QUIET)
- return;
-
printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
smp_processor_id(), hard_smp_processor_id());
v = apic_read(APIC_ID);
@@ -1816,13 +1804,19 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
printk("\n");
}
-__apicdebuginit(void) print_all_local_APICs(void)
+__apicdebuginit(void) print_local_APICs(int maxcpu)
{
int cpu;
+ if (!maxcpu)
+ return;
+
preempt_disable();
- for_each_online_cpu(cpu)
+ for_each_online_cpu(cpu) {
+ if (cpu >= maxcpu)
+ break;
smp_call_function_single(cpu, print_local_APIC, NULL, 1);
+ }
preempt_enable();
}
@@ -1831,7 +1825,7 @@ __apicdebuginit(void) print_PIC(void)
unsigned int v;
unsigned long flags;
- if (apic_verbosity == APIC_QUIET)
+ if (!nr_legacy_irqs)
return;
printk(KERN_DEBUG "\nprinting PIC contents\n");
@@ -1858,21 +1852,41 @@ __apicdebuginit(void) print_PIC(void)
printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
}
-__apicdebuginit(int) print_all_ICs(void)
+static int __initdata show_lapic = 1;
+static __init int setup_show_lapic(char *arg)
+{
+ int num = -1;
+
+ if (strcmp(arg, "all") == 0) {
+ show_lapic = CONFIG_NR_CPUS;
+ } else {
+ get_option(&arg, &num);
+ if (num >= 0)
+ show_lapic = num;
+ }
+
+ return 1;
+}
+__setup("show_lapic=", setup_show_lapic);
+
+__apicdebuginit(int) print_ICs(void)
{
+ if (apic_verbosity == APIC_QUIET)
+ return 0;
+
print_PIC();
/* don't print out if apic is not there */
- if (!cpu_has_apic || disable_apic)
+ if (!cpu_has_apic && !apic_from_smp_config())
return 0;
- print_all_local_APICs();
+ print_local_APICs(show_lapic);
print_IO_APIC();
return 0;
}
-fs_initcall(print_all_ICs);
+fs_initcall(print_ICs);
/* Where if anywhere is the i8259 connect in external int mode */
@@ -1894,6 +1908,10 @@ void __init enable_IO_APIC(void)
spin_unlock_irqrestore(&ioapic_lock, flags);
nr_ioapic_registers[apic] = reg_01.bits.entries+1;
}
+
+ if (!nr_legacy_irqs)
+ return;
+
for(apic = 0; apic < nr_ioapics; apic++) {
int pin;
/* See if any of the pins is in ExtINT mode */
@@ -1948,6 +1966,9 @@ void disable_IO_APIC(void)
*/
clear_IO_APIC();
+ if (!nr_legacy_irqs)
+ return;
+
/*
* If the i8259 is routed through an IOAPIC
* Put that IOAPIC in virtual wire mode
@@ -1981,7 +2002,7 @@ void disable_IO_APIC(void)
/*
* Use virtual wire A mode when interrupt remapping is enabled.
*/
- if (cpu_has_apic)
+ if (cpu_has_apic || apic_from_smp_config())
disconnect_bsp_APIC(!intr_remapping_enabled &&
ioapic_i8259.pin != -1);
}
@@ -1994,7 +2015,7 @@ void disable_IO_APIC(void)
* by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
*/
-static void __init setup_ioapic_ids_from_mpc(void)
+void __init setup_ioapic_ids_from_mpc(void)
{
union IO_APIC_reg_00 reg_00;
physid_mask_t phys_id_present_map;
@@ -2003,9 +2024,8 @@ static void __init setup_ioapic_ids_from_mpc(void)
unsigned char old_id;
unsigned long flags;
- if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids())
+ if (acpi_ioapic)
return;
-
/*
* Don't check I/O APIC IDs for xAPIC systems. They have
* no meaning without the serial APIC bus.
@@ -2017,7 +2037,7 @@ static void __init setup_ioapic_ids_from_mpc(void)
* This is broken; anything with a real cpu count has to
* circumvent this idiocy regardless.
*/
- phys_id_present_map = apic->ioapic_phys_id_map(phys_cpu_present_map);
+ apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map);
/*
* Set the IOAPIC ID to the value stored in the MPC table.
@@ -2044,7 +2064,7 @@ static void __init setup_ioapic_ids_from_mpc(void)
* system must have a unique ID or we get lots of nice
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
- if (apic->check_apicid_used(phys_id_present_map,
+ if (apic->check_apicid_used(&phys_id_present_map,
mp_ioapics[apic_id].apicid)) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
apic_id, mp_ioapics[apic_id].apicid);
@@ -2059,7 +2079,7 @@ static void __init setup_ioapic_ids_from_mpc(void)
mp_ioapics[apic_id].apicid = i;
} else {
physid_mask_t tmp;
- tmp = apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid);
+ apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp);
apic_printk(APIC_VERBOSE, "Setting %d in the "
"phys_id_present_map\n",
mp_ioapics[apic_id].apicid);
@@ -2179,7 +2199,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
struct irq_cfg *cfg;
spin_lock_irqsave(&ioapic_lock, flags);
- if (irq < NR_IRQS_LEGACY) {
+ if (irq < nr_legacy_irqs) {
disable_8259A_irq(irq);
if (i8259A_irq_pending(irq))
was_pending = 1;
@@ -2214,20 +2234,16 @@ static int ioapic_retrigger_irq(unsigned int irq)
*/
#ifdef CONFIG_SMP
-static void send_cleanup_vector(struct irq_cfg *cfg)
+void send_cleanup_vector(struct irq_cfg *cfg)
{
cpumask_var_t cleanup_mask;
if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
unsigned int i;
- cfg->move_cleanup_count = 0;
- for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
- cfg->move_cleanup_count++;
for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
} else {
cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
- cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
free_cpumask_var(cleanup_mask);
}
@@ -2258,15 +2274,12 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
}
}
-static int
-assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
-
/*
* Either sets desc->affinity to a valid value, and returns
* ->cpu_mask_to_apicid of that, or returns BAD_APICID and
* leaves desc->affinity untouched.
*/
-static unsigned int
+unsigned int
set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
{
struct irq_cfg *cfg;
@@ -2419,8 +2432,6 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
cfg = irq_cfg(irq);
spin_lock(&desc->lock);
- if (!cfg->move_cleanup_count)
- goto unlock;
if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
goto unlock;
@@ -2438,7 +2449,6 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
goto unlock;
}
__get_cpu_var(vector_irq)[vector] = -1;
- cfg->move_cleanup_count--;
unlock:
spin_unlock(&desc->lock);
}
@@ -2446,21 +2456,33 @@ unlock:
irq_exit();
}
-static void irq_complete_move(struct irq_desc **descp)
+static void __irq_complete_move(struct irq_desc **descp, unsigned vector)
{
struct irq_desc *desc = *descp;
struct irq_cfg *cfg = desc->chip_data;
- unsigned vector, me;
+ unsigned me;
if (likely(!cfg->move_in_progress))
return;
- vector = ~get_irq_regs()->orig_ax;
me = smp_processor_id();
if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
send_cleanup_vector(cfg);
}
+
+static void irq_complete_move(struct irq_desc **descp)
+{
+ __irq_complete_move(descp, ~get_irq_regs()->orig_ax);
+}
+
+void irq_force_complete_move(int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_cfg *cfg = desc->chip_data;
+
+ __irq_complete_move(&desc, cfg->vector);
+}
#else
static inline void irq_complete_move(struct irq_desc **descp) {}
#endif
@@ -2476,6 +2498,59 @@ static void ack_apic_edge(unsigned int irq)
atomic_t irq_mis_count;
+/*
+ * IO-APIC versions below 0x20 don't support EOI register.
+ * For the record, here is the information about various versions:
+ * 0Xh 82489DX
+ * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
+ * 2Xh I/O(x)APIC which is PCI 2.2 Compliant
+ * 30h-FFh Reserved
+ *
+ * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic
+ * version as 0x2. This is an error with documentation and these ICH chips
+ * use io-apic's of version 0x20.
+ *
+ * For IO-APIC's with EOI register, we use that to do an explicit EOI.
+ * Otherwise, we simulate the EOI message manually by changing the trigger
+ * mode to edge and then back to level, with RTE being masked during this.
+*/
+static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+{
+ struct irq_pin_list *entry;
+
+ for_each_irq_pin(entry, cfg->irq_2_pin) {
+ if (mp_ioapics[entry->apic].apicver >= 0x20) {
+ /*
+ * Intr-remapping uses pin number as the virtual vector
+ * in the RTE. Actual vector is programmed in
+ * intr-remapping table entry. Hence for the io-apic
+ * EOI we use the pin number.
+ */
+ if (irq_remapped(irq))
+ io_apic_eoi(entry->apic, entry->pin);
+ else
+ io_apic_eoi(entry->apic, cfg->vector);
+ } else {
+ __mask_and_edge_IO_APIC_irq(entry);
+ __unmask_and_level_IO_APIC_irq(entry);
+ }
+ }
+}
+
+static void eoi_ioapic_irq(struct irq_desc *desc)
+{
+ struct irq_cfg *cfg;
+ unsigned long flags;
+ unsigned int irq;
+
+ irq = desc->irq;
+ cfg = desc->chip_data;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ __eoi_ioapic_irq(irq, cfg);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
static void ack_apic_level(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
@@ -2511,6 +2586,19 @@ static void ack_apic_level(unsigned int irq)
* level-triggered interrupt. We mask the source for the time of the
* operation to prevent an edge-triggered interrupt escaping meanwhile.
* The idea is from Manfred Spraul. --macro
+ *
+ * Also in the case when cpu goes offline, fixup_irqs() will forward
+ * any unhandled interrupt on the offlined cpu to the new cpu
+ * destination that is handling the corresponding interrupt. This
+ * interrupt forwarding is done via IPI's. Hence, in this case also
+ * level-triggered io-apic interrupt will be seen as an edge
+ * interrupt in the IRR. And we can't rely on the cpu's EOI
+ * to be broadcasted to the IO-APIC's which will clear the remoteIRR
+ * corresponding to the level-triggered interrupt. Hence on IO-APIC's
+ * supporting EOI register, we do an explicit EOI to clear the
+ * remote IRR and on IO-APIC's which don't have an EOI register,
+ * we use the above logic (mask+edge followed by unmask+level) from
+ * Manfred Spraul to clear the remote IRR.
*/
cfg = desc->chip_data;
i = cfg->vector;
@@ -2522,6 +2610,19 @@ static void ack_apic_level(unsigned int irq)
*/
ack_APIC_irq();
+ /*
+ * Tail end of clearing remote IRR bit (either by delivering the EOI
+ * message via io-apic EOI register write or simulating it using
+ * mask+edge followed by unnask+level logic) manually when the
+ * level triggered interrupt is seen as the edge triggered interrupt
+ * at the cpu.
+ */
+ if (!(v & (1 << (i & 0x1f)))) {
+ atomic_inc(&irq_mis_count);
+
+ eoi_ioapic_irq(desc);
+ }
+
/* Now we can move and renable the irq */
if (unlikely(do_unmask_irq)) {
/* Only migrate the irq if the ack has been received.
@@ -2555,41 +2656,9 @@ static void ack_apic_level(unsigned int irq)
move_masked_irq(irq);
unmask_IO_APIC_irq_desc(desc);
}
-
- /* Tail end of version 0x11 I/O APIC bug workaround */
- if (!(v & (1 << (i & 0x1f)))) {
- atomic_inc(&irq_mis_count);
- spin_lock(&ioapic_lock);
- __mask_and_edge_IO_APIC_irq(cfg);
- __unmask_and_level_IO_APIC_irq(cfg);
- spin_unlock(&ioapic_lock);
- }
}
#ifdef CONFIG_INTR_REMAP
-static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
-{
- struct irq_pin_list *entry;
-
- for_each_irq_pin(entry, cfg->irq_2_pin)
- io_apic_eoi(entry->apic, entry->pin);
-}
-
-static void
-eoi_ioapic_irq(struct irq_desc *desc)
-{
- struct irq_cfg *cfg;
- unsigned long flags;
- unsigned int irq;
-
- irq = desc->irq;
- cfg = desc->chip_data;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- __eoi_ioapic_irq(irq, cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
static void ir_ack_apic_edge(unsigned int irq)
{
ack_APIC_irq();
@@ -2657,7 +2726,7 @@ static inline void init_IO_APIC_traps(void)
* so default to an old-fashioned 8259
* interrupt if we can..
*/
- if (irq < NR_IRQS_LEGACY)
+ if (irq < nr_legacy_irqs)
make_8259A_irq(irq);
else
/* Strange. Oh, well.. */
@@ -2993,7 +3062,7 @@ out:
* the I/O APIC in all cases now. No actual device should request
* it anyway. --macro
*/
-#define PIC_IRQS (1 << PIC_CASCADE_IR)
+#define PIC_IRQS (1UL << PIC_CASCADE_IR)
void __init setup_IO_APIC(void)
{
@@ -3001,21 +3070,19 @@ void __init setup_IO_APIC(void)
/*
* calling enable_IO_APIC() is moved to setup_local_APIC for BP
*/
-
- io_apic_irqs = ~PIC_IRQS;
+ io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL;
apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
/*
* Set up IO-APIC IRQ routing.
*/
-#ifdef CONFIG_X86_32
- if (!acpi_ioapic)
- setup_ioapic_ids_from_mpc();
-#endif
+ x86_init.mpparse.setup_ioapic_ids();
+
sync_Arb_IDs();
setup_IO_APIC_irqs();
init_IO_APIC_traps();
- check_timer();
+ if (nr_legacy_irqs)
+ check_timer();
}
/*
@@ -3116,7 +3183,6 @@ static int __init ioapic_init_sysfs(void)
device_initcall(ioapic_init_sysfs);
-static int nr_irqs_gsi = NR_IRQS_LEGACY;
/*
* Dynamic irq allocate and deallocation
*/
@@ -3146,6 +3212,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
continue;
desc_new = move_irq_desc(desc_new, node);
+ cfg_new = desc_new->chip_data;
if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
irq = new;
@@ -3697,75 +3764,6 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
}
#endif /* CONFIG_HT_IRQ */
-#ifdef CONFIG_X86_UV
-/*
- * Re-target the irq to the specified CPU and enable the specified MMR located
- * on the specified blade to allow the sending of MSIs to the specified CPU.
- */
-int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
- unsigned long mmr_offset)
-{
- const struct cpumask *eligible_cpu = cpumask_of(cpu);
- struct irq_cfg *cfg;
- int mmr_pnode;
- unsigned long mmr_value;
- struct uv_IO_APIC_route_entry *entry;
- unsigned long flags;
- int err;
-
- BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
-
- cfg = irq_cfg(irq);
-
- err = assign_irq_vector(irq, cfg, eligible_cpu);
- if (err != 0)
- return err;
-
- spin_lock_irqsave(&vector_lock, flags);
- set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
- irq_name);
- spin_unlock_irqrestore(&vector_lock, flags);
-
- mmr_value = 0;
- entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
- entry->vector = cfg->vector;
- entry->delivery_mode = apic->irq_delivery_mode;
- entry->dest_mode = apic->irq_dest_mode;
- entry->polarity = 0;
- entry->trigger = 0;
- entry->mask = 0;
- entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
-
- mmr_pnode = uv_blade_to_pnode(mmr_blade);
- uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
-
- if (cfg->move_in_progress)
- send_cleanup_vector(cfg);
-
- return irq;
-}
-
-/*
- * Disable the specified MMR located on the specified blade so that MSIs are
- * longer allowed to be sent.
- */
-void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
-{
- unsigned long mmr_value;
- struct uv_IO_APIC_route_entry *entry;
- int mmr_pnode;
-
- BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
-
- mmr_value = 0;
- entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
- entry->mask = 1;
-
- mmr_pnode = uv_blade_to_pnode(mmr_blade);
- uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
-}
-#endif /* CONFIG_X86_64 */
-
int __init io_apic_get_redir_entries (int ioapic)
{
union IO_APIC_reg_01 reg_01;
@@ -3856,7 +3854,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,
/*
* IRQs < 16 are already in the irq_2_pin[] map
*/
- if (irq >= NR_IRQS_LEGACY) {
+ if (irq >= nr_legacy_irqs) {
cfg = desc->chip_data;
if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) {
printk(KERN_INFO "can not add pin %d for irq %d\n",
@@ -3933,7 +3931,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
*/
if (physids_empty(apic_id_map))
- apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map);
+ apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map);
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(ioapic, 0);
@@ -3949,10 +3947,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
* Every APIC in a system must have a unique ID or we get lots of nice
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
- if (apic->check_apicid_used(apic_id_map, apic_id)) {
+ if (apic->check_apicid_used(&apic_id_map, apic_id)) {
for (i = 0; i < get_physical_broadcast(); i++) {
- if (!apic->check_apicid_used(apic_id_map, i))
+ if (!apic->check_apicid_used(&apic_id_map, i))
break;
}
@@ -3965,7 +3963,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
apic_id = i;
}
- tmp = apic->apicid_to_cpu_present(apic_id);
+ apic->apicid_to_cpu_present(apic_id, &tmp);
physids_or(apic_id_map, apic_id_map, tmp);
if (reg_00.bits.ID != apic_id) {
@@ -4095,7 +4093,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
for (i = 0; i < nr_ioapics; i++) {
res[i].name = mem;
res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- sprintf(mem, "IOAPIC %u", i);
+ snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
mem += IOAPIC_RESOURCE_NAME_SIZE;
}
@@ -4129,18 +4127,17 @@ void __init ioapic_init_mappings(void)
#ifdef CONFIG_X86_32
fake_ioapic_page:
#endif
- ioapic_phys = (unsigned long)
- alloc_bootmem_pages(PAGE_SIZE);
+ ioapic_phys = (unsigned long)alloc_bootmem_pages(PAGE_SIZE);
ioapic_phys = __pa(ioapic_phys);
}
set_fixmap_nocache(idx, ioapic_phys);
- apic_printk(APIC_VERBOSE,
- "mapped IOAPIC to %08lx (%08lx)\n",
- __fix_to_virt(idx), ioapic_phys);
+ apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n",
+ __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK),
+ ioapic_phys);
idx++;
ioapic_res->start = ioapic_phys;
- ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
+ ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
ioapic_res++;
}
}
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index db7220220d0..7ff61d6a188 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu)
static inline int mce_in_progress(void)
{
-#if defined(CONFIG_X86_NEW_MCE)
+#if defined(CONFIG_X86_MCE)
return atomic_read(&mce_entry) > 0;
#endif
return 0;
@@ -508,14 +508,14 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
/*
* proc handler for /proc/sys/kernel/nmi
*/
-int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
+int proc_nmi_enabled(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
int old_state;
nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
old_state = nmi_watchdog_enabled;
- proc_dointvec(table, write, file, buffer, length, ppos);
+ proc_dointvec(table, write, buffer, length, ppos);
if (!!old_state == !!nmi_watchdog_enabled)
return 0;
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index ca96e68f0d2..07cdbdcd7a9 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -66,7 +66,6 @@ struct mpc_trans {
unsigned short trans_reserved;
};
-/* x86_quirks member */
static int mpc_record;
static struct mpc_trans *translation_table[MAX_MPC_ENTRY];
@@ -130,10 +129,9 @@ void __cpuinit numaq_tsc_disable(void)
}
}
-static int __init numaq_pre_time_init(void)
+static void __init numaq_tsc_init(void)
{
numaq_tsc_disable();
- return 0;
}
static inline int generate_logical_apicid(int quad, int phys_apicid)
@@ -177,6 +175,19 @@ static void mpc_oem_pci_bus(struct mpc_bus *m)
quad_local_to_mp_bus_id[quad][local] = m->busid;
}
+/*
+ * Called from mpparse code.
+ * mode = 0: prescan
+ * mode = 1: one mpc entry scanned
+ */
+static void numaq_mpc_record(unsigned int mode)
+{
+ if (!mode)
+ mpc_record = 0;
+ else
+ mpc_record++;
+}
+
static void __init MP_translation_info(struct mpc_trans *m)
{
printk(KERN_INFO
@@ -206,9 +217,9 @@ static int __init mpf_checksum(unsigned char *mp, int len)
/*
* Read/parse the MPC oem tables
*/
-static void __init
- smp_read_mpc_oem(struct mpc_oemtable *oemtable, unsigned short oemsize)
+static void __init smp_read_mpc_oem(struct mpc_table *mpc)
{
+ struct mpc_oemtable *oemtable = (void *)(long)mpc->oemptr;
int count = sizeof(*oemtable); /* the header size */
unsigned char *oemptr = ((unsigned char *)oemtable) + count;
@@ -250,29 +261,6 @@ static void __init
}
}
-static int __init numaq_setup_ioapic_ids(void)
-{
- /* so can skip it */
- return 1;
-}
-
-static struct x86_quirks numaq_x86_quirks __initdata = {
- .arch_pre_time_init = numaq_pre_time_init,
- .arch_time_init = NULL,
- .arch_pre_intr_init = NULL,
- .arch_memory_setup = NULL,
- .arch_intr_init = NULL,
- .arch_trap_init = NULL,
- .mach_get_smp_config = NULL,
- .mach_find_smp_config = NULL,
- .mpc_record = &mpc_record,
- .mpc_apic_id = mpc_apic_id,
- .mpc_oem_bus_info = mpc_oem_bus_info,
- .mpc_oem_pci_bus = mpc_oem_pci_bus,
- .smp_read_mpc_oem = smp_read_mpc_oem,
- .setup_ioapic_ids = numaq_setup_ioapic_ids,
-};
-
static __init void early_check_numaq(void)
{
/*
@@ -286,8 +274,15 @@ static __init void early_check_numaq(void)
if (smp_found_config)
early_get_smp_config();
- if (found_numaq)
- x86_quirks = &numaq_x86_quirks;
+ if (found_numaq) {
+ x86_init.mpparse.mpc_record = numaq_mpc_record;
+ x86_init.mpparse.setup_ioapic_ids = x86_init_noop;
+ x86_init.mpparse.mpc_apic_id = mpc_apic_id;
+ x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem;
+ x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus;
+ x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info;
+ x86_init.timers.tsc_pre_init = numaq_tsc_init;
+ }
}
int __init get_memcfg_numaq(void)
@@ -339,10 +334,9 @@ static inline const struct cpumask *numaq_target_cpus(void)
return cpu_all_mask;
}
-static inline unsigned long
-numaq_check_apicid_used(physid_mask_t bitmap, int apicid)
+static unsigned long numaq_check_apicid_used(physid_mask_t *map, int apicid)
{
- return physid_isset(apicid, bitmap);
+ return physid_isset(apicid, *map);
}
static inline unsigned long numaq_check_apicid_present(int bit)
@@ -376,10 +370,10 @@ static inline int numaq_multi_timer_check(int apic, int irq)
return apic != 0 && irq == 0;
}
-static inline physid_mask_t numaq_ioapic_phys_id_map(physid_mask_t phys_map)
+static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
{
/* We don't have a good way to do this yet - hack */
- return physids_promote(0xFUL);
+ return physids_promote(0xFUL, retmap);
}
static inline int numaq_cpu_to_logical_apicid(int cpu)
@@ -407,18 +401,18 @@ static inline int numaq_apicid_to_node(int logical_apicid)
return logical_apicid >> 4;
}
-static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid)
+static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap)
{
int node = numaq_apicid_to_node(logical_apicid);
int cpu = __ffs(logical_apicid & 0xf);
- return physid_mask_of_physid(cpu + 4*node);
+ physid_set_mask_of_physid(cpu + 4*node, retmap);
}
/* Where the IO area was mapped on multiquad, always 0 otherwise */
void *xquad_portio;
-static inline int numaq_check_phys_apicid_present(int boot_cpu_physical_apicid)
+static inline int numaq_check_phys_apicid_present(int phys_apicid)
{
return 1;
}
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 0c0182cc947..1a6559f6768 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -108,7 +108,7 @@ struct apic apic_default = {
.apicid_to_node = default_apicid_to_node,
.cpu_to_logical_apicid = default_cpu_to_logical_apicid,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = default_apicid_to_cpu_present,
+ .apicid_to_cpu_present = physid_set_mask_of_physid,
.setup_portio_remap = NULL,
.check_phys_apicid_present = default_check_phys_apicid_present,
.enable_apic_mode = NULL,
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 65edc180fc8..c4cbd3080c1 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -64,16 +64,23 @@ void __init default_setup_apic_routing(void)
apic = &apic_x2apic_phys;
else
apic = &apic_x2apic_cluster;
- printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
}
#endif
if (apic == &apic_flat) {
- if (max_physical_apicid >= 8)
- apic = &apic_physflat;
- printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ if (num_processors > 8)
+ apic = &apic_physflat;
+ break;
+ case X86_VENDOR_AMD:
+ if (max_physical_apicid >= 8)
+ apic = &apic_physflat;
+ }
}
+ printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
+
if (is_vsmp_box()) {
/* need to update phys_pkg_id */
apic->phys_pkg_id = apicid_phys_pkg_id;
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index eafdfbd1ea9..9b419263d90 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -183,7 +183,7 @@ static const struct cpumask *summit_target_cpus(void)
return cpumask_of(0);
}
-static unsigned long summit_check_apicid_used(physid_mask_t bitmap, int apicid)
+static unsigned long summit_check_apicid_used(physid_mask_t *map, int apicid)
{
return 0;
}
@@ -261,18 +261,18 @@ static int summit_cpu_present_to_apicid(int mps_cpu)
return BAD_APICID;
}
-static physid_mask_t summit_ioapic_phys_id_map(physid_mask_t phys_id_map)
+static void summit_ioapic_phys_id_map(physid_mask_t *phys_id_map, physid_mask_t *retmap)
{
/* For clustered we don't have a good way to do this yet - hack */
- return physids_promote(0x0F);
+ physids_promote(0x0FL, retmap);
}
-static physid_mask_t summit_apicid_to_cpu_present(int apicid)
+static void summit_apicid_to_cpu_present(int apicid, physid_mask_t *retmap)
{
- return physid_mask_of_physid(0);
+ physid_set_mask_of_physid(0, retmap);
}
-static int summit_check_phys_apicid_present(int boot_cpu_physical_apicid)
+static int summit_check_phys_apicid_present(int physical_apicid)
{
return 1;
}
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 601159374e8..130c4b93487 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -352,14 +352,14 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) {
alias.v = uv_read_local_mmr(redir_addrs[i].alias);
- if (alias.s.base == 0) {
+ if (alias.s.enable && alias.s.base == 0) {
*size = (1UL << alias.s.m_alias);
redirect.v = uv_read_local_mmr(redir_addrs[i].redirect);
*base = (unsigned long)redirect.s.dest_base << DEST_SHIFT;
return;
}
}
- BUG();
+ *base = *size = 0;
}
enum map_type {map_wb, map_uc};
@@ -389,6 +389,16 @@ static __init void map_gru_high(int max_pnode)
map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
}
+static __init void map_mmr_high(int max_pnode)
+{
+ union uvh_rh_gam_mmr_overlay_config_mmr_u mmr;
+ int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT;
+
+ mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
+ if (mmr.s.enable)
+ map_high("MMR", mmr.s.base, shift, max_pnode, map_uc);
+}
+
static __init void map_mmioh_high(int max_pnode)
{
union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
@@ -399,6 +409,12 @@ static __init void map_mmioh_high(int max_pnode)
map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc);
}
+static __init void map_low_mmrs(void)
+{
+ init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
+ init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
+}
+
static __init void uv_rtc_init(void)
{
long status;
@@ -540,6 +556,8 @@ void __init uv_system_init(void)
unsigned long mmr_base, present, paddr;
unsigned short pnode_mask;
+ map_low_mmrs();
+
m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
m_val = m_n_config.s.m_skt;
n_val = m_n_config.s.n_skt;
@@ -609,12 +627,12 @@ void __init uv_system_init(void)
uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size;
uv_cpu_hub_info(cpu)->m_val = m_val;
- uv_cpu_hub_info(cpu)->n_val = m_val;
+ uv_cpu_hub_info(cpu)->n_val = n_val;
uv_cpu_hub_info(cpu)->numa_blade_id = blade;
uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
uv_cpu_hub_info(cpu)->pnode = pnode;
uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
- uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
+ uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1;
uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
@@ -643,6 +661,7 @@ void __init uv_system_init(void)
}
map_gru_high(max_pnode);
+ map_mmr_high(max_pnode);
map_mmioh_high(max_pnode);
uv_cpu_init();
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 151ace69a5a..b5b6b23bce5 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -204,7 +204,6 @@
#include <linux/module.h>
#include <linux/poll.h>
-#include <linux/smp_lock.h>
#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/timer.h>
@@ -403,6 +402,7 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
static struct apm_user *user_list;
static DEFINE_SPINLOCK(user_list_lock);
+static DEFINE_MUTEX(apm_mutex);
/*
* Set up a segment that references the real mode segment 0x40
@@ -1531,7 +1531,7 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
return -EPERM;
switch (cmd) {
case APM_IOC_STANDBY:
- lock_kernel();
+ mutex_lock(&apm_mutex);
if (as->standbys_read > 0) {
as->standbys_read--;
as->standbys_pending--;
@@ -1540,10 +1540,10 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
queue_event(APM_USER_STANDBY, as);
if (standbys_pending <= 0)
standby();
- unlock_kernel();
+ mutex_unlock(&apm_mutex);
break;
case APM_IOC_SUSPEND:
- lock_kernel();
+ mutex_lock(&apm_mutex);
if (as->suspends_read > 0) {
as->suspends_read--;
as->suspends_pending--;
@@ -1552,13 +1552,14 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
queue_event(APM_USER_SUSPEND, as);
if (suspends_pending <= 0) {
ret = suspend(1);
+ mutex_unlock(&apm_mutex);
} else {
as->suspend_wait = 1;
+ mutex_unlock(&apm_mutex);
wait_event_interruptible(apm_suspend_waitqueue,
as->suspend_wait == 0);
ret = as->suspend_result;
}
- unlock_kernel();
return ret;
default:
return -ENOTTY;
@@ -1608,12 +1609,10 @@ static int do_open(struct inode *inode, struct file *filp)
{
struct apm_user *as;
- lock_kernel();
as = kmalloc(sizeof(*as), GFP_KERNEL);
if (as == NULL) {
printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
sizeof(*as));
- unlock_kernel();
return -ENOMEM;
}
as->magic = APM_BIOS_MAGIC;
@@ -1635,7 +1634,6 @@ static int do_open(struct inode *inode, struct file *filp)
user_list = as;
spin_unlock(&user_list_lock);
filp->private_data = as;
- unlock_kernel();
return 0;
}
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c1f253dac15..1d2cb383410 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -5,6 +5,7 @@
# Don't trace early stages of a secondary CPU boot
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_common.o = -pg
+CFLAGS_REMOVE_perf_event.o = -pg
endif
# Make sure load_percpu_segment has no stackprotector
@@ -13,7 +14,7 @@ CFLAGS_common.o := $(nostackp)
obj-y := intel_cacheinfo.o addon_cpuid_features.o
obj-y += proc.o capflags.o powerflags.o common.o
-obj-y += vmware.o hypervisor.o
+obj-y += vmware.o hypervisor.o sched.o
obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
obj-$(CONFIG_X86_64) += bugs_64.o
@@ -27,7 +28,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
-obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
+obj-$(CONFIG_PERF_EVENTS) += perf_event.o
obj-$(CONFIG_X86_MCE) += mcheck/
obj-$(CONFIG_MTRR) += mtrr/
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 22a47c82f3c..c910a716a71 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -184,7 +184,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
* approved Athlon
*/
WARN_ONCE(1, "WARNING: This combination of AMD"
- "processors is not suitable for SMP.\n");
+ " processors is not suitable for SMP.\n");
if (!test_taint(TAINT_UNSAFE_SMP))
add_taint(TAINT_UNSAFE_SMP);
@@ -333,6 +333,16 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
#endif
}
+int amd_get_nb_id(int cpu)
+{
+ int id = 0;
+#ifdef CONFIG_SMP
+ id = per_cpu(cpu_llc_id, cpu);
+#endif
+ return id;
+}
+EXPORT_SYMBOL_GPL(amd_get_nb_id);
+
static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
{
#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 2055fc2b2e6..9053be5d95c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,7 +13,7 @@
#include <linux/io.h>
#include <asm/stackprotector.h>
-#include <asm/perf_counter.h>
+#include <asm/perf_event.h>
#include <asm/mmu_context.h>
#include <asm/hypervisor.h>
#include <asm/processor.h>
@@ -34,7 +34,6 @@
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/pat.h>
-#include <linux/smp.h>
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/uv/uv.h>
@@ -838,10 +837,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
}
-#ifdef CONFIG_X86_MCE
/* Init Machine Check Exception if available. */
- mcheck_init(c);
-#endif
+ mcheck_cpu_init(c);
select_idle_routine(c);
@@ -870,7 +867,7 @@ void __init identify_boot_cpu(void)
#else
vgetcpu_set_mode();
#endif
- init_hw_perf_counters();
+ init_hw_perf_events();
}
void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
index 6b2a52dd040..dca325c0399 100644
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -30,8 +30,8 @@
#include <asm/apic.h>
#include <asm/desc.h>
-static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
-static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
+static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr);
+static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr);
static DEFINE_PER_CPU(int, cpu_priv_count);
static DEFINE_MUTEX(cpu_debug_lock);
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index ae9b503220c..8b581d3905c 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,7 +33,7 @@
#include <linux/cpufreq.h>
#include <linux/compiler.h>
#include <linux/dmi.h>
-#include <trace/power.h>
+#include <trace/events/power.h>
#include <linux/acpi.h>
#include <linux/io.h>
@@ -60,7 +60,6 @@ enum {
};
#define INTEL_MSR_RANGE (0xffff)
-#define CPUID_6_ECX_APERFMPERF_CAPABILITY (0x1)
struct acpi_cpufreq_data {
struct acpi_processor_performance *acpi_data;
@@ -71,13 +70,7 @@ struct acpi_cpufreq_data {
static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
-struct acpi_msr_data {
- u64 saved_aperf, saved_mperf;
-};
-
-static DEFINE_PER_CPU(struct acpi_msr_data, msr_data);
-
-DEFINE_TRACE(power_mark);
+static DEFINE_PER_CPU(struct aperfmperf, old_perf);
/* acpi_perf_data is a pointer to percpu data. */
static struct acpi_processor_performance *acpi_perf_data;
@@ -244,23 +237,12 @@ static u32 get_cur_val(const struct cpumask *mask)
return cmd.val;
}
-struct perf_pair {
- union {
- struct {
- u32 lo;
- u32 hi;
- } split;
- u64 whole;
- } aperf, mperf;
-};
-
/* Called via smp_call_function_single(), on the target CPU */
static void read_measured_perf_ctrs(void *_cur)
{
- struct perf_pair *cur = _cur;
+ struct aperfmperf *am = _cur;
- rdmsr(MSR_IA32_APERF, cur->aperf.split.lo, cur->aperf.split.hi);
- rdmsr(MSR_IA32_MPERF, cur->mperf.split.lo, cur->mperf.split.hi);
+ get_aperfmperf(am);
}
/*
@@ -279,63 +261,17 @@ static void read_measured_perf_ctrs(void *_cur)
static unsigned int get_measured_perf(struct cpufreq_policy *policy,
unsigned int cpu)
{
- struct perf_pair readin, cur;
- unsigned int perf_percent;
+ struct aperfmperf perf;
+ unsigned long ratio;
unsigned int retval;
- if (smp_call_function_single(cpu, read_measured_perf_ctrs, &readin, 1))
+ if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
return 0;
- cur.aperf.whole = readin.aperf.whole -
- per_cpu(msr_data, cpu).saved_aperf;
- cur.mperf.whole = readin.mperf.whole -
- per_cpu(msr_data, cpu).saved_mperf;
- per_cpu(msr_data, cpu).saved_aperf = readin.aperf.whole;
- per_cpu(msr_data, cpu).saved_mperf = readin.mperf.whole;
-
-#ifdef __i386__
- /*
- * We dont want to do 64 bit divide with 32 bit kernel
- * Get an approximate value. Return failure in case we cannot get
- * an approximate value.
- */
- if (unlikely(cur.aperf.split.hi || cur.mperf.split.hi)) {
- int shift_count;
- u32 h;
+ ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf);
+ per_cpu(old_perf, cpu) = perf;
- h = max_t(u32, cur.aperf.split.hi, cur.mperf.split.hi);
- shift_count = fls(h);
-
- cur.aperf.whole >>= shift_count;
- cur.mperf.whole >>= shift_count;
- }
-
- if (((unsigned long)(-1) / 100) < cur.aperf.split.lo) {
- int shift_count = 7;
- cur.aperf.split.lo >>= shift_count;
- cur.mperf.split.lo >>= shift_count;
- }
-
- if (cur.aperf.split.lo && cur.mperf.split.lo)
- perf_percent = (cur.aperf.split.lo * 100) / cur.mperf.split.lo;
- else
- perf_percent = 0;
-
-#else
- if (unlikely(((unsigned long)(-1) / 100) < cur.aperf.whole)) {
- int shift_count = 7;
- cur.aperf.whole >>= shift_count;
- cur.mperf.whole >>= shift_count;
- }
-
- if (cur.aperf.whole && cur.mperf.whole)
- perf_percent = (cur.aperf.whole * 100) / cur.mperf.whole;
- else
- perf_percent = 0;
-
-#endif
-
- retval = (policy->cpuinfo.max_freq * perf_percent) / 100;
+ retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
return retval;
}
@@ -394,7 +330,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
unsigned int next_perf_state = 0; /* Index into perf table */
unsigned int i;
int result = 0;
- struct power_trace it;
dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
@@ -426,7 +361,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
}
}
- trace_power_mark(&it, POWER_PSTATE, next_perf_state);
+ trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency);
switch (data->cpu_feature) {
case SYSTEM_INTEL_MSR_CAPABLE:
@@ -588,6 +523,27 @@ static const struct dmi_system_id sw_any_bug_dmi_table[] = {
},
{ }
};
+
+static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c)
+{
+ /* Intel Xeon Processor 7100 Series Specification Update
+ * http://www.intel.com/Assets/PDF/specupdate/314554.pdf
+ * AL30: A Machine Check Exception (MCE) Occurring during an
+ * Enhanced Intel SpeedStep Technology Ratio Change May Cause
+ * Both Processor Cores to Lock Up. */
+ if (c->x86_vendor == X86_VENDOR_INTEL) {
+ if ((c->x86 == 15) &&
+ (c->x86_model == 6) &&
+ (c->x86_mask == 8)) {
+ printk(KERN_INFO "acpi-cpufreq: Intel(R) "
+ "Xeon(R) 7100 Errata AL30, processors may "
+ "lock up on frequency changes: disabling "
+ "acpi-cpufreq.\n");
+ return -ENODEV;
+ }
+ }
+ return 0;
+}
#endif
static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
@@ -599,9 +555,20 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
unsigned int result = 0;
struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
struct acpi_processor_performance *perf;
+#ifdef CONFIG_SMP
+ static int blacklisted;
+#endif
dprintk("acpi_cpufreq_cpu_init\n");
+#ifdef CONFIG_SMP
+ if (blacklisted)
+ return blacklisted;
+ blacklisted = acpi_cpufreq_blacklist(c);
+ if (blacklisted)
+ return blacklisted;
+#endif
+
data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL);
if (!data)
return -ENOMEM;
@@ -731,12 +698,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
acpi_processor_notify_smm(THIS_MODULE);
/* Check for APERF/MPERF support in hardware */
- if (c->x86_vendor == X86_VENDOR_INTEL && c->cpuid_level >= 6) {
- unsigned int ecx;
- ecx = cpuid_ecx(6);
- if (ecx & CPUID_6_ECX_APERFMPERF_CAPABILITY)
- acpi_cpufreq_driver.getavg = get_measured_perf;
- }
+ if (cpu_has(c, X86_FEATURE_APERFMPERF))
+ acpi_cpufreq_driver.getavg = get_measured_perf;
dprintk("CPU%u - ACPI performance management activated.\n", cpu);
for (i = 0; i < perf->state_count; i++)
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index ce2ed3e4aad..cabd2fa3fc9 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -813,7 +813,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr));
break;
case 1 ... 15:
- longhaul_version = TYPE_LONGHAUL_V1;
+ longhaul_version = TYPE_LONGHAUL_V2;
if (c->x86_mask < 8) {
cpu_model = CPU_SAMUEL2;
cpuname = "C3 'Samuel 2' [C5B]";
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 2a50ef89100..3f12dabeab5 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -605,9 +605,10 @@ static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst,
return 0;
}
-static void invalidate_entry(struct powernow_k8_data *data, unsigned int entry)
+static void invalidate_entry(struct cpufreq_frequency_table *powernow_table,
+ unsigned int entry)
{
- data->powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
+ powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
}
static void print_basics(struct powernow_k8_data *data)
@@ -854,6 +855,10 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
goto err_out;
}
+ /* fill in data */
+ data->numps = data->acpi_data.state_count;
+ powernow_k8_acpi_pst_values(data, 0);
+
if (cpu_family == CPU_HW_PSTATE)
ret_val = fill_powernow_table_pstate(data, powernow_table);
else
@@ -866,11 +871,8 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
powernow_table[data->acpi_data.state_count].index = 0;
data->powernow_table = powernow_table;
- /* fill in data */
- data->numps = data->acpi_data.state_count;
if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
print_basics(data);
- powernow_k8_acpi_pst_values(data, 0);
/* notify BIOS that we exist */
acpi_processor_notify_smm(THIS_MODULE);
@@ -914,13 +916,13 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
"bad value %d.\n", i, index);
printk(KERN_ERR PFX "Please report to BIOS "
"manufacturer\n");
- invalidate_entry(data, i);
+ invalidate_entry(powernow_table, i);
continue;
}
rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi);
if (!(hi & HW_PSTATE_VALID_MASK)) {
dprintk("invalid pstate %d, ignoring\n", index);
- invalidate_entry(data, i);
+ invalidate_entry(powernow_table, i);
continue;
}
@@ -941,7 +943,6 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
struct cpufreq_frequency_table *powernow_table)
{
int i;
- int cntlofreq = 0;
for (i = 0; i < data->acpi_data.state_count; i++) {
u32 fid;
@@ -970,7 +971,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
/* verify frequency is OK */
if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) {
dprintk("invalid freq %u kHz, ignoring\n", freq);
- invalidate_entry(data, i);
+ invalidate_entry(powernow_table, i);
continue;
}
@@ -978,38 +979,17 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
* BIOSs are using "off" to indicate invalid */
if (vid == VID_OFF) {
dprintk("invalid vid %u, ignoring\n", vid);
- invalidate_entry(data, i);
+ invalidate_entry(powernow_table, i);
continue;
}
- /* verify only 1 entry from the lo frequency table */
- if (fid < HI_FID_TABLE_BOTTOM) {
- if (cntlofreq) {
- /* if both entries are the same,
- * ignore this one ... */
- if ((freq != powernow_table[cntlofreq].frequency) ||
- (index != powernow_table[cntlofreq].index)) {
- printk(KERN_ERR PFX
- "Too many lo freq table "
- "entries\n");
- return 1;
- }
-
- dprintk("double low frequency table entry, "
- "ignoring it.\n");
- invalidate_entry(data, i);
- continue;
- } else
- cntlofreq = i;
- }
-
if (freq != (data->acpi_data.states[i].core_frequency * 1000)) {
printk(KERN_INFO PFX "invalid freq entries "
"%u kHz vs. %u kHz\n", freq,
(unsigned int)
(data->acpi_data.states[i].core_frequency
* 1000));
- invalidate_entry(data, i);
+ invalidate_entry(powernow_table, i);
continue;
}
}
@@ -1042,7 +1022,7 @@ static int get_transition_latency(struct powernow_k8_data *data)
* set it to 1 to avoid problems in the future.
* For all others it's a BIOS bug.
*/
- if (!boot_cpu_data.x86 == 0x11)
+ if (boot_cpu_data.x86 != 0x11)
printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
"latency\n");
max_latency = 1;
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 6911e91fb4f..3ae5a7a3a50 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -232,28 +232,23 @@ static unsigned int speedstep_detect_chipset(void)
return 0;
}
-struct get_freq_data {
- unsigned int speed;
- unsigned int processor;
-};
-
-static void get_freq_data(void *_data)
+static void get_freq_data(void *_speed)
{
- struct get_freq_data *data = _data;
+ unsigned int *speed = _speed;
- data->speed = speedstep_get_frequency(data->processor);
+ *speed = speedstep_get_frequency(speedstep_processor);
}
static unsigned int speedstep_get(unsigned int cpu)
{
- struct get_freq_data data = { .processor = cpu };
+ unsigned int speed;
/* You're supposed to ensure CPU is online. */
- if (smp_call_function_single(cpu, get_freq_data, &data, 1) != 0)
+ if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0)
BUG();
- dprintk("detected %u kHz as current frequency\n", data.speed);
- return data.speed;
+ dprintk("detected %u kHz as current frequency\n", speed);
+ return speed;
}
/**
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 93ba8eeb100..08be922de33 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -34,13 +34,6 @@ detect_hypervisor_vendor(struct cpuinfo_x86 *c)
c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE;
}
-unsigned long get_hypervisor_tsc_freq(void)
-{
- if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
- return vmware_get_tsc_khz();
- return 0;
-}
-
static inline void __cpuinit
hypervisor_set_feature_bits(struct cpuinfo_x86 *c)
{
@@ -55,3 +48,10 @@ void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
detect_hypervisor_vendor(c);
hypervisor_set_feature_bits(c);
}
+
+void __init init_hypervisor_platform(void)
+{
+ init_hypervisor(&boot_cpu_data);
+ if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
+ vmware_platform_setup();
+}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 80a722a071b..40e1835b35e 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -350,6 +350,12 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
}
+ if (c->cpuid_level > 6) {
+ unsigned ecx = cpuid_ecx(6);
+ if (ecx & 0x01)
+ set_cpu_cap(c, X86_FEATURE_APERFMPERF);
+ }
+
if (cpu_has_xmm2)
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
if (cpu_has_ds) {
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index 188a1ca5ad2..4ac6d48fe11 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -1,11 +1,8 @@
-obj-y = mce.o
+obj-y = mce.o mce-severity.o
-obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o
-obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o
obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o
-obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
deleted file mode 100644
index b945d5dbc60..00000000000
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Athlon specific Machine Check Exception Reporting
- * (C) Copyright 2002 Dave Jones <davej@redhat.com>
- */
-#include <linux/interrupt.h>
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-
-#include <asm/processor.h>
-#include <asm/system.h>
-#include <asm/mce.h>
-#include <asm/msr.h>
-
-/* Machine Check Handler For AMD Athlon/Duron: */
-static void k7_machine_check(struct pt_regs *regs, long error_code)
-{
- u32 alow, ahigh, high, low;
- u32 mcgstl, mcgsth;
- int recover = 1;
- int i;
-
- rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
- if (mcgstl & (1<<0)) /* Recoverable ? */
- recover = 0;
-
- printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
- smp_processor_id(), mcgsth, mcgstl);
-
- for (i = 1; i < nr_mce_banks; i++) {
- rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
- if (high & (1<<31)) {
- char misc[20];
- char addr[24];
-
- misc[0] = '\0';
- addr[0] = '\0';
-
- if (high & (1<<29))
- recover |= 1;
- if (high & (1<<25))
- recover |= 2;
- high &= ~(1<<31);
-
- if (high & (1<<27)) {
- rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
- snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
- }
- if (high & (1<<26)) {
- rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
- snprintf(addr, 24, " at %08x%08x", ahigh, alow);
- }
-
- printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
- smp_processor_id(), i, high, low, misc, addr);
-
- /* Clear it: */
- wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
- /* Serialize: */
- wmb();
- add_taint(TAINT_MACHINE_CHECK);
- }
- }
-
- if (recover & 2)
- panic("CPU context corrupt");
- if (recover & 1)
- panic("Unable to continue");
-
- printk(KERN_EMERG "Attempting to continue.\n");
-
- mcgstl &= ~(1<<2);
- wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
-}
-
-
-/* AMD K7 machine check is Intel like: */
-void amd_mcheck_init(struct cpuinfo_x86 *c)
-{
- u32 l, h;
- int i;
-
- if (!cpu_has(c, X86_FEATURE_MCE))
- return;
-
- machine_check_vector = k7_machine_check;
- /* Make sure the vector pointer is visible before we enable MCEs: */
- wmb();
-
- printk(KERN_INFO "Intel machine check architecture supported.\n");
-
- rdmsr(MSR_IA32_MCG_CAP, l, h);
- if (l & (1<<8)) /* Control register present ? */
- wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
- nr_mce_banks = l & 0xff;
-
- /*
- * Clear status for MC index 0 separately, we don't touch CTL,
- * as some K7 Athlons cause spurious MCEs when its enabled:
- */
- if (boot_cpu_data.x86 == 6) {
- wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0);
- i = 1;
- } else
- i = 0;
-
- for (; i < nr_mce_banks; i++) {
- wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
- wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
- }
-
- set_in_cr4(X86_CR4_MCE);
- printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
- smp_processor_id());
-}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index a3a235a53f0..472763d9209 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -18,7 +18,12 @@
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/smp.h>
+#include <linux/notifier.h>
+#include <linux/kdebug.h>
+#include <linux/cpu.h>
+#include <linux/sched.h>
#include <asm/mce.h>
+#include <asm/apic.h>
/* Update fake mce registers on current CPU. */
static void inject_mce(struct mce *m)
@@ -39,44 +44,142 @@ static void inject_mce(struct mce *m)
i->finished = 1;
}
-struct delayed_mce {
- struct timer_list timer;
- struct mce m;
-};
+static void raise_poll(struct mce *m)
+{
+ unsigned long flags;
+ mce_banks_t b;
-/* Inject mce on current CPU */
-static void raise_mce(unsigned long data)
+ memset(&b, 0xff, sizeof(mce_banks_t));
+ local_irq_save(flags);
+ machine_check_poll(0, &b);
+ local_irq_restore(flags);
+ m->finished = 0;
+}
+
+static void raise_exception(struct mce *m, struct pt_regs *pregs)
{
- struct delayed_mce *dm = (struct delayed_mce *)data;
- struct mce *m = &dm->m;
- int cpu = m->extcpu;
+ struct pt_regs regs;
+ unsigned long flags;
- inject_mce(m);
- if (m->status & MCI_STATUS_UC) {
- struct pt_regs regs;
+ if (!pregs) {
memset(&regs, 0, sizeof(struct pt_regs));
regs.ip = m->ip;
regs.cs = m->cs;
+ pregs = &regs;
+ }
+ /* in mcheck exeception handler, irq will be disabled */
+ local_irq_save(flags);
+ do_machine_check(pregs, 0);
+ local_irq_restore(flags);
+ m->finished = 0;
+}
+
+static cpumask_t mce_inject_cpumask;
+
+static int mce_raise_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct die_args *args = (struct die_args *)data;
+ int cpu = smp_processor_id();
+ struct mce *m = &__get_cpu_var(injectm);
+ if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask))
+ return NOTIFY_DONE;
+ cpu_clear(cpu, mce_inject_cpumask);
+ if (m->inject_flags & MCJ_EXCEPTION)
+ raise_exception(m, args->regs);
+ else if (m->status)
+ raise_poll(m);
+ return NOTIFY_STOP;
+}
+
+static struct notifier_block mce_raise_nb = {
+ .notifier_call = mce_raise_notify,
+ .priority = 1000,
+};
+
+/* Inject mce on current CPU */
+static int raise_local(void)
+{
+ struct mce *m = &__get_cpu_var(injectm);
+ int context = MCJ_CTX(m->inject_flags);
+ int ret = 0;
+ int cpu = m->extcpu;
+
+ if (m->inject_flags & MCJ_EXCEPTION) {
printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu);
- do_machine_check(&regs, 0);
+ switch (context) {
+ case MCJ_CTX_IRQ:
+ /*
+ * Could do more to fake interrupts like
+ * calling irq_enter, but the necessary
+ * machinery isn't exported currently.
+ */
+ /*FALL THROUGH*/
+ case MCJ_CTX_PROCESS:
+ raise_exception(m, NULL);
+ break;
+ default:
+ printk(KERN_INFO "Invalid MCE context\n");
+ ret = -EINVAL;
+ }
printk(KERN_INFO "MCE exception done on CPU %d\n", cpu);
- } else {
- mce_banks_t b;
- memset(&b, 0xff, sizeof(mce_banks_t));
+ } else if (m->status) {
printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
- machine_check_poll(0, &b);
+ raise_poll(m);
mce_notify_irq();
- printk(KERN_INFO "Finished machine check poll on CPU %d\n",
- cpu);
- }
- kfree(dm);
+ printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu);
+ } else
+ m->finished = 0;
+
+ return ret;
+}
+
+static void raise_mce(struct mce *m)
+{
+ int context = MCJ_CTX(m->inject_flags);
+
+ inject_mce(m);
+
+ if (context == MCJ_CTX_RANDOM)
+ return;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ if (m->inject_flags & MCJ_NMI_BROADCAST) {
+ unsigned long start;
+ int cpu;
+ get_online_cpus();
+ mce_inject_cpumask = cpu_online_map;
+ cpu_clear(get_cpu(), mce_inject_cpumask);
+ for_each_online_cpu(cpu) {
+ struct mce *mcpu = &per_cpu(injectm, cpu);
+ if (!mcpu->finished ||
+ MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
+ cpu_clear(cpu, mce_inject_cpumask);
+ }
+ if (!cpus_empty(mce_inject_cpumask))
+ apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR);
+ start = jiffies;
+ while (!cpus_empty(mce_inject_cpumask)) {
+ if (!time_before(jiffies, start + 2*HZ)) {
+ printk(KERN_ERR
+ "Timeout waiting for mce inject NMI %lx\n",
+ *cpus_addr(mce_inject_cpumask));
+ break;
+ }
+ cpu_relax();
+ }
+ raise_local();
+ put_cpu();
+ put_online_cpus();
+ } else
+#endif
+ raise_local();
}
/* Error injection interface */
static ssize_t mce_write(struct file *filp, const char __user *ubuf,
size_t usize, loff_t *off)
{
- struct delayed_mce *dm;
struct mce m;
if (!capable(CAP_SYS_ADMIN))
@@ -96,19 +199,12 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf,
if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
return -EINVAL;
- dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL);
- if (!dm)
- return -ENOMEM;
-
/*
* Need to give user space some time to set everything up,
* so do it a jiffie or two later everywhere.
- * Should we use a hrtimer here for better synchronization?
*/
- memcpy(&dm->m, &m, sizeof(struct mce));
- setup_timer(&dm->timer, raise_mce, (unsigned long)dm);
- dm->timer.expires = jiffies + 2;
- add_timer_on(&dm->timer, m.extcpu);
+ schedule_timeout(2);
+ raise_mce(&m);
return usize;
}
@@ -116,6 +212,7 @@ static int inject_init(void)
{
printk(KERN_INFO "Machine check injector initialized\n");
mce_chrdev_ops.write = mce_write;
+ register_die_notifier(&mce_raise_nb);
return 0;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 54dcb8ff12e..32996f9fab6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -1,3 +1,4 @@
+#include <linux/sysdev.h>
#include <asm/mce.h>
enum severity_level {
@@ -10,6 +11,20 @@ enum severity_level {
MCE_PANIC_SEVERITY,
};
+#define ATTR_LEN 16
+
+/* One object for each MCE bank, shared by all CPUs */
+struct mce_bank {
+ u64 ctl; /* subevents to enable */
+ unsigned char init; /* initialise bank? */
+ struct sysdev_attribute attr; /* sysdev attribute */
+ char attrname[ATTR_LEN]; /* attribute name */
+};
+
int mce_severity(struct mce *a, int tolerant, char **msg);
+struct dentry *mce_get_debugfs_dir(void);
extern int mce_ser;
+
+extern struct mce_bank *mce_banks;
+
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index ff0807f9705..8a85dd1b1aa 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -139,6 +139,7 @@ int mce_severity(struct mce *a, int tolerant, char **msg)
}
}
+#ifdef CONFIG_DEBUG_FS
static void *s_start(struct seq_file *f, loff_t *pos)
{
if (*pos >= ARRAY_SIZE(severities))
@@ -197,7 +198,7 @@ static int __init severities_debugfs_init(void)
{
struct dentry *dmce = NULL, *fseverities_coverage = NULL;
- dmce = debugfs_create_dir("mce", NULL);
+ dmce = mce_get_debugfs_dir();
if (dmce == NULL)
goto err_out;
fseverities_coverage = debugfs_create_file("severities-coverage",
@@ -209,10 +210,7 @@ static int __init severities_debugfs_init(void)
return 0;
err_out:
- if (fseverities_coverage)
- debugfs_remove(fseverities_coverage);
- if (dmce)
- debugfs_remove(dmce);
return -ENOMEM;
}
late_initcall(severities_debugfs_init);
+#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 01213048f62..0bcaa387586 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -34,6 +34,7 @@
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/debugfs.h>
#include <asm/processor.h>
#include <asm/hw_irq.h>
@@ -45,21 +46,11 @@
#include "mce-internal.h"
-/* Handle unconfigured int18 (should never happen) */
-static void unexpected_machine_check(struct pt_regs *regs, long error_code)
-{
- printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
- smp_processor_id());
-}
-
-/* Call the installed machine check handler for this CPU setup. */
-void (*machine_check_vector)(struct pt_regs *, long error_code) =
- unexpected_machine_check;
+#define CREATE_TRACE_POINTS
+#include <trace/events/mce.h>
int mce_disabled __read_mostly;
-#ifdef CONFIG_X86_NEW_MCE
-
#define MISC_MCELOG_MINOR 227
#define SPINUNIT 100 /* 100ns */
@@ -77,7 +68,6 @@ DEFINE_PER_CPU(unsigned, mce_exception_count);
*/
static int tolerant __read_mostly = 1;
static int banks __read_mostly;
-static u64 *bank __read_mostly;
static int rip_msr __read_mostly;
static int mce_bootlog __read_mostly = -1;
static int monarch_timeout __read_mostly = -1;
@@ -87,28 +77,43 @@ int mce_cmci_disabled __read_mostly;
int mce_ignore_ce __read_mostly;
int mce_ser __read_mostly;
+struct mce_bank *mce_banks __read_mostly;
+
/* User mode helper program triggered by machine check event */
static unsigned long mce_need_notify;
static char mce_helper[128];
static char *mce_helper_argv[2] = { mce_helper, NULL };
-static unsigned long dont_init_banks;
-
static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;
+/*
+ * CPU/chipset specific EDAC code can register a notifier call here to print
+ * MCE errors in a human-readable form.
+ */
+ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
+EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
+
+static int default_decode_mce(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ pr_emerg("No human readable MCE decoding support on this CPU type.\n");
+ pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");
+
+ return NOTIFY_STOP;
+}
+
+static struct notifier_block mce_dec_nb = {
+ .notifier_call = default_decode_mce,
+ .priority = -1,
+};
/* MCA banks polled by the period polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};
-static inline int skip_bank_init(int i)
-{
- return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
-}
-
static DEFINE_PER_CPU(struct work_struct, mce_work);
/* Do initial initialization of a struct mce */
@@ -147,6 +152,9 @@ void mce_log(struct mce *mce)
{
unsigned next, entry;
+ /* Emit the trace record: */
+ trace_mce_record(mce);
+
mce->finished = 0;
wmb();
for (;;) {
@@ -185,47 +193,58 @@ void mce_log(struct mce *mce)
static void print_mce(struct mce *m)
{
- printk(KERN_EMERG
- "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
+ pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
m->extcpu, m->mcgstatus, m->bank, m->status);
+
if (m->ip) {
- printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
- !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
- m->cs, m->ip);
+ pr_emerg("RIP%s %02x:<%016Lx> ",
+ !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
+ m->cs, m->ip);
+
if (m->cs == __KERNEL_CS)
print_symbol("{%s}", m->ip);
- printk(KERN_CONT "\n");
+ pr_cont("\n");
}
- printk(KERN_EMERG "TSC %llx ", m->tsc);
+
+ pr_emerg("TSC %llx ", m->tsc);
if (m->addr)
- printk(KERN_CONT "ADDR %llx ", m->addr);
+ pr_cont("ADDR %llx ", m->addr);
if (m->misc)
- printk(KERN_CONT "MISC %llx ", m->misc);
- printk(KERN_CONT "\n");
- printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
- m->cpuvendor, m->cpuid, m->time, m->socketid,
- m->apicid);
+ pr_cont("MISC %llx ", m->misc);
+
+ pr_cont("\n");
+ pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
+ m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
+
+ /*
+ * Print out human-readable details about the MCE error,
+ * (if the CPU has an implementation for that)
+ */
+ atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
}
static void print_mce_head(void)
{
- printk(KERN_EMERG "\nHARDWARE ERROR\n");
+ pr_emerg("\nHARDWARE ERROR\n");
}
static void print_mce_tail(void)
{
- printk(KERN_EMERG "This is not a software problem!\n"
- "Run through mcelog --ascii to decode and contact your hardware vendor\n");
+ pr_emerg("This is not a software problem!\n");
}
#define PANIC_TIMEOUT 5 /* 5 seconds */
static atomic_t mce_paniced;
+static int fake_panic;
+static atomic_t mce_fake_paniced;
+
/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
+
preempt_disable();
local_irq_enable();
while (timeout-- > 0)
@@ -239,15 +258,21 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
{
int i;
- /*
- * Make sure only one CPU runs in machine check panic
- */
- if (atomic_add_return(1, &mce_paniced) > 1)
- wait_for_panic();
- barrier();
+ if (!fake_panic) {
+ /*
+ * Make sure only one CPU runs in machine check panic
+ */
+ if (atomic_inc_return(&mce_paniced) > 1)
+ wait_for_panic();
+ barrier();
- bust_spinlocks(1);
- console_verbose();
+ bust_spinlocks(1);
+ console_verbose();
+ } else {
+ /* Don't log too much for fake panic */
+ if (atomic_inc_return(&mce_fake_paniced) > 1)
+ return;
+ }
print_mce_head();
/* First print corrected ones that are still unlogged */
for (i = 0; i < MCE_LOG_LEN; i++) {
@@ -274,9 +299,12 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
print_mce_tail();
if (exp)
printk(KERN_EMERG "Machine check: %s\n", exp);
- if (panic_timeout == 0)
- panic_timeout = mce_panic_timeout;
- panic(msg);
+ if (!fake_panic) {
+ if (panic_timeout == 0)
+ panic_timeout = mce_panic_timeout;
+ panic(msg);
+ } else
+ printk(KERN_EMERG "Fake kernel panic: %s\n", msg);
}
/* Support code for software error injection */
@@ -284,13 +312,14 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
static int msr_to_offset(u32 msr)
{
unsigned bank = __get_cpu_var(injectm.bank);
+
if (msr == rip_msr)
return offsetof(struct mce, ip);
- if (msr == MSR_IA32_MC0_STATUS + bank*4)
+ if (msr == MSR_IA32_MCx_STATUS(bank))
return offsetof(struct mce, status);
- if (msr == MSR_IA32_MC0_ADDR + bank*4)
+ if (msr == MSR_IA32_MCx_ADDR(bank))
return offsetof(struct mce, addr);
- if (msr == MSR_IA32_MC0_MISC + bank*4)
+ if (msr == MSR_IA32_MCx_MISC(bank))
return offsetof(struct mce, misc);
if (msr == MSR_IA32_MCG_STATUS)
return offsetof(struct mce, mcgstatus);
@@ -301,13 +330,25 @@ static int msr_to_offset(u32 msr)
static u64 mce_rdmsrl(u32 msr)
{
u64 v;
+
if (__get_cpu_var(injectm).finished) {
int offset = msr_to_offset(msr);
+
if (offset < 0)
return 0;
return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
}
- rdmsrl(msr, v);
+
+ if (rdmsrl_safe(msr, &v)) {
+ WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
+ /*
+ * Return zero in case the access faulted. This should
+ * not happen normally but can happen if the CPU does
+ * something weird, or if the code is buggy.
+ */
+ v = 0;
+ }
+
return v;
}
@@ -315,6 +356,7 @@ static void mce_wrmsrl(u32 msr, u64 v)
{
if (__get_cpu_var(injectm).finished) {
int offset = msr_to_offset(msr);
+
if (offset >= 0)
*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
return;
@@ -411,7 +453,7 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
m->ip = mce_rdmsrl(rip_msr);
}
-#ifdef CONFIG_X86_LOCAL_APIC
+#ifdef CONFIG_X86_LOCAL_APIC
/*
* Called after interrupts have been reenabled again
* when a MCE happened during an interrupts off region
@@ -495,7 +537,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
for (i = 0; i < banks; i++) {
- if (!bank[i] || !test_bit(i, *b))
+ if (!mce_banks[i].ctl || !test_bit(i, *b))
continue;
m.misc = 0;
@@ -504,7 +546,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
m.tsc = 0;
barrier();
- m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
+ m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
if (!(m.status & MCI_STATUS_VAL))
continue;
@@ -519,9 +561,9 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
continue;
if (m.status & MCI_STATUS_MISCV)
- m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
+ m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
if (m.status & MCI_STATUS_ADDRV)
- m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
+ m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
if (!(flags & MCP_TIMESTAMP))
m.tsc = 0;
@@ -537,7 +579,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
/*
* Clear state for this bank.
*/
- mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+ mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
}
/*
@@ -558,7 +600,7 @@ static int mce_no_way_out(struct mce *m, char **msg)
int i;
for (i = 0; i < banks; i++) {
- m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
+ m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
return 1;
}
@@ -618,7 +660,7 @@ out:
* This way we prevent any potential data corruption in a unrecoverable case
* and also makes sure always all CPU's errors are examined.
*
- * Also this detects the case of an machine check event coming from outer
+ * Also this detects the case of a machine check event coming from outer
* space (not detected by any CPUs) In this case some external agent wants
* us to shut down, so panic too.
*
@@ -671,7 +713,7 @@ static void mce_reign(void)
* No machine check event found. Must be some external
* source or one CPU is hung. Panic.
*/
- if (!m && tolerant < 3)
+ if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
mce_panic("Machine check from unknown source", NULL, NULL);
/*
@@ -705,7 +747,7 @@ static int mce_start(int *no_way_out)
* global_nwo should be updated before mce_callin
*/
smp_wmb();
- order = atomic_add_return(1, &mce_callin);
+ order = atomic_inc_return(&mce_callin);
/*
* Wait for everyone.
@@ -842,7 +884,7 @@ static void mce_clear_state(unsigned long *toclear)
for (i = 0; i < banks; i++) {
if (test_bit(i, toclear))
- mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+ mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
}
}
@@ -895,11 +937,11 @@ void do_machine_check(struct pt_regs *regs, long error_code)
mce_setup(&m);
m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
- no_way_out = mce_no_way_out(&m, &msg);
-
final = &__get_cpu_var(mces_seen);
*final = m;
+ no_way_out = mce_no_way_out(&m, &msg);
+
barrier();
/*
@@ -916,14 +958,14 @@ void do_machine_check(struct pt_regs *regs, long error_code)
order = mce_start(&no_way_out);
for (i = 0; i < banks; i++) {
__clear_bit(i, toclear);
- if (!bank[i])
+ if (!mce_banks[i].ctl)
continue;
m.misc = 0;
m.addr = 0;
m.bank = i;
- m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
+ m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
if ((m.status & MCI_STATUS_VAL) == 0)
continue;
@@ -964,9 +1006,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
kill_it = 1;
if (m.status & MCI_STATUS_MISCV)
- m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
+ m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
if (m.status & MCI_STATUS_ADDRV)
- m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
+ m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
/*
* Action optional error. Queue address for later processing.
@@ -1091,10 +1133,10 @@ void mce_log_therm_throt_event(__u64 status)
*/
static int check_interval = 5 * 60; /* 5 minutes */
-static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
+static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
-static void mcheck_timer(unsigned long data)
+static void mce_start_timer(unsigned long data)
{
struct timer_list *t = &per_cpu(mce_timer, data);
int *n;
@@ -1110,7 +1152,7 @@ static void mcheck_timer(unsigned long data)
* Alert userspace if needed. If we logged an MCE, reduce the
* polling interval, otherwise increase the polling interval.
*/
- n = &__get_cpu_var(next_interval);
+ n = &__get_cpu_var(mce_next_interval);
if (mce_notify_irq())
*n = max(*n/2, HZ/100);
else
@@ -1159,10 +1201,26 @@ int mce_notify_irq(void)
}
EXPORT_SYMBOL_GPL(mce_notify_irq);
+static int __cpuinit __mcheck_cpu_mce_banks_init(void)
+{
+ int i;
+
+ mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
+ if (!mce_banks)
+ return -ENOMEM;
+ for (i = 0; i < banks; i++) {
+ struct mce_bank *b = &mce_banks[i];
+
+ b->ctl = -1ULL;
+ b->init = 1;
+ }
+ return 0;
+}
+
/*
* Initialize Machine Checks for a CPU.
*/
-static int mce_cap_init(void)
+static int __cpuinit __mcheck_cpu_cap_init(void)
{
unsigned b;
u64 cap;
@@ -1170,7 +1228,8 @@ static int mce_cap_init(void)
rdmsrl(MSR_IA32_MCG_CAP, cap);
b = cap & MCG_BANKCNT_MASK;
- printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
+ if (!banks)
+ printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
if (b > MAX_NR_BANKS) {
printk(KERN_WARNING
@@ -1182,11 +1241,11 @@ static int mce_cap_init(void)
/* Don't support asymmetric configurations today */
WARN_ON(banks != 0 && b != banks);
banks = b;
- if (!bank) {
- bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
- if (!bank)
- return -ENOMEM;
- memset(bank, 0xff, banks * sizeof(u64));
+ if (!mce_banks) {
+ int err = __mcheck_cpu_mce_banks_init();
+
+ if (err)
+ return err;
}
/* Use accurate RIP reporting if available. */
@@ -1199,7 +1258,7 @@ static int mce_cap_init(void)
return 0;
}
-static void mce_init(void)
+static void __mcheck_cpu_init_generic(void)
{
mce_banks_t all_banks;
u64 cap;
@@ -1218,15 +1277,17 @@ static void mce_init(void)
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
for (i = 0; i < banks; i++) {
- if (skip_bank_init(i))
+ struct mce_bank *b = &mce_banks[i];
+
+ if (!b->init)
continue;
- wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
- wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+ wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
+ wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
}
}
/* Add per CPU specific workarounds here */
-static int mce_cpu_quirks(struct cpuinfo_x86 *c)
+static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
@@ -1241,7 +1302,7 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c)
* trips off incorrectly with the IOMMU & 3ware
* & Cerberus:
*/
- clear_bit(10, (unsigned long *)&bank[4]);
+ clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
}
if (c->x86 <= 17 && mce_bootlog < 0) {
/*
@@ -1255,7 +1316,7 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c)
* by default.
*/
if (c->x86 == 6 && banks > 0)
- bank[0] = 0;
+ mce_banks[0].ctl = 0;
}
if (c->x86_vendor == X86_VENDOR_INTEL) {
@@ -1268,8 +1329,8 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c)
* valid event later, merely don't write CTL0.
*/
- if (c->x86 == 6 && c->x86_model < 0x1A)
- __set_bit(0, &dont_init_banks);
+ if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
+ mce_banks[0].init = 0;
/*
* All newer Intel systems support MCE broadcasting. Enable
@@ -1294,7 +1355,7 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c)
return 0;
}
-static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
+static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
if (c->x86 != 5)
return;
@@ -1308,7 +1369,7 @@ static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
}
}
-static void mce_cpu_features(struct cpuinfo_x86 *c)
+static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
{
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
@@ -1322,10 +1383,10 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
}
}
-static void mce_init_timer(void)
+static void __mcheck_cpu_init_timer(void)
{
struct timer_list *t = &__get_cpu_var(mce_timer);
- int *n = &__get_cpu_var(next_interval);
+ int *n = &__get_cpu_var(mce_next_interval);
if (mce_ignore_ce)
return;
@@ -1333,36 +1394,48 @@ static void mce_init_timer(void)
*n = check_interval * HZ;
if (!*n)
return;
- setup_timer(t, mcheck_timer, smp_processor_id());
+ setup_timer(t, mce_start_timer, smp_processor_id());
t->expires = round_jiffies(jiffies + *n);
add_timer_on(t, smp_processor_id());
}
+/* Handle unconfigured int18 (should never happen) */
+static void unexpected_machine_check(struct pt_regs *regs, long error_code)
+{
+ printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
+ smp_processor_id());
+}
+
+/* Call the installed machine check handler for this CPU setup. */
+void (*machine_check_vector)(struct pt_regs *, long error_code) =
+ unexpected_machine_check;
+
/*
* Called for each booted CPU to set up machine checks.
* Must be called with preempt off:
*/
-void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
+void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
{
if (mce_disabled)
return;
- mce_ancient_init(c);
+ __mcheck_cpu_ancient_init(c);
if (!mce_available(c))
return;
- if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) {
+ if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
mce_disabled = 1;
return;
}
machine_check_vector = do_machine_check;
- mce_init();
- mce_cpu_features(c);
- mce_init_timer();
+ __mcheck_cpu_init_generic();
+ __mcheck_cpu_init_vendor(c);
+ __mcheck_cpu_init_timer();
INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
+
}
/*
@@ -1551,8 +1624,10 @@ static struct miscdevice mce_log_device = {
*/
static int __init mcheck_enable(char *str)
{
- if (*str == 0)
+ if (*str == 0) {
enable_p5_mce();
+ return 1;
+ }
if (*str == '=')
str++;
if (!strcmp(str, "off"))
@@ -1580,6 +1655,15 @@ static int __init mcheck_enable(char *str)
}
__setup("mce", mcheck_enable);
+int __init mcheck_init(void)
+{
+ atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
+
+ mcheck_intel_therm_init();
+
+ return 0;
+}
+
/*
* Sysfs support
*/
@@ -1588,25 +1672,27 @@ __setup("mce", mcheck_enable);
* Disable machine checks on suspend and shutdown. We can't really handle
* them later.
*/
-static int mce_disable(void)
+static int mce_disable_error_reporting(void)
{
int i;
for (i = 0; i < banks; i++) {
- if (!skip_bank_init(i))
- wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+ struct mce_bank *b = &mce_banks[i];
+
+ if (b->init)
+ wrmsrl(MSR_IA32_MCx_CTL(i), 0);
}
return 0;
}
static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
- return mce_disable();
+ return mce_disable_error_reporting();
}
static int mce_shutdown(struct sys_device *dev)
{
- return mce_disable();
+ return mce_disable_error_reporting();
}
/*
@@ -1616,8 +1702,8 @@ static int mce_shutdown(struct sys_device *dev)
*/
static int mce_resume(struct sys_device *dev)
{
- mce_init();
- mce_cpu_features(&current_cpu_data);
+ __mcheck_cpu_init_generic();
+ __mcheck_cpu_init_vendor(&current_cpu_data);
return 0;
}
@@ -1627,8 +1713,8 @@ static void mce_cpu_restart(void *data)
del_timer_sync(&__get_cpu_var(mce_timer));
if (!mce_available(&current_cpu_data))
return;
- mce_init();
- mce_init_timer();
+ __mcheck_cpu_init_generic();
+ __mcheck_cpu_init_timer();
}
/* Reinit MCEs after user configuration changes */
@@ -1654,7 +1740,7 @@ static void mce_enable_ce(void *all)
cmci_reenable();
cmci_recheck();
if (all)
- mce_init_timer();
+ __mcheck_cpu_init_timer();
}
static struct sysdev_class mce_sysclass = {
@@ -1669,14 +1755,15 @@ DEFINE_PER_CPU(struct sys_device, mce_dev);
__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
-static struct sysdev_attribute *bank_attrs;
+static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
+{
+ return container_of(attr, struct mce_bank, attr);
+}
static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
char *buf)
{
- u64 b = bank[attr - bank_attrs];
-
- return sprintf(buf, "%llx\n", b);
+ return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
}
static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
@@ -1687,7 +1774,7 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
if (strict_strtoull(buf, 0, &new) < 0)
return -EINVAL;
- bank[attr - bank_attrs] = new;
+ attr_to_bank(attr)->ctl = new;
mce_restart();
return size;
@@ -1829,7 +1916,7 @@ static __cpuinit int mce_create_device(unsigned int cpu)
}
for (j = 0; j < banks; j++) {
err = sysdev_create_file(&per_cpu(mce_dev, cpu),
- &bank_attrs[j]);
+ &mce_banks[j].attr);
if (err)
goto error2;
}
@@ -1838,10 +1925,10 @@ static __cpuinit int mce_create_device(unsigned int cpu)
return 0;
error2:
while (--j >= 0)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]);
+ sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
error:
while (--i >= 0)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+ sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);
sysdev_unregister(&per_cpu(mce_dev, cpu));
@@ -1859,29 +1946,32 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
for (i = 0; i < banks; i++)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
+ sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);
sysdev_unregister(&per_cpu(mce_dev, cpu));
cpumask_clear_cpu(cpu, mce_dev_initialized);
}
/* Make sure there are no machine checks on offlined CPUs. */
-static void mce_disable_cpu(void *h)
+static void __cpuinit mce_disable_cpu(void *h)
{
unsigned long action = *(unsigned long *)h;
int i;
if (!mce_available(&current_cpu_data))
return;
+
if (!(action & CPU_TASKS_FROZEN))
cmci_clear();
for (i = 0; i < banks; i++) {
- if (!skip_bank_init(i))
- wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+ struct mce_bank *b = &mce_banks[i];
+
+ if (b->init)
+ wrmsrl(MSR_IA32_MCx_CTL(i), 0);
}
}
-static void mce_reenable_cpu(void *h)
+static void __cpuinit mce_reenable_cpu(void *h)
{
unsigned long action = *(unsigned long *)h;
int i;
@@ -1892,8 +1982,10 @@ static void mce_reenable_cpu(void *h)
if (!(action & CPU_TASKS_FROZEN))
cmci_reenable();
for (i = 0; i < banks; i++) {
- if (!skip_bank_init(i))
- wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
+ struct mce_bank *b = &mce_banks[i];
+
+ if (b->init)
+ wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
}
}
@@ -1925,7 +2017,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
t->expires = round_jiffies(jiffies +
- __get_cpu_var(next_interval));
+ __get_cpu_var(mce_next_interval));
add_timer_on(t, cpu);
smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
break;
@@ -1941,38 +2033,24 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {
.notifier_call = mce_cpu_callback,
};
-static __init int mce_init_banks(void)
+static __init void mce_init_banks(void)
{
int i;
- bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
- GFP_KERNEL);
- if (!bank_attrs)
- return -ENOMEM;
-
for (i = 0; i < banks; i++) {
- struct sysdev_attribute *a = &bank_attrs[i];
+ struct mce_bank *b = &mce_banks[i];
+ struct sysdev_attribute *a = &b->attr;
- a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
- if (!a->attr.name)
- goto nomem;
+ a->attr.name = b->attrname;
+ snprintf(b->attrname, ATTR_LEN, "bank%d", i);
a->attr.mode = 0644;
a->show = show_bank;
a->store = set_bank;
}
- return 0;
-
-nomem:
- while (--i >= 0)
- kfree(bank_attrs[i].attr.name);
- kfree(bank_attrs);
- bank_attrs = NULL;
-
- return -ENOMEM;
}
-static __init int mce_init_device(void)
+static __init int mcheck_init_device(void)
{
int err;
int i = 0;
@@ -1982,9 +2060,7 @@ static __init int mce_init_device(void)
zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
- err = mce_init_banks();
- if (err)
- return err;
+ mce_init_banks();
err = sysdev_class_register(&mce_sysclass);
if (err)
@@ -2002,59 +2078,67 @@ static __init int mce_init_device(void)
return err;
}
-device_initcall(mce_init_device);
-
-#else /* CONFIG_X86_OLD_MCE: */
+device_initcall(mcheck_init_device);
-int nr_mce_banks;
-EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
+/*
+ * Old style boot options parsing. Only for compatibility.
+ */
+static int __init mcheck_disable(char *str)
+{
+ mce_disabled = 1;
+ return 1;
+}
+__setup("nomce", mcheck_disable);
-/* This has to be run for each processor */
-void mcheck_init(struct cpuinfo_x86 *c)
+#ifdef CONFIG_DEBUG_FS
+struct dentry *mce_get_debugfs_dir(void)
{
- if (mce_disabled)
- return;
+ static struct dentry *dmce;
- switch (c->x86_vendor) {
- case X86_VENDOR_AMD:
- amd_mcheck_init(c);
- break;
+ if (!dmce)
+ dmce = debugfs_create_dir("mce", NULL);
- case X86_VENDOR_INTEL:
- if (c->x86 == 5)
- intel_p5_mcheck_init(c);
- if (c->x86 == 6)
- intel_p6_mcheck_init(c);
- if (c->x86 == 15)
- intel_p4_mcheck_init(c);
- break;
+ return dmce;
+}
- case X86_VENDOR_CENTAUR:
- if (c->x86 == 5)
- winchip_mcheck_init(c);
- break;
+static void mce_reset(void)
+{
+ cpu_missing = 0;
+ atomic_set(&mce_fake_paniced, 0);
+ atomic_set(&mce_executing, 0);
+ atomic_set(&mce_callin, 0);
+ atomic_set(&global_nwo, 0);
+}
- default:
- break;
- }
- printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
+static int fake_panic_get(void *data, u64 *val)
+{
+ *val = fake_panic;
+ return 0;
}
-static int __init mcheck_enable(char *str)
+static int fake_panic_set(void *data, u64 val)
{
- mce_p5_enabled = 1;
- return 1;
+ mce_reset();
+ fake_panic = val;
+ return 0;
}
-__setup("mce", mcheck_enable);
-#endif /* CONFIG_X86_OLD_MCE */
+DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
+ fake_panic_set, "%llu\n");
-/*
- * Old style boot options parsing. Only for compatibility.
- */
-static int __init mcheck_disable(char *str)
+static int __init mcheck_debugfs_init(void)
{
- mce_disabled = 1;
- return 1;
+ struct dentry *dmce, *ffake_panic;
+
+ dmce = mce_get_debugfs_dir();
+ if (!dmce)
+ return -ENOMEM;
+ ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
+ &fake_panic_fops);
+ if (!ffake_panic)
+ return -ENOMEM;
+
+ return 0;
}
-__setup("nomce", mcheck_disable);
+late_initcall(mcheck_debugfs_init);
+#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 1fecba404fd..83a3d1f4efc 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -69,7 +69,7 @@ struct threshold_bank {
struct threshold_block *blocks;
cpumask_var_t cpus;
};
-static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
+static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);
#ifdef CONFIG_SMP
static unsigned char shared_bank[NR_BANKS] = {
@@ -489,8 +489,9 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
int i, err = 0;
struct threshold_bank *b = NULL;
char name[32];
+#ifdef CONFIG_SMP
struct cpuinfo_x86 *c = &cpu_data(cpu);
-
+#endif
sprintf(name, "threshold_bank%i", bank);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index e1acec0f7a3..7c785634af2 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -8,6 +8,7 @@
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
+#include <linux/sched.h>
#include <asm/apic.h>
#include <asm/processor.h>
#include <asm/msr.h>
@@ -90,7 +91,7 @@ static void cmci_discover(int banks, int boot)
if (test_bit(i, owned))
continue;
- rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+ rdmsrl(MSR_IA32_MCx_CTL2(i), val);
/* Already owned by someone else? */
if (val & CMCI_EN) {
@@ -101,8 +102,8 @@ static void cmci_discover(int banks, int boot)
}
val |= CMCI_EN | CMCI_THRESHOLD;
- wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
- rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+ wrmsrl(MSR_IA32_MCx_CTL2(i), val);
+ rdmsrl(MSR_IA32_MCx_CTL2(i), val);
/* Did the enable bit stick? -- the bank supports CMCI */
if (val & CMCI_EN) {
@@ -152,9 +153,9 @@ void cmci_clear(void)
if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
continue;
/* Disable CMCI */
- rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+ rdmsrl(MSR_IA32_MCx_CTL2(i), val);
val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
- wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
+ wrmsrl(MSR_IA32_MCx_CTL2(i), val);
__clear_bit(i, __get_cpu_var(mce_banks_owned));
}
spin_unlock_irqrestore(&cmci_discover_lock, flags);
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
deleted file mode 100644
index f5f2d6f71fb..00000000000
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Non Fatal Machine Check Exception Reporting
- *
- * (C) Copyright 2002 Dave Jones. <davej@redhat.com>
- *
- * This file contains routines to check for non-fatal MCEs every 15s
- *
- */
-#include <linux/interrupt.h>
-#include <linux/workqueue.h>
-#include <linux/jiffies.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-
-#include <asm/processor.h>
-#include <asm/system.h>
-#include <asm/mce.h>
-#include <asm/msr.h>
-
-static int firstbank;
-
-#define MCE_RATE (15*HZ) /* timer rate is 15s */
-
-static void mce_checkregs(void *info)
-{
- u32 low, high;
- int i;
-
- for (i = firstbank; i < nr_mce_banks; i++) {
- rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
-
- if (!(high & (1<<31)))
- continue;
-
- printk(KERN_INFO "MCE: The hardware reports a non fatal, "
- "correctable incident occurred on CPU %d.\n",
- smp_processor_id());
-
- printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
-
- /*
- * Scrub the error so we don't pick it up in MCE_RATE
- * seconds time:
- */
- wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
-
- /* Serialize: */
- wmb();
- add_taint(TAINT_MACHINE_CHECK);
- }
-}
-
-static void mce_work_fn(struct work_struct *work);
-static DECLARE_DELAYED_WORK(mce_work, mce_work_fn);
-
-static void mce_work_fn(struct work_struct *work)
-{
- on_each_cpu(mce_checkregs, NULL, 1);
- schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
-}
-
-static int __init init_nonfatal_mce_checker(void)
-{
- struct cpuinfo_x86 *c = &boot_cpu_data;
-
- /* Check for MCE support */
- if (!cpu_has(c, X86_FEATURE_MCE))
- return -ENODEV;
-
- /* Check for PPro style MCA */
- if (!cpu_has(c, X86_FEATURE_MCA))
- return -ENODEV;
-
- /* Some Athlons misbehave when we frob bank 0 */
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
- boot_cpu_data.x86 == 6)
- firstbank = 1;
- else
- firstbank = 0;
-
- /*
- * Check for non-fatal errors every MCE_RATE s
- */
- schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
- printk(KERN_INFO "Machine check exception polling timer started.\n");
-
- return 0;
-}
-module_init(init_nonfatal_mce_checker);
-
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
deleted file mode 100644
index 4482aea9aa2..00000000000
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * P4 specific Machine Check Exception Reporting
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-
-#include <asm/processor.h>
-#include <asm/mce.h>
-#include <asm/msr.h>
-
-/* as supported by the P4/Xeon family */
-struct intel_mce_extended_msrs {
- u32 eax;
- u32 ebx;
- u32 ecx;
- u32 edx;
- u32 esi;
- u32 edi;
- u32 ebp;
- u32 esp;
- u32 eflags;
- u32 eip;
- /* u32 *reserved[]; */
-};
-
-static int mce_num_extended_msrs;
-
-/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */
-static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
-{
- u32 h;
-
- rdmsr(MSR_IA32_MCG_EAX, r->eax, h);
- rdmsr(MSR_IA32_MCG_EBX, r->ebx, h);
- rdmsr(MSR_IA32_MCG_ECX, r->ecx, h);
- rdmsr(MSR_IA32_MCG_EDX, r->edx, h);
- rdmsr(MSR_IA32_MCG_ESI, r->esi, h);
- rdmsr(MSR_IA32_MCG_EDI, r->edi, h);
- rdmsr(MSR_IA32_MCG_EBP, r->ebp, h);
- rdmsr(MSR_IA32_MCG_ESP, r->esp, h);
- rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h);
- rdmsr(MSR_IA32_MCG_EIP, r->eip, h);
-}
-
-static void intel_machine_check(struct pt_regs *regs, long error_code)
-{
- u32 alow, ahigh, high, low;
- u32 mcgstl, mcgsth;
- int recover = 1;
- int i;
-
- rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
- if (mcgstl & (1<<0)) /* Recoverable ? */
- recover = 0;
-
- printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
- smp_processor_id(), mcgsth, mcgstl);
-
- if (mce_num_extended_msrs > 0) {
- struct intel_mce_extended_msrs dbg;
-
- intel_get_extended_msrs(&dbg);
-
- printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n"
- "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n"
- "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
- smp_processor_id(), dbg.eip, dbg.eflags,
- dbg.eax, dbg.ebx, dbg.ecx, dbg.edx,
- dbg.esi, dbg.edi, dbg.ebp, dbg.esp);
- }
-
- for (i = 0; i < nr_mce_banks; i++) {
- rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
- if (high & (1<<31)) {
- char misc[20];
- char addr[24];
-
- misc[0] = addr[0] = '\0';
- if (high & (1<<29))
- recover |= 1;
- if (high & (1<<25))
- recover |= 2;
- high &= ~(1<<31);
- if (high & (1<<27)) {
- rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
- snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
- }
- if (high & (1<<26)) {
- rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
- snprintf(addr, 24, " at %08x%08x", ahigh, alow);
- }
- printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
- smp_processor_id(), i, high, low, misc, addr);
- }
- }
-
- if (recover & 2)
- panic("CPU context corrupt");
- if (recover & 1)
- panic("Unable to continue");
-
- printk(KERN_EMERG "Attempting to continue.\n");
-
- /*
- * Do not clear the MSR_IA32_MCi_STATUS if the error is not
- * recoverable/continuable.This will allow BIOS to look at the MSRs
- * for errors if the OS could not log the error.
- */
- for (i = 0; i < nr_mce_banks; i++) {
- u32 msr;
- msr = MSR_IA32_MC0_STATUS+i*4;
- rdmsr(msr, low, high);
- if (high&(1<<31)) {
- /* Clear it */
- wrmsr(msr, 0UL, 0UL);
- /* Serialize */
- wmb();
- add_taint(TAINT_MACHINE_CHECK);
- }
- }
- mcgstl &= ~(1<<2);
- wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
-}
-
-void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
-{
- u32 l, h;
- int i;
-
- machine_check_vector = intel_machine_check;
- wmb();
-
- printk(KERN_INFO "Intel machine check architecture supported.\n");
- rdmsr(MSR_IA32_MCG_CAP, l, h);
- if (l & (1<<8)) /* Control register present ? */
- wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
- nr_mce_banks = l & 0xff;
-
- for (i = 0; i < nr_mce_banks; i++) {
- wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
- wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
- }
-
- set_in_cr4(X86_CR4_MCE);
- printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
- smp_processor_id());
-
- /* Check for P4/Xeon extended MCE MSRs */
- rdmsr(MSR_IA32_MCG_CAP, l, h);
- if (l & (1<<9)) {/* MCG_EXT_P */
- mce_num_extended_msrs = (l >> 16) & 0xff;
- printk(KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)"
- " available\n",
- smp_processor_id(), mce_num_extended_msrs);
-
-#ifdef CONFIG_X86_MCE_P4THERMAL
- /* Check for P4/Xeon Thermal monitor */
- intel_init_thermal(c);
-#endif
- }
-}
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c
deleted file mode 100644
index 01e4f817818..00000000000
--- a/arch/x86/kernel/cpu/mcheck/p6.c
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * P6 specific Machine Check Exception Reporting
- * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
- */
-#include <linux/interrupt.h>
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-
-#include <asm/processor.h>
-#include <asm/system.h>
-#include <asm/mce.h>
-#include <asm/msr.h>
-
-/* Machine Check Handler For PII/PIII */
-static void intel_machine_check(struct pt_regs *regs, long error_code)
-{
- u32 alow, ahigh, high, low;
- u32 mcgstl, mcgsth;
- int recover = 1;
- int i;
-
- rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
- if (mcgstl & (1<<0)) /* Recoverable ? */
- recover = 0;
-
- printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
- smp_processor_id(), mcgsth, mcgstl);
-
- for (i = 0; i < nr_mce_banks; i++) {
- rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
- if (high & (1<<31)) {
- char misc[20];
- char addr[24];
-
- misc[0] = '\0';
- addr[0] = '\0';
-
- if (high & (1<<29))
- recover |= 1;
- if (high & (1<<25))
- recover |= 2;
- high &= ~(1<<31);
-
- if (high & (1<<27)) {
- rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
- snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
- }
- if (high & (1<<26)) {
- rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
- snprintf(addr, 24, " at %08x%08x", ahigh, alow);
- }
-
- printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
- smp_processor_id(), i, high, low, misc, addr);
- }
- }
-
- if (recover & 2)
- panic("CPU context corrupt");
- if (recover & 1)
- panic("Unable to continue");
-
- printk(KERN_EMERG "Attempting to continue.\n");
- /*
- * Do not clear the MSR_IA32_MCi_STATUS if the error is not
- * recoverable/continuable.This will allow BIOS to look at the MSRs
- * for errors if the OS could not log the error:
- */
- for (i = 0; i < nr_mce_banks; i++) {
- unsigned int msr;
-
- msr = MSR_IA32_MC0_STATUS+i*4;
- rdmsr(msr, low, high);
- if (high & (1<<31)) {
- /* Clear it: */
- wrmsr(msr, 0UL, 0UL);
- /* Serialize: */
- wmb();
- add_taint(TAINT_MACHINE_CHECK);
- }
- }
- mcgstl &= ~(1<<2);
- wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
-}
-
-/* Set up machine check reporting for processors with Intel style MCE: */
-void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
-{
- u32 l, h;
- int i;
-
- /* Check for MCE support */
- if (!cpu_has(c, X86_FEATURE_MCE))
- return;
-
- /* Check for PPro style MCA */
- if (!cpu_has(c, X86_FEATURE_MCA))
- return;
-
- /* Ok machine check is available */
- machine_check_vector = intel_machine_check;
- /* Make sure the vector pointer is visible before we enable MCEs: */
- wmb();
-
- printk(KERN_INFO "Intel machine check architecture supported.\n");
- rdmsr(MSR_IA32_MCG_CAP, l, h);
- if (l & (1<<8)) /* Control register present ? */
- wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
- nr_mce_banks = l & 0xff;
-
- /*
- * Following the example in IA-32 SDM Vol 3:
- * - MC0_CTL should not be written
- * - Status registers on all banks should be cleared on reset
- */
- for (i = 1; i < nr_mce_banks; i++)
- wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
-
- for (i = 0; i < nr_mce_banks; i++)
- wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
-
- set_in_cr4(X86_CR4_MCE);
- printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
- smp_processor_id());
-}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 5957a93e517..4fef985fc22 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -34,20 +34,33 @@
/* How long to wait between reporting thermal events */
#define CHECK_INTERVAL (300 * HZ)
-static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES;
-static DEFINE_PER_CPU(unsigned long, thermal_throttle_count);
-static DEFINE_PER_CPU(bool, thermal_throttle_active);
+/*
+ * Current thermal throttling state:
+ */
+struct thermal_state {
+ bool is_throttled;
+
+ u64 next_check;
+ unsigned long throttle_count;
+ unsigned long last_throttle_count;
+};
+
+static DEFINE_PER_CPU(struct thermal_state, thermal_state);
+
+static atomic_t therm_throt_en = ATOMIC_INIT(0);
-static atomic_t therm_throt_en = ATOMIC_INIT(0);
+static u32 lvtthmr_init __read_mostly;
#ifdef CONFIG_SYSFS
#define define_therm_throt_sysdev_one_ro(_name) \
static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
#define define_therm_throt_sysdev_show_func(name) \
-static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
- struct sysdev_attribute *attr, \
- char *buf) \
+ \
+static ssize_t therm_throt_sysdev_show_##name( \
+ struct sys_device *dev, \
+ struct sysdev_attribute *attr, \
+ char *buf) \
{ \
unsigned int cpu = dev->id; \
ssize_t ret; \
@@ -55,7 +68,7 @@ static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
preempt_disable(); /* CPU hotplug */ \
if (cpu_online(cpu)) \
ret = sprintf(buf, "%lu\n", \
- per_cpu(thermal_throttle_##name, cpu)); \
+ per_cpu(thermal_state, cpu).name); \
else \
ret = 0; \
preempt_enable(); \
@@ -63,11 +76,11 @@ static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
return ret; \
}
-define_therm_throt_sysdev_show_func(count);
-define_therm_throt_sysdev_one_ro(count);
+define_therm_throt_sysdev_show_func(throttle_count);
+define_therm_throt_sysdev_one_ro(throttle_count);
static struct attribute *thermal_throttle_attrs[] = {
- &attr_count.attr,
+ &attr_throttle_count.attr,
NULL
};
@@ -93,33 +106,39 @@ static struct attribute_group thermal_throttle_attr_group = {
* 1 : Event should be logged further, and a message has been
* printed to the syslog.
*/
-static int therm_throt_process(int curr)
+static int therm_throt_process(bool is_throttled)
{
- unsigned int cpu = smp_processor_id();
- __u64 tmp_jiffs = get_jiffies_64();
- bool was_throttled = __get_cpu_var(thermal_throttle_active);
- bool is_throttled = __get_cpu_var(thermal_throttle_active) = curr;
+ struct thermal_state *state;
+ unsigned int this_cpu;
+ bool was_throttled;
+ u64 now;
+
+ this_cpu = smp_processor_id();
+ now = get_jiffies_64();
+ state = &per_cpu(thermal_state, this_cpu);
+
+ was_throttled = state->is_throttled;
+ state->is_throttled = is_throttled;
if (is_throttled)
- __get_cpu_var(thermal_throttle_count)++;
+ state->throttle_count++;
- if (!(was_throttled ^ is_throttled) &&
- time_before64(tmp_jiffs, __get_cpu_var(next_check)))
+ if (time_before64(now, state->next_check) &&
+ state->throttle_count != state->last_throttle_count)
return 0;
- __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL;
+ state->next_check = now + CHECK_INTERVAL;
+ state->last_throttle_count = state->throttle_count;
/* if we just entered the thermal event */
if (is_throttled) {
- printk(KERN_CRIT "CPU%d: Temperature above threshold, "
- "cpu clock throttled (total events = %lu)\n",
- cpu, __get_cpu_var(thermal_throttle_count));
+ printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count);
add_taint(TAINT_MACHINE_CHECK);
return 1;
}
if (was_throttled) {
- printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu);
+ printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu);
return 1;
}
@@ -213,7 +232,7 @@ static void intel_thermal_interrupt(void)
__u64 msr_val;
rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
- if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT))
+ if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0))
mce_log_therm_throt_event(msr_val);
}
@@ -237,6 +256,18 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
ack_APIC_irq();
}
+void __init mcheck_intel_therm_init(void)
+{
+ /*
+ * This function is only called on boot CPU. Save the init thermal
+ * LVT value on BSP and use that value to restore APs' thermal LVT
+ * entry BIOS programmed later
+ */
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_ACPI) &&
+ cpu_has(&boot_cpu_data, X86_FEATURE_ACC))
+ lvtthmr_init = apic_read(APIC_LVTTHMR);
+}
+
void intel_init_thermal(struct cpuinfo_x86 *c)
{
unsigned int cpu = smp_processor_id();
@@ -253,16 +284,26 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
* since it might be delivered via SMI already:
*/
rdmsr(MSR_IA32_MISC_ENABLE, l, h);
- h = apic_read(APIC_LVTTHMR);
+
+ /*
+ * The initial value of thermal LVT entries on all APs always reads
+ * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
+ * sequence to them and LVT registers are reset to 0s except for
+ * the mask bits which are set to 1s when APs receive INIT IPI.
+ * Always restore the value that BIOS has programmed on AP based on
+ * BSP's info we saved since BIOS is always setting the same value
+ * for all threads/cores
+ */
+ apic_write(APIC_LVTTHMR, lvtthmr_init);
+
+ h = lvtthmr_init;
+
if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
printk(KERN_DEBUG
"CPU%d: Thermal monitoring handled by SMI\n", cpu);
return;
}
- if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
- tm2 = 1;
-
/* Check whether a vector already exists */
if (h & APIC_VECTOR_MASK) {
printk(KERN_DEBUG
@@ -271,6 +312,16 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
return;
}
+ /* early Pentium M models use different method for enabling TM2 */
+ if (cpu_has(c, X86_FEATURE_TM2)) {
+ if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) {
+ rdmsr(MSR_THERM2_CTL, l, h);
+ if (l & MSR_THERM2_CTL_TM_SELECT)
+ tm2 = 1;
+ } else if (l & MSR_IA32_MISC_ENABLE_TM2)
+ tm2 = 1;
+ }
+
/* We'll mask the thermal vector in the lapic till we're ready: */
h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
apic_write(APIC_LVTTHMR, h);
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 315738c74aa..73c86db5acb 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -846,7 +846,7 @@ int __init mtrr_cleanup(unsigned address_bits)
sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
range_sums = sum_ranges(range, nr_range);
- printk(KERN_INFO "total RAM coverred: %ldM\n",
+ printk(KERN_INFO "total RAM covered: %ldM\n",
range_sums >> (20 - PAGE_SHIFT));
if (mtrr_chunk_size && mtrr_gran_size) {
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 08b6ea4c62b..3c1b12d461d 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -96,17 +96,24 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
unsigned long long base, size;
char *ptr;
char line[LINE_SIZE];
+ int length;
size_t linelen;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!len)
- return -EINVAL;
memset(line, 0, LINE_SIZE);
- if (len > LINE_SIZE)
- len = LINE_SIZE;
- if (copy_from_user(line, buf, len - 1))
+
+ length = len;
+ length--;
+
+ if (length > LINE_SIZE - 1)
+ length = LINE_SIZE - 1;
+
+ if (length < 0)
+ return -EINVAL;
+
+ if (copy_from_user(line, buf, length))
return -EFAULT;
linelen = strlen(line);
@@ -126,8 +133,8 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
return -EINVAL;
base = simple_strtoull(line + 5, &ptr, 0);
- for (; isspace(*ptr); ++ptr)
- ;
+ while (isspace(*ptr))
+ ptr++;
if (strncmp(ptr, "size=", 5))
return -EINVAL;
@@ -135,14 +142,14 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
size = simple_strtoull(ptr + 5, &ptr, 0);
if ((base & 0xfff) || (size & 0xfff))
return -EINVAL;
- for (; isspace(*ptr); ++ptr)
- ;
+ while (isspace(*ptr))
+ ptr++;
if (strncmp(ptr, "type=", 5))
return -EINVAL;
ptr += 5;
- for (; isspace(*ptr); ++ptr)
- ;
+ while (isspace(*ptr))
+ ptr++;
for (i = 0; i < MTRR_NUM_TYPES; ++i) {
if (strcmp(ptr, mtrr_strings[i]))
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 7af0f88a416..84e83de5457 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -58,6 +58,7 @@ unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
static DEFINE_MUTEX(mtrr_mutex);
u64 size_or_mask, size_and_mask;
+static bool mtrr_aps_delayed_init;
static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
@@ -163,7 +164,10 @@ static void ipi_handler(void *info)
if (data->smp_reg != ~0U) {
mtrr_if->set(data->smp_reg, data->smp_base,
data->smp_size, data->smp_type);
- } else {
+ } else if (mtrr_aps_delayed_init) {
+ /*
+ * Initialize the MTRRs inaddition to the synchronisation.
+ */
mtrr_if->set_all();
}
@@ -265,6 +269,8 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
*/
if (reg != ~0U)
mtrr_if->set(reg, base, size, type);
+ else if (!mtrr_aps_delayed_init)
+ mtrr_if->set_all();
/* Wait for the others */
while (atomic_read(&data.count))
@@ -721,9 +727,7 @@ void __init mtrr_bp_init(void)
void mtrr_ap_init(void)
{
- unsigned long flags;
-
- if (!mtrr_if || !use_intel())
+ if (!use_intel() || mtrr_aps_delayed_init)
return;
/*
* Ideally we should hold mtrr_mutex here to avoid mtrr entries
@@ -738,11 +742,7 @@ void mtrr_ap_init(void)
* 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
* lock to prevent mtrr entry changes
*/
- local_irq_save(flags);
-
- mtrr_if->set_all();
-
- local_irq_restore(flags);
+ set_mtrr(~0U, 0, 0, 0);
}
/**
@@ -753,6 +753,34 @@ void mtrr_save_state(void)
smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1);
}
+void set_mtrr_aps_delayed_init(void)
+{
+ if (!use_intel())
+ return;
+
+ mtrr_aps_delayed_init = true;
+}
+
+/*
+ * MTRR initialization for all AP's
+ */
+void mtrr_aps_init(void)
+{
+ if (!use_intel())
+ return;
+
+ set_mtrr(~0U, 0, 0, 0);
+ mtrr_aps_delayed_init = false;
+}
+
+void mtrr_bp_restore(void)
+{
+ if (!use_intel())
+ return;
+
+ mtrr_if->set_all();
+}
+
static int __init mtrr_init_finialize(void)
{
if (!mtrr_if)
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_event.c
index f9cd0849bd4..c1bbed1021d 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1,5 +1,5 @@
/*
- * Performance counter x86 architecture code
+ * Performance events x86 architecture code
*
* Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
* Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
@@ -11,7 +11,7 @@
* For licencing details see kernel-base/COPYING
*/
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
@@ -27,19 +27,19 @@
#include <asm/stacktrace.h>
#include <asm/nmi.h>
-static u64 perf_counter_mask __read_mostly;
+static u64 perf_event_mask __read_mostly;
-/* The maximal number of PEBS counters: */
-#define MAX_PEBS_COUNTERS 4
+/* The maximal number of PEBS events: */
+#define MAX_PEBS_EVENTS 4
/* The size of a BTS record in bytes: */
#define BTS_RECORD_SIZE 24
/* The size of a per-cpu BTS buffer in bytes: */
-#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 1024)
+#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048)
/* The BTS overflow threshold in bytes from the end of the buffer: */
-#define BTS_OVFL_TH (BTS_RECORD_SIZE * 64)
+#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128)
/*
@@ -65,11 +65,11 @@ struct debug_store {
u64 pebs_index;
u64 pebs_absolute_maximum;
u64 pebs_interrupt_threshold;
- u64 pebs_counter_reset[MAX_PEBS_COUNTERS];
+ u64 pebs_event_reset[MAX_PEBS_EVENTS];
};
-struct cpu_hw_counters {
- struct perf_counter *counters[X86_PMC_IDX_MAX];
+struct cpu_hw_events {
+ struct perf_event *events[X86_PMC_IDX_MAX];
unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
unsigned long interrupts;
@@ -77,6 +77,18 @@ struct cpu_hw_counters {
struct debug_store *ds;
};
+struct event_constraint {
+ unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ int code;
+};
+
+#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) }
+#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 }
+
+#define for_each_event_constraint(e, c) \
+ for ((e) = (c); (e)->idxmsk[0]; (e)++)
+
+
/*
* struct x86_pmu - generic x86 pmu
*/
@@ -86,30 +98,34 @@ struct x86_pmu {
int (*handle_irq)(struct pt_regs *);
void (*disable_all)(void);
void (*enable_all)(void);
- void (*enable)(struct hw_perf_counter *, int);
- void (*disable)(struct hw_perf_counter *, int);
+ void (*enable)(struct hw_perf_event *, int);
+ void (*disable)(struct hw_perf_event *, int);
unsigned eventsel;
unsigned perfctr;
u64 (*event_map)(int);
u64 (*raw_event)(u64);
int max_events;
- int num_counters;
- int num_counters_fixed;
- int counter_bits;
- u64 counter_mask;
+ int num_events;
+ int num_events_fixed;
+ int event_bits;
+ u64 event_mask;
int apic;
u64 max_period;
u64 intel_ctrl;
void (*enable_bts)(u64 config);
void (*disable_bts)(void);
+ int (*get_event_idx)(struct cpu_hw_events *cpuc,
+ struct hw_perf_event *hwc);
};
static struct x86_pmu x86_pmu __read_mostly;
-static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
+static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
.enabled = 1,
};
+static const struct event_constraint *event_constraints;
+
/*
* Not sure about some of these
*/
@@ -124,37 +140,47 @@ static const u64 p6_perfmon_event_map[] =
[PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
};
-static u64 p6_pmu_event_map(int event)
+static u64 p6_pmu_event_map(int hw_event)
{
- return p6_perfmon_event_map[event];
+ return p6_perfmon_event_map[hw_event];
}
/*
- * Counter setting that is specified not to count anything.
+ * Event setting that is specified not to count anything.
* We use this to effectively disable a counter.
*
* L2_RQSTS with 0 MESI unit mask.
*/
-#define P6_NOP_COUNTER 0x0000002EULL
+#define P6_NOP_EVENT 0x0000002EULL
-static u64 p6_pmu_raw_event(u64 event)
+static u64 p6_pmu_raw_event(u64 hw_event)
{
#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
#define P6_EVNTSEL_INV_MASK 0x00800000ULL
-#define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL
+#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
#define P6_EVNTSEL_MASK \
(P6_EVNTSEL_EVENT_MASK | \
P6_EVNTSEL_UNIT_MASK | \
P6_EVNTSEL_EDGE_MASK | \
P6_EVNTSEL_INV_MASK | \
- P6_EVNTSEL_COUNTER_MASK)
+ P6_EVNTSEL_REG_MASK)
- return event & P6_EVNTSEL_MASK;
+ return hw_event & P6_EVNTSEL_MASK;
}
+static const struct event_constraint intel_p6_event_constraints[] =
+{
+ EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
+ EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
+ EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
+ EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
+ EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
+ EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
+ EVENT_CONSTRAINT_END
+};
/*
* Intel PerfMon v3. Used on Core2 and later.
@@ -170,16 +196,45 @@ static const u64 intel_perfmon_event_map[] =
[PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
};
-static u64 intel_pmu_event_map(int event)
+static const struct event_constraint intel_core_event_constraints[] =
+{
+ EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
+ EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
+ EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
+ EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
+ EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
+ EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
+ EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
+ EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
+ EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
+ EVENT_CONSTRAINT_END
+};
+
+static const struct event_constraint intel_nehalem_event_constraints[] =
+{
+ EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
+ EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
+ EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
+ EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
+ EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
+ EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */
+ EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
+ EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */
+ EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */
+ EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */
+ EVENT_CONSTRAINT_END
+};
+
+static u64 intel_pmu_event_map(int hw_event)
{
- return intel_perfmon_event_map[event];
+ return intel_perfmon_event_map[hw_event];
}
/*
- * Generalized hw caching related event table, filled
+ * Generalized hw caching related hw_event table, filled
* in on a per model basis. A value of 0 means
- * 'not supported', -1 means 'event makes no sense on
- * this CPU', any other value means the raw event
+ * 'not supported', -1 means 'hw_event makes no sense on
+ * this CPU', any other value means the raw hw_event
* ID.
*/
@@ -190,7 +245,7 @@ static u64 __read_mostly hw_cache_event_ids
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX];
-static const u64 nehalem_hw_cache_event_ids
+static __initconst u64 nehalem_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -281,7 +336,7 @@ static const u64 nehalem_hw_cache_event_ids
},
};
-static const u64 core2_hw_cache_event_ids
+static __initconst u64 core2_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -372,7 +427,7 @@ static const u64 core2_hw_cache_event_ids
},
};
-static const u64 atom_hw_cache_event_ids
+static __initconst u64 atom_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -463,25 +518,25 @@ static const u64 atom_hw_cache_event_ids
},
};
-static u64 intel_pmu_raw_event(u64 event)
+static u64 intel_pmu_raw_event(u64 hw_event)
{
#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
-#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL
+#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
#define CORE_EVNTSEL_MASK \
(CORE_EVNTSEL_EVENT_MASK | \
CORE_EVNTSEL_UNIT_MASK | \
CORE_EVNTSEL_EDGE_MASK | \
CORE_EVNTSEL_INV_MASK | \
- CORE_EVNTSEL_COUNTER_MASK)
+ CORE_EVNTSEL_REG_MASK)
- return event & CORE_EVNTSEL_MASK;
+ return hw_event & CORE_EVNTSEL_MASK;
}
-static const u64 amd_hw_cache_event_ids
+static __initconst u64 amd_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -585,39 +640,39 @@ static const u64 amd_perfmon_event_map[] =
[PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
};
-static u64 amd_pmu_event_map(int event)
+static u64 amd_pmu_event_map(int hw_event)
{
- return amd_perfmon_event_map[event];
+ return amd_perfmon_event_map[hw_event];
}
-static u64 amd_pmu_raw_event(u64 event)
+static u64 amd_pmu_raw_event(u64 hw_event)
{
#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
#define K7_EVNTSEL_INV_MASK 0x000800000ULL
-#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL
+#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL
#define K7_EVNTSEL_MASK \
(K7_EVNTSEL_EVENT_MASK | \
K7_EVNTSEL_UNIT_MASK | \
K7_EVNTSEL_EDGE_MASK | \
K7_EVNTSEL_INV_MASK | \
- K7_EVNTSEL_COUNTER_MASK)
+ K7_EVNTSEL_REG_MASK)
- return event & K7_EVNTSEL_MASK;
+ return hw_event & K7_EVNTSEL_MASK;
}
/*
- * Propagate counter elapsed time into the generic counter.
- * Can only be executed on the CPU where the counter is active.
+ * Propagate event elapsed time into the generic event.
+ * Can only be executed on the CPU where the event is active.
* Returns the delta events processed.
*/
static u64
-x86_perf_counter_update(struct perf_counter *counter,
- struct hw_perf_counter *hwc, int idx)
+x86_perf_event_update(struct perf_event *event,
+ struct hw_perf_event *hwc, int idx)
{
- int shift = 64 - x86_pmu.counter_bits;
+ int shift = 64 - x86_pmu.event_bits;
u64 prev_raw_count, new_raw_count;
s64 delta;
@@ -625,15 +680,15 @@ x86_perf_counter_update(struct perf_counter *counter,
return 0;
/*
- * Careful: an NMI might modify the previous counter value.
+ * Careful: an NMI might modify the previous event value.
*
* Our tactic to handle this is to first atomically read and
* exchange a new raw count - then add that new-prev delta
- * count to the generic counter atomically:
+ * count to the generic event atomically:
*/
again:
prev_raw_count = atomic64_read(&hwc->prev_count);
- rdmsrl(hwc->counter_base + idx, new_raw_count);
+ rdmsrl(hwc->event_base + idx, new_raw_count);
if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
new_raw_count) != prev_raw_count)
@@ -642,7 +697,7 @@ again:
/*
* Now we have the new raw value and have updated the prev
* timestamp already. We can now calculate the elapsed delta
- * (counter-)time and add that to the generic counter.
+ * (event-)time and add that to the generic event.
*
* Careful, not all hw sign-extends above the physical width
* of the count.
@@ -650,13 +705,13 @@ again:
delta = (new_raw_count << shift) - (prev_raw_count << shift);
delta >>= shift;
- atomic64_add(delta, &counter->count);
+ atomic64_add(delta, &event->count);
atomic64_sub(delta, &hwc->period_left);
return new_raw_count;
}
-static atomic_t active_counters;
+static atomic_t active_events;
static DEFINE_MUTEX(pmc_reserve_mutex);
static bool reserve_pmc_hardware(void)
@@ -667,12 +722,12 @@ static bool reserve_pmc_hardware(void)
if (nmi_watchdog == NMI_LOCAL_APIC)
disable_lapic_nmi_watchdog();
- for (i = 0; i < x86_pmu.num_counters; i++) {
+ for (i = 0; i < x86_pmu.num_events; i++) {
if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
goto perfctr_fail;
}
- for (i = 0; i < x86_pmu.num_counters; i++) {
+ for (i = 0; i < x86_pmu.num_events; i++) {
if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
goto eventsel_fail;
}
@@ -685,7 +740,7 @@ eventsel_fail:
for (i--; i >= 0; i--)
release_evntsel_nmi(x86_pmu.eventsel + i);
- i = x86_pmu.num_counters;
+ i = x86_pmu.num_events;
perfctr_fail:
for (i--; i >= 0; i--)
@@ -703,7 +758,7 @@ static void release_pmc_hardware(void)
#ifdef CONFIG_X86_LOCAL_APIC
int i;
- for (i = 0; i < x86_pmu.num_counters; i++) {
+ for (i = 0; i < x86_pmu.num_events; i++) {
release_perfctr_nmi(x86_pmu.perfctr + i);
release_evntsel_nmi(x86_pmu.eventsel + i);
}
@@ -720,7 +775,7 @@ static inline bool bts_available(void)
static inline void init_debug_store_on_cpu(int cpu)
{
- struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds;
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
if (!ds)
return;
@@ -732,7 +787,7 @@ static inline void init_debug_store_on_cpu(int cpu)
static inline void fini_debug_store_on_cpu(int cpu)
{
- if (!per_cpu(cpu_hw_counters, cpu).ds)
+ if (!per_cpu(cpu_hw_events, cpu).ds)
return;
wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
@@ -751,12 +806,12 @@ static void release_bts_hardware(void)
fini_debug_store_on_cpu(cpu);
for_each_possible_cpu(cpu) {
- struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds;
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
if (!ds)
continue;
- per_cpu(cpu_hw_counters, cpu).ds = NULL;
+ per_cpu(cpu_hw_events, cpu).ds = NULL;
kfree((void *)(unsigned long)ds->bts_buffer_base);
kfree(ds);
@@ -796,7 +851,7 @@ static int reserve_bts_hardware(void)
ds->bts_interrupt_threshold =
ds->bts_absolute_maximum - BTS_OVFL_TH;
- per_cpu(cpu_hw_counters, cpu).ds = ds;
+ per_cpu(cpu_hw_events, cpu).ds = ds;
err = 0;
}
@@ -812,9 +867,9 @@ static int reserve_bts_hardware(void)
return err;
}
-static void hw_perf_counter_destroy(struct perf_counter *counter)
+static void hw_perf_event_destroy(struct perf_event *event)
{
- if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
+ if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
release_pmc_hardware();
release_bts_hardware();
mutex_unlock(&pmc_reserve_mutex);
@@ -827,7 +882,7 @@ static inline int x86_pmu_initialized(void)
}
static inline int
-set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
+set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
{
unsigned int cache_type, cache_op, cache_result;
u64 config, val;
@@ -880,7 +935,7 @@ static void intel_pmu_enable_bts(u64 config)
static void intel_pmu_disable_bts(void)
{
- struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
unsigned long debugctlmsr;
if (!cpuc->ds)
@@ -898,10 +953,10 @@ static void intel_pmu_disable_bts(void)
/*
* Setup the hardware configuration for a given attr_type
*/
-static int __hw_perf_counter_init(struct perf_counter *counter)
+static int __hw_perf_event_init(struct perf_event *event)
{
- struct perf_counter_attr *attr = &counter->attr;
- struct hw_perf_counter *hwc = &counter->hw;
+ struct perf_event_attr *attr = &event->attr;
+ struct hw_perf_event *hwc = &event->hw;
u64 config;
int err;
@@ -909,27 +964,31 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
return -ENODEV;
err = 0;
- if (!atomic_inc_not_zero(&active_counters)) {
+ if (!atomic_inc_not_zero(&active_events)) {
mutex_lock(&pmc_reserve_mutex);
- if (atomic_read(&active_counters) == 0) {
+ if (atomic_read(&active_events) == 0) {
if (!reserve_pmc_hardware())
err = -EBUSY;
else
err = reserve_bts_hardware();
}
if (!err)
- atomic_inc(&active_counters);
+ atomic_inc(&active_events);
mutex_unlock(&pmc_reserve_mutex);
}
if (err)
return err;
+ event->destroy = hw_perf_event_destroy;
+
/*
* Generate PMC IRQs:
* (keep 'enabled' bit clear for now)
*/
hwc->config = ARCH_PERFMON_EVENTSEL_INT;
+ hwc->idx = -1;
+
/*
* Count user and OS events unless requested not to.
*/
@@ -946,17 +1005,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
/*
* If we have a PMU initialized but no APIC
* interrupts, we cannot sample hardware
- * counters (user-space has to fall back and
- * sample via a hrtimer based software counter):
+ * events (user-space has to fall back and
+ * sample via a hrtimer based software event):
*/
if (!x86_pmu.apic)
return -EOPNOTSUPP;
}
- counter->destroy = hw_perf_counter_destroy;
-
/*
- * Raw event type provide the config in the event structure
+ * Raw hw_event type provide the config in the hw_event structure
*/
if (attr->type == PERF_TYPE_RAW) {
hwc->config |= x86_pmu.raw_event(attr->config);
@@ -1001,7 +1058,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
static void p6_pmu_disable_all(void)
{
- struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
u64 val;
if (!cpuc->enabled)
@@ -1018,7 +1075,7 @@ static void p6_pmu_disable_all(void)
static void intel_pmu_disable_all(void)
{
- struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
if (!cpuc->enabled)
return;
@@ -1034,7 +1091,7 @@ static void intel_pmu_disable_all(void)
static void amd_pmu_disable_all(void)
{
- struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
int idx;
if (!cpuc->enabled)
@@ -1043,12 +1100,12 @@ static void amd_pmu_disable_all(void)
cpuc->enabled = 0;
/*
* ensure we write the disable before we start disabling the
- * counters proper, so that amd_pmu_enable_counter() does the
+ * events proper, so that amd_pmu_enable_event() does the
* right thing.
*/
barrier();
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ for (idx = 0; idx < x86_pmu.num_events; idx++) {
u64 val;
if (!test_bit(idx, cpuc->active_mask))
@@ -1070,7 +1127,7 @@ void hw_perf_disable(void)
static void p6_pmu_enable_all(void)
{
- struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
unsigned long val;
if (cpuc->enabled)
@@ -1087,7 +1144,7 @@ static void p6_pmu_enable_all(void)
static void intel_pmu_enable_all(void)
{
- struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
if (cpuc->enabled)
return;
@@ -1098,19 +1155,19 @@ static void intel_pmu_enable_all(void)
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
- struct perf_counter *counter =
- cpuc->counters[X86_PMC_IDX_FIXED_BTS];
+ struct perf_event *event =
+ cpuc->events[X86_PMC_IDX_FIXED_BTS];
- if (WARN_ON_ONCE(!counter))
+ if (WARN_ON_ONCE(!event))
return;
- intel_pmu_enable_bts(counter->hw.config);
+ intel_pmu_enable_bts(event->hw.config);
}
}
static void amd_pmu_enable_all(void)
{
- struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
int idx;
if (cpuc->enabled)
@@ -1119,14 +1176,14 @@ static void amd_pmu_enable_all(void)
cpuc->enabled = 1;
barrier();
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- struct perf_counter *counter = cpuc->counters[idx];
+ for (idx = 0; idx < x86_pmu.num_events; idx++) {
+ struct perf_event *event = cpuc->events[idx];
u64 val;
if (!test_bit(idx, cpuc->active_mask))
continue;
- val = counter->hw.config;
+ val = event->hw.config;
val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
}
@@ -1153,19 +1210,19 @@ static inline void intel_pmu_ack_status(u64 ack)
wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
}
-static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx)
{
(void)checking_wrmsrl(hwc->config_base + idx,
hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
}
-static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx)
{
(void)checking_wrmsrl(hwc->config_base + idx, hwc->config);
}
static inline void
-intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
+intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx)
{
int idx = __idx - X86_PMC_IDX_FIXED;
u64 ctrl_val, mask;
@@ -1178,10 +1235,10 @@ intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
}
static inline void
-p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+p6_pmu_disable_event(struct hw_perf_event *hwc, int idx)
{
- struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
- u64 val = P6_NOP_COUNTER;
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ u64 val = P6_NOP_EVENT;
if (cpuc->enabled)
val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
@@ -1190,7 +1247,7 @@ p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
}
static inline void
-intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+intel_pmu_disable_event(struct hw_perf_event *hwc, int idx)
{
if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
intel_pmu_disable_bts();
@@ -1202,24 +1259,24 @@ intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
return;
}
- x86_pmu_disable_counter(hwc, idx);
+ x86_pmu_disable_event(hwc, idx);
}
static inline void
-amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+amd_pmu_disable_event(struct hw_perf_event *hwc, int idx)
{
- x86_pmu_disable_counter(hwc, idx);
+ x86_pmu_disable_event(hwc, idx);
}
-static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
+static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
/*
* Set the next IRQ period, based on the hwc->period_left value.
- * To be called with the counter disabled in hw:
+ * To be called with the event disabled in hw:
*/
static int
-x86_perf_counter_set_period(struct perf_counter *counter,
- struct hw_perf_counter *hwc, int idx)
+x86_perf_event_set_period(struct perf_event *event,
+ struct hw_perf_event *hwc, int idx)
{
s64 left = atomic64_read(&hwc->period_left);
s64 period = hwc->sample_period;
@@ -1245,7 +1302,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,
ret = 1;
}
/*
- * Quirk: certain CPUs dont like it if just 1 event is left:
+ * Quirk: certain CPUs dont like it if just 1 hw_event is left:
*/
if (unlikely(left < 2))
left = 2;
@@ -1253,24 +1310,24 @@ x86_perf_counter_set_period(struct perf_counter *counter,
if (left > x86_pmu.max_period)
left = x86_pmu.max_period;
- per_cpu(prev_left[idx], smp_processor_id()) = left;
+ per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
/*
- * The hw counter starts counting from this counter offset,
+ * The hw event starts counting from this event offset,
* mark it to be able to extra future deltas:
*/
atomic64_set(&hwc->prev_count, (u64)-left);
- err = checking_wrmsrl(hwc->counter_base + idx,
- (u64)(-left) & x86_pmu.counter_mask);
+ err = checking_wrmsrl(hwc->event_base + idx,
+ (u64)(-left) & x86_pmu.event_mask);
- perf_counter_update_userpage(counter);
+ perf_event_update_userpage(event);
return ret;
}
static inline void
-intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
+intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx)
{
int idx = __idx - X86_PMC_IDX_FIXED;
u64 ctrl_val, bits, mask;
@@ -1295,9 +1352,9 @@ intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
err = checking_wrmsrl(hwc->config_base, ctrl_val);
}
-static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx)
{
- struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
u64 val;
val = hwc->config;
@@ -1308,10 +1365,10 @@ static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
}
-static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx)
{
if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
- if (!__get_cpu_var(cpu_hw_counters).enabled)
+ if (!__get_cpu_var(cpu_hw_events).enabled)
return;
intel_pmu_enable_bts(hwc->config);
@@ -1323,134 +1380,189 @@ static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
return;
}
- x86_pmu_enable_counter(hwc, idx);
+ x86_pmu_enable_event(hwc, idx);
}
-static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)
{
- struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
if (cpuc->enabled)
- x86_pmu_enable_counter(hwc, idx);
+ x86_pmu_enable_event(hwc, idx);
}
-static int
-fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
+static int fixed_mode_idx(struct hw_perf_event *hwc)
{
- unsigned int event;
+ unsigned int hw_event;
- event = hwc->config & ARCH_PERFMON_EVENT_MASK;
+ hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK;
- if (unlikely((event ==
+ if (unlikely((hw_event ==
x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
(hwc->sample_period == 1)))
return X86_PMC_IDX_FIXED_BTS;
- if (!x86_pmu.num_counters_fixed)
+ if (!x86_pmu.num_events_fixed)
return -1;
- if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
+ /*
+ * fixed counters do not take all possible filters
+ */
+ if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK)
+ return -1;
+
+ if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
return X86_PMC_IDX_FIXED_INSTRUCTIONS;
- if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
+ if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
return X86_PMC_IDX_FIXED_CPU_CYCLES;
- if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
+ if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
return X86_PMC_IDX_FIXED_BUS_CYCLES;
return -1;
}
/*
- * Find a PMC slot for the freshly enabled / scheduled in counter:
+ * generic counter allocator: get next free counter
*/
-static int x86_pmu_enable(struct perf_counter *counter)
+static int
+gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
{
- struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
- struct hw_perf_counter *hwc = &counter->hw;
int idx;
- idx = fixed_mode_idx(counter, hwc);
+ idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events);
+ return idx == x86_pmu.num_events ? -1 : idx;
+}
+
+/*
+ * intel-specific counter allocator: check event constraints
+ */
+static int
+intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
+{
+ const struct event_constraint *event_constraint;
+ int i, code;
+
+ if (!event_constraints)
+ goto skip;
+
+ code = hwc->config & CORE_EVNTSEL_EVENT_MASK;
+
+ for_each_event_constraint(event_constraint, event_constraints) {
+ if (code == event_constraint->code) {
+ for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) {
+ if (!test_and_set_bit(i, cpuc->used_mask))
+ return i;
+ }
+ return -1;
+ }
+ }
+skip:
+ return gen_get_event_idx(cpuc, hwc);
+}
+
+static int
+x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
+{
+ int idx;
+
+ idx = fixed_mode_idx(hwc);
if (idx == X86_PMC_IDX_FIXED_BTS) {
/* BTS is already occupied. */
if (test_and_set_bit(idx, cpuc->used_mask))
return -EAGAIN;
hwc->config_base = 0;
- hwc->counter_base = 0;
+ hwc->event_base = 0;
hwc->idx = idx;
} else if (idx >= 0) {
/*
- * Try to get the fixed counter, if that is already taken
- * then try to get a generic counter:
+ * Try to get the fixed event, if that is already taken
+ * then try to get a generic event:
*/
if (test_and_set_bit(idx, cpuc->used_mask))
goto try_generic;
hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
/*
- * We set it so that counter_base + idx in wrmsr/rdmsr maps to
+ * We set it so that event_base + idx in wrmsr/rdmsr maps to
* MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
*/
- hwc->counter_base =
+ hwc->event_base =
MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
hwc->idx = idx;
} else {
idx = hwc->idx;
- /* Try to get the previous generic counter again */
- if (test_and_set_bit(idx, cpuc->used_mask)) {
+ /* Try to get the previous generic event again */
+ if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) {
try_generic:
- idx = find_first_zero_bit(cpuc->used_mask,
- x86_pmu.num_counters);
- if (idx == x86_pmu.num_counters)
+ idx = x86_pmu.get_event_idx(cpuc, hwc);
+ if (idx == -1)
return -EAGAIN;
set_bit(idx, cpuc->used_mask);
hwc->idx = idx;
}
- hwc->config_base = x86_pmu.eventsel;
- hwc->counter_base = x86_pmu.perfctr;
+ hwc->config_base = x86_pmu.eventsel;
+ hwc->event_base = x86_pmu.perfctr;
}
- perf_counters_lapic_init();
+ return idx;
+}
+
+/*
+ * Find a PMC slot for the freshly enabled / scheduled in event:
+ */
+static int x86_pmu_enable(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ int idx;
+
+ idx = x86_schedule_event(cpuc, hwc);
+ if (idx < 0)
+ return idx;
+
+ perf_events_lapic_init();
x86_pmu.disable(hwc, idx);
- cpuc->counters[idx] = counter;
+ cpuc->events[idx] = event;
set_bit(idx, cpuc->active_mask);
- x86_perf_counter_set_period(counter, hwc, idx);
+ x86_perf_event_set_period(event, hwc, idx);
x86_pmu.enable(hwc, idx);
- perf_counter_update_userpage(counter);
+ perf_event_update_userpage(event);
return 0;
}
-static void x86_pmu_unthrottle(struct perf_counter *counter)
+static void x86_pmu_unthrottle(struct perf_event *event)
{
- struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
- struct hw_perf_counter *hwc = &counter->hw;
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
- cpuc->counters[hwc->idx] != counter))
+ cpuc->events[hwc->idx] != event))
return;
x86_pmu.enable(hwc, hwc->idx);
}
-void perf_counter_print_debug(void)
+void perf_event_print_debug(void)
{
u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
- struct cpu_hw_counters *cpuc;
+ struct cpu_hw_events *cpuc;
unsigned long flags;
int cpu, idx;
- if (!x86_pmu.num_counters)
+ if (!x86_pmu.num_events)
return;
local_irq_save(flags);
cpu = smp_processor_id();
- cpuc = &per_cpu(cpu_hw_counters, cpu);
+ cpuc = &per_cpu(cpu_hw_events, cpu);
if (x86_pmu.version >= 2) {
rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
@@ -1466,11 +1578,11 @@ void perf_counter_print_debug(void)
}
pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask);
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ for (idx = 0; idx < x86_pmu.num_events; idx++) {
rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
rdmsrl(x86_pmu.perfctr + idx, pmc_count);
- prev_left = per_cpu(prev_left[idx], cpu);
+ prev_left = per_cpu(pmc_prev_left[idx], cpu);
pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
cpu, idx, pmc_ctrl);
@@ -1479,7 +1591,7 @@ void perf_counter_print_debug(void)
pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
cpu, idx, prev_left);
}
- for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+ for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
@@ -1488,8 +1600,7 @@ void perf_counter_print_debug(void)
local_irq_restore(flags);
}
-static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
- struct perf_sample_data *data)
+static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc)
{
struct debug_store *ds = cpuc->ds;
struct bts_record {
@@ -1497,11 +1608,14 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
u64 to;
u64 flags;
};
- struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS];
- unsigned long orig_ip = data->regs->ip;
+ struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
struct bts_record *at, *top;
+ struct perf_output_handle handle;
+ struct perf_event_header header;
+ struct perf_sample_data data;
+ struct pt_regs regs;
- if (!counter)
+ if (!event)
return;
if (!ds)
@@ -1510,26 +1624,45 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
top = (struct bts_record *)(unsigned long)ds->bts_index;
+ if (top <= at)
+ return;
+
ds->bts_index = ds->bts_buffer_base;
+
+ data.period = event->hw.last_period;
+ data.addr = 0;
+ regs.ip = 0;
+
+ /*
+ * Prepare a generic sample, i.e. fill in the invariant fields.
+ * We will overwrite the from and to address before we output
+ * the sample.
+ */
+ perf_prepare_sample(&header, &data, event, &regs);
+
+ if (perf_output_begin(&handle, event,
+ header.size * (top - at), 1, 1))
+ return;
+
for (; at < top; at++) {
- data->regs->ip = at->from;
- data->addr = at->to;
+ data.ip = at->from;
+ data.addr = at->to;
- perf_counter_output(counter, 1, data);
+ perf_output_sample(&handle, &header, &data, event);
}
- data->regs->ip = orig_ip;
- data->addr = 0;
+ perf_output_end(&handle);
/* There's new data available. */
- counter->pending_kill = POLL_IN;
+ event->hw.interrupts++;
+ event->pending_kill = POLL_IN;
}
-static void x86_pmu_disable(struct perf_counter *counter)
+static void x86_pmu_disable(struct perf_event *event)
{
- struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
- struct hw_perf_counter *hwc = &counter->hw;
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
int idx = hwc->idx;
/*
@@ -1541,67 +1674,63 @@ static void x86_pmu_disable(struct perf_counter *counter)
/*
* Make sure the cleared pointer becomes visible before we
- * (potentially) free the counter:
+ * (potentially) free the event:
*/
barrier();
/*
- * Drain the remaining delta count out of a counter
+ * Drain the remaining delta count out of a event
* that we are disabling:
*/
- x86_perf_counter_update(counter, hwc, idx);
+ x86_perf_event_update(event, hwc, idx);
/* Drain the remaining BTS records. */
- if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
- struct perf_sample_data data;
- struct pt_regs regs;
+ if (unlikely(idx == X86_PMC_IDX_FIXED_BTS))
+ intel_pmu_drain_bts_buffer(cpuc);
- data.regs = &regs;
- intel_pmu_drain_bts_buffer(cpuc, &data);
- }
- cpuc->counters[idx] = NULL;
+ cpuc->events[idx] = NULL;
clear_bit(idx, cpuc->used_mask);
- perf_counter_update_userpage(counter);
+ perf_event_update_userpage(event);
}
/*
- * Save and restart an expired counter. Called by NMI contexts,
- * so it has to be careful about preempting normal counter ops:
+ * Save and restart an expired event. Called by NMI contexts,
+ * so it has to be careful about preempting normal event ops:
*/
-static int intel_pmu_save_and_restart(struct perf_counter *counter)
+static int intel_pmu_save_and_restart(struct perf_event *event)
{
- struct hw_perf_counter *hwc = &counter->hw;
+ struct hw_perf_event *hwc = &event->hw;
int idx = hwc->idx;
int ret;
- x86_perf_counter_update(counter, hwc, idx);
- ret = x86_perf_counter_set_period(counter, hwc, idx);
+ x86_perf_event_update(event, hwc, idx);
+ ret = x86_perf_event_set_period(event, hwc, idx);
- if (counter->state == PERF_COUNTER_STATE_ACTIVE)
- intel_pmu_enable_counter(hwc, idx);
+ if (event->state == PERF_EVENT_STATE_ACTIVE)
+ intel_pmu_enable_event(hwc, idx);
return ret;
}
static void intel_pmu_reset(void)
{
- struct debug_store *ds = __get_cpu_var(cpu_hw_counters).ds;
+ struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;
unsigned long flags;
int idx;
- if (!x86_pmu.num_counters)
+ if (!x86_pmu.num_events)
return;
local_irq_save(flags);
printk("clearing PMU state on CPU#%d\n", smp_processor_id());
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ for (idx = 0; idx < x86_pmu.num_events; idx++) {
checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
}
- for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+ for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
}
if (ds)
@@ -1613,39 +1742,38 @@ static void intel_pmu_reset(void)
static int p6_pmu_handle_irq(struct pt_regs *regs)
{
struct perf_sample_data data;
- struct cpu_hw_counters *cpuc;
- struct perf_counter *counter;
- struct hw_perf_counter *hwc;
+ struct cpu_hw_events *cpuc;
+ struct perf_event *event;
+ struct hw_perf_event *hwc;
int idx, handled = 0;
u64 val;
- data.regs = regs;
data.addr = 0;
- cpuc = &__get_cpu_var(cpu_hw_counters);
+ cpuc = &__get_cpu_var(cpu_hw_events);
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ for (idx = 0; idx < x86_pmu.num_events; idx++) {
if (!test_bit(idx, cpuc->active_mask))
continue;
- counter = cpuc->counters[idx];
- hwc = &counter->hw;
+ event = cpuc->events[idx];
+ hwc = &event->hw;
- val = x86_perf_counter_update(counter, hwc, idx);
- if (val & (1ULL << (x86_pmu.counter_bits - 1)))
+ val = x86_perf_event_update(event, hwc, idx);
+ if (val & (1ULL << (x86_pmu.event_bits - 1)))
continue;
/*
- * counter overflow
+ * event overflow
*/
handled = 1;
- data.period = counter->hw.last_period;
+ data.period = event->hw.last_period;
- if (!x86_perf_counter_set_period(counter, hwc, idx))
+ if (!x86_perf_event_set_period(event, hwc, idx))
continue;
- if (perf_counter_overflow(counter, 1, &data))
- p6_pmu_disable_counter(hwc, idx);
+ if (perf_event_overflow(event, 1, &data, regs))
+ p6_pmu_disable_event(hwc, idx);
}
if (handled)
@@ -1661,17 +1789,16 @@ static int p6_pmu_handle_irq(struct pt_regs *regs)
static int intel_pmu_handle_irq(struct pt_regs *regs)
{
struct perf_sample_data data;
- struct cpu_hw_counters *cpuc;
+ struct cpu_hw_events *cpuc;
int bit, loops;
u64 ack, status;
- data.regs = regs;
data.addr = 0;
- cpuc = &__get_cpu_var(cpu_hw_counters);
+ cpuc = &__get_cpu_var(cpu_hw_events);
perf_disable();
- intel_pmu_drain_bts_buffer(cpuc, &data);
+ intel_pmu_drain_bts_buffer(cpuc);
status = intel_pmu_get_status();
if (!status) {
perf_enable();
@@ -1681,8 +1808,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
loops = 0;
again:
if (++loops > 100) {
- WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
- perf_counter_print_debug();
+ WARN_ONCE(1, "perfevents: irq loop stuck!\n");
+ perf_event_print_debug();
intel_pmu_reset();
perf_enable();
return 1;
@@ -1691,19 +1818,19 @@ again:
inc_irq_stat(apic_perf_irqs);
ack = status;
for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
- struct perf_counter *counter = cpuc->counters[bit];
+ struct perf_event *event = cpuc->events[bit];
clear_bit(bit, (unsigned long *) &status);
if (!test_bit(bit, cpuc->active_mask))
continue;
- if (!intel_pmu_save_and_restart(counter))
+ if (!intel_pmu_save_and_restart(event))
continue;
- data.period = counter->hw.last_period;
+ data.period = event->hw.last_period;
- if (perf_counter_overflow(counter, 1, &data))
- intel_pmu_disable_counter(&counter->hw, bit);
+ if (perf_event_overflow(event, 1, &data, regs))
+ intel_pmu_disable_event(&event->hw, bit);
}
intel_pmu_ack_status(ack);
@@ -1723,39 +1850,38 @@ again:
static int amd_pmu_handle_irq(struct pt_regs *regs)
{
struct perf_sample_data data;
- struct cpu_hw_counters *cpuc;
- struct perf_counter *counter;
- struct hw_perf_counter *hwc;
+ struct cpu_hw_events *cpuc;
+ struct perf_event *event;
+ struct hw_perf_event *hwc;
int idx, handled = 0;
u64 val;
- data.regs = regs;
data.addr = 0;
- cpuc = &__get_cpu_var(cpu_hw_counters);
+ cpuc = &__get_cpu_var(cpu_hw_events);
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ for (idx = 0; idx < x86_pmu.num_events; idx++) {
if (!test_bit(idx, cpuc->active_mask))
continue;
- counter = cpuc->counters[idx];
- hwc = &counter->hw;
+ event = cpuc->events[idx];
+ hwc = &event->hw;
- val = x86_perf_counter_update(counter, hwc, idx);
- if (val & (1ULL << (x86_pmu.counter_bits - 1)))
+ val = x86_perf_event_update(event, hwc, idx);
+ if (val & (1ULL << (x86_pmu.event_bits - 1)))
continue;
/*
- * counter overflow
+ * event overflow
*/
handled = 1;
- data.period = counter->hw.last_period;
+ data.period = event->hw.last_period;
- if (!x86_perf_counter_set_period(counter, hwc, idx))
+ if (!x86_perf_event_set_period(event, hwc, idx))
continue;
- if (perf_counter_overflow(counter, 1, &data))
- amd_pmu_disable_counter(hwc, idx);
+ if (perf_event_overflow(event, 1, &data, regs))
+ amd_pmu_disable_event(hwc, idx);
}
if (handled)
@@ -1769,18 +1895,21 @@ void smp_perf_pending_interrupt(struct pt_regs *regs)
irq_enter();
ack_APIC_irq();
inc_irq_stat(apic_pending_irqs);
- perf_counter_do_pending();
+ perf_event_do_pending();
irq_exit();
}
-void set_perf_counter_pending(void)
+void set_perf_event_pending(void)
{
#ifdef CONFIG_X86_LOCAL_APIC
+ if (!x86_pmu.apic || !x86_pmu_initialized())
+ return;
+
apic->send_IPI_self(LOCAL_PENDING_VECTOR);
#endif
}
-void perf_counters_lapic_init(void)
+void perf_events_lapic_init(void)
{
#ifdef CONFIG_X86_LOCAL_APIC
if (!x86_pmu.apic || !x86_pmu_initialized())
@@ -1794,13 +1923,13 @@ void perf_counters_lapic_init(void)
}
static int __kprobes
-perf_counter_nmi_handler(struct notifier_block *self,
+perf_event_nmi_handler(struct notifier_block *self,
unsigned long cmd, void *__args)
{
struct die_args *args = __args;
struct pt_regs *regs;
- if (!atomic_read(&active_counters))
+ if (!atomic_read(&active_events))
return NOTIFY_DONE;
switch (cmd) {
@@ -1819,7 +1948,7 @@ perf_counter_nmi_handler(struct notifier_block *self,
#endif
/*
* Can't rely on the handled return value to say it was our NMI, two
- * counters could trigger 'simultaneously' raising two back-to-back NMIs.
+ * events could trigger 'simultaneously' raising two back-to-back NMIs.
*
* If the first NMI handles both, the latter will be empty and daze
* the CPU.
@@ -1829,19 +1958,19 @@ perf_counter_nmi_handler(struct notifier_block *self,
return NOTIFY_STOP;
}
-static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
- .notifier_call = perf_counter_nmi_handler,
+static __read_mostly struct notifier_block perf_event_nmi_notifier = {
+ .notifier_call = perf_event_nmi_handler,
.next = NULL,
.priority = 1
};
-static struct x86_pmu p6_pmu = {
+static __initconst struct x86_pmu p6_pmu = {
.name = "p6",
.handle_irq = p6_pmu_handle_irq,
.disable_all = p6_pmu_disable_all,
.enable_all = p6_pmu_enable_all,
- .enable = p6_pmu_enable_counter,
- .disable = p6_pmu_disable_counter,
+ .enable = p6_pmu_enable_event,
+ .disable = p6_pmu_disable_event,
.eventsel = MSR_P6_EVNTSEL0,
.perfctr = MSR_P6_PERFCTR0,
.event_map = p6_pmu_event_map,
@@ -1850,25 +1979,26 @@ static struct x86_pmu p6_pmu = {
.apic = 1,
.max_period = (1ULL << 31) - 1,
.version = 0,
- .num_counters = 2,
+ .num_events = 2,
/*
- * Counters have 40 bits implemented. However they are designed such
+ * Events have 40 bits implemented. However they are designed such
* that bits [32-39] are sign extensions of bit 31. As such the
- * effective width of a counter for P6-like PMU is 32 bits only.
+ * effective width of a event for P6-like PMU is 32 bits only.
*
* See IA-32 Intel Architecture Software developer manual Vol 3B
*/
- .counter_bits = 32,
- .counter_mask = (1ULL << 32) - 1,
+ .event_bits = 32,
+ .event_mask = (1ULL << 32) - 1,
+ .get_event_idx = intel_get_event_idx,
};
-static struct x86_pmu intel_pmu = {
+static __initconst struct x86_pmu intel_pmu = {
.name = "Intel",
.handle_irq = intel_pmu_handle_irq,
.disable_all = intel_pmu_disable_all,
.enable_all = intel_pmu_enable_all,
- .enable = intel_pmu_enable_counter,
- .disable = intel_pmu_disable_counter,
+ .enable = intel_pmu_enable_event,
+ .disable = intel_pmu_disable_event,
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
.perfctr = MSR_ARCH_PERFMON_PERFCTR0,
.event_map = intel_pmu_event_map,
@@ -1878,34 +2008,36 @@ static struct x86_pmu intel_pmu = {
/*
* Intel PMCs cannot be accessed sanely above 32 bit width,
* so we install an artificial 1<<31 period regardless of
- * the generic counter period:
+ * the generic event period:
*/
.max_period = (1ULL << 31) - 1,
.enable_bts = intel_pmu_enable_bts,
.disable_bts = intel_pmu_disable_bts,
+ .get_event_idx = intel_get_event_idx,
};
-static struct x86_pmu amd_pmu = {
+static __initconst struct x86_pmu amd_pmu = {
.name = "AMD",
.handle_irq = amd_pmu_handle_irq,
.disable_all = amd_pmu_disable_all,
.enable_all = amd_pmu_enable_all,
- .enable = amd_pmu_enable_counter,
- .disable = amd_pmu_disable_counter,
+ .enable = amd_pmu_enable_event,
+ .disable = amd_pmu_disable_event,
.eventsel = MSR_K7_EVNTSEL0,
.perfctr = MSR_K7_PERFCTR0,
.event_map = amd_pmu_event_map,
.raw_event = amd_pmu_raw_event,
.max_events = ARRAY_SIZE(amd_perfmon_event_map),
- .num_counters = 4,
- .counter_bits = 48,
- .counter_mask = (1ULL << 48) - 1,
+ .num_events = 4,
+ .event_bits = 48,
+ .event_mask = (1ULL << 48) - 1,
.apic = 1,
/* use highest bit to detect overflow */
.max_period = (1ULL << 47) - 1,
+ .get_event_idx = gen_get_event_idx,
};
-static int p6_pmu_init(void)
+static __init int p6_pmu_init(void)
{
switch (boot_cpu_data.x86_model) {
case 1:
@@ -1915,10 +2047,12 @@ static int p6_pmu_init(void)
case 7:
case 8:
case 11: /* Pentium III */
+ event_constraints = intel_p6_event_constraints;
break;
case 9:
case 13:
/* Pentium M */
+ event_constraints = intel_p6_event_constraints;
break;
default:
pr_cont("unsupported p6 CPU model %d ",
@@ -1937,7 +2071,7 @@ static int p6_pmu_init(void)
return 0;
}
-static int intel_pmu_init(void)
+static __init int intel_pmu_init(void)
{
union cpuid10_edx edx;
union cpuid10_eax eax;
@@ -1956,7 +2090,7 @@ static int intel_pmu_init(void)
/*
* Check whether the Architectural PerfMon supports
- * Branch Misses Retired Event or not.
+ * Branch Misses Retired hw_event or not.
*/
cpuid(10, &eax.full, &ebx, &unused, &edx.full);
if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
@@ -1968,15 +2102,15 @@ static int intel_pmu_init(void)
x86_pmu = intel_pmu;
x86_pmu.version = version;
- x86_pmu.num_counters = eax.split.num_counters;
- x86_pmu.counter_bits = eax.split.bit_width;
- x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
+ x86_pmu.num_events = eax.split.num_events;
+ x86_pmu.event_bits = eax.split.bit_width;
+ x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1;
/*
- * Quirk: v2 perfmon does not report fixed-purpose counters, so
- * assume at least 3 counters:
+ * Quirk: v2 perfmon does not report fixed-purpose events, so
+ * assume at least 3 events:
*/
- x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
+ x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
/*
* Install the hw-cache-events table:
@@ -1990,12 +2124,14 @@ static int intel_pmu_init(void)
sizeof(hw_cache_event_ids));
pr_cont("Core2 events, ");
+ event_constraints = intel_core_event_constraints;
break;
default:
case 26:
memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ event_constraints = intel_nehalem_event_constraints;
pr_cont("Nehalem/Corei7 events, ");
break;
case 28:
@@ -2008,7 +2144,7 @@ static int intel_pmu_init(void)
return 0;
}
-static int amd_pmu_init(void)
+static __init int amd_pmu_init(void)
{
/* Performance-monitoring supported from K7 and later: */
if (boot_cpu_data.x86 < 6)
@@ -2023,11 +2159,11 @@ static int amd_pmu_init(void)
return 0;
}
-void __init init_hw_perf_counters(void)
+void __init init_hw_perf_events(void)
{
int err;
- pr_info("Performance Counters: ");
+ pr_info("Performance Events: ");
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_INTEL:
@@ -2040,45 +2176,45 @@ void __init init_hw_perf_counters(void)
return;
}
if (err != 0) {
- pr_cont("no PMU driver, software counters only.\n");
+ pr_cont("no PMU driver, software events only.\n");
return;
}
pr_cont("%s PMU driver.\n", x86_pmu.name);
- if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
- WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
- x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
- x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
+ if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
+ WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
+ x86_pmu.num_events, X86_PMC_MAX_GENERIC);
+ x86_pmu.num_events = X86_PMC_MAX_GENERIC;
}
- perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
- perf_max_counters = x86_pmu.num_counters;
+ perf_event_mask = (1 << x86_pmu.num_events) - 1;
+ perf_max_events = x86_pmu.num_events;
- if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
- WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
- x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
- x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
+ if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) {
+ WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
+ x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED);
+ x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED;
}
- perf_counter_mask |=
- ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
- x86_pmu.intel_ctrl = perf_counter_mask;
+ perf_event_mask |=
+ ((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED;
+ x86_pmu.intel_ctrl = perf_event_mask;
- perf_counters_lapic_init();
- register_die_notifier(&perf_counter_nmi_notifier);
+ perf_events_lapic_init();
+ register_die_notifier(&perf_event_nmi_notifier);
- pr_info("... version: %d\n", x86_pmu.version);
- pr_info("... bit width: %d\n", x86_pmu.counter_bits);
- pr_info("... generic counters: %d\n", x86_pmu.num_counters);
- pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask);
- pr_info("... max period: %016Lx\n", x86_pmu.max_period);
- pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed);
- pr_info("... counter mask: %016Lx\n", perf_counter_mask);
+ pr_info("... version: %d\n", x86_pmu.version);
+ pr_info("... bit width: %d\n", x86_pmu.event_bits);
+ pr_info("... generic registers: %d\n", x86_pmu.num_events);
+ pr_info("... value mask: %016Lx\n", x86_pmu.event_mask);
+ pr_info("... max period: %016Lx\n", x86_pmu.max_period);
+ pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed);
+ pr_info("... event mask: %016Lx\n", perf_event_mask);
}
-static inline void x86_pmu_read(struct perf_counter *counter)
+static inline void x86_pmu_read(struct perf_event *event)
{
- x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
+ x86_perf_event_update(event, &event->hw, event->hw.idx);
}
static const struct pmu pmu = {
@@ -2088,13 +2224,52 @@ static const struct pmu pmu = {
.unthrottle = x86_pmu_unthrottle,
};
-const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
+static int
+validate_event(struct cpu_hw_events *cpuc, struct perf_event *event)
+{
+ struct hw_perf_event fake_event = event->hw;
+
+ if (event->pmu && event->pmu != &pmu)
+ return 0;
+
+ return x86_schedule_event(cpuc, &fake_event) >= 0;
+}
+
+static int validate_group(struct perf_event *event)
+{
+ struct perf_event *sibling, *leader = event->group_leader;
+ struct cpu_hw_events fake_pmu;
+
+ memset(&fake_pmu, 0, sizeof(fake_pmu));
+
+ if (!validate_event(&fake_pmu, leader))
+ return -ENOSPC;
+
+ list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
+ if (!validate_event(&fake_pmu, sibling))
+ return -ENOSPC;
+ }
+
+ if (!validate_event(&fake_pmu, event))
+ return -ENOSPC;
+
+ return 0;
+}
+
+const struct pmu *hw_perf_event_init(struct perf_event *event)
{
int err;
- err = __hw_perf_counter_init(counter);
- if (err)
+ err = __hw_perf_event_init(event);
+ if (!err) {
+ if (event->group_leader != event)
+ err = validate_group(event);
+ }
+ if (err) {
+ if (event->destroy)
+ event->destroy(event);
return ERR_PTR(err);
+ }
return &pmu;
}
@@ -2110,8 +2285,8 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
entry->ip[entry->nr++] = ip;
}
-static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
-static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
+static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
+static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
static DEFINE_PER_CPU(int, in_nmi_frame);
@@ -2264,9 +2439,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
struct perf_callchain_entry *entry;
if (in_nmi())
- entry = &__get_cpu_var(nmi_entry);
+ entry = &__get_cpu_var(pmc_nmi_entry);
else
- entry = &__get_cpu_var(irq_entry);
+ entry = &__get_cpu_var(pmc_irq_entry);
entry->nr = 0;
@@ -2275,7 +2450,7 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
return entry;
}
-void hw_perf_counter_setup_online(int cpu)
+void hw_perf_event_setup_online(int cpu)
{
init_debug_store_on_cpu(cpu);
}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 392bea43b89..898df9719af 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -20,7 +20,7 @@
#include <linux/kprobes.h>
#include <asm/apic.h>
-#include <asm/perf_counter.h>
+#include <asm/perf_event.h>
struct nmi_watchdog_ctlblk {
unsigned int cccr_msr;
@@ -712,7 +712,7 @@ static void probe_nmi_watchdog(void)
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
- boot_cpu_data.x86 != 16)
+ boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17)
return;
wd_ops = &k7_wd_ops;
break;
diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c
new file mode 100644
index 00000000000..a640ae5ad20
--- /dev/null
+++ b/arch/x86/kernel/cpu/sched.c
@@ -0,0 +1,55 @@
+#include <linux/sched.h>
+#include <linux/math64.h>
+#include <linux/percpu.h>
+#include <linux/irqflags.h>
+
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+
+#ifdef CONFIG_SMP
+
+static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched);
+
+static unsigned long scale_aperfmperf(void)
+{
+ struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched);
+ unsigned long ratio, flags;
+
+ local_irq_save(flags);
+ get_aperfmperf(&val);
+ local_irq_restore(flags);
+
+ ratio = calc_aperfmperf_ratio(old, &val);
+ *old = val;
+
+ return ratio;
+}
+
+unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+ /*
+ * do aperf/mperf on the cpu level because it includes things
+ * like turbo mode, which are relevant to full cores.
+ */
+ if (boot_cpu_has(X86_FEATURE_APERFMPERF))
+ return scale_aperfmperf();
+
+ /*
+ * maybe have something cpufreq here
+ */
+
+ return default_scale_freq_power(sd, cpu);
+}
+
+unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu)
+{
+ /*
+ * aperf/mperf already includes the smt gain
+ */
+ if (boot_cpu_has(X86_FEATURE_APERFMPERF))
+ return SCHED_LOAD_SCALE;
+
+ return default_scale_smt_power(sd, cpu);
+}
+
+#endif
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index bc24f514ec9..1cbed97b59c 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -24,6 +24,7 @@
#include <linux/dmi.h>
#include <asm/div64.h>
#include <asm/vmware.h>
+#include <asm/x86_init.h>
#define CPUID_VMWARE_INFO_LEAF 0x40000000
#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
@@ -47,21 +48,35 @@ static inline int __vmware_platform(void)
return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC;
}
-static unsigned long __vmware_get_tsc_khz(void)
+static unsigned long vmware_get_tsc_khz(void)
{
uint64_t tsc_hz;
uint32_t eax, ebx, ecx, edx;
VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
- if (ebx == UINT_MAX)
- return 0;
tsc_hz = eax | (((uint64_t)ebx) << 32);
do_div(tsc_hz, 1000);
BUG_ON(tsc_hz >> 32);
+ printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n",
+ (unsigned long) tsc_hz / 1000,
+ (unsigned long) tsc_hz % 1000);
return tsc_hz;
}
+void __init vmware_platform_setup(void)
+{
+ uint32_t eax, ebx, ecx, edx;
+
+ VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
+
+ if (ebx != UINT_MAX)
+ x86_platform.calibrate_tsc = vmware_get_tsc_khz;
+ else
+ printk(KERN_WARNING
+ "Failed to get TSC freq from the hypervisor\n");
+}
+
/*
* While checking the dmi string infomation, just checking the product
* serial key should be enough, as this will always have a VMware
@@ -87,12 +102,6 @@ int vmware_platform(void)
return 0;
}
-unsigned long vmware_get_tsc_khz(void)
-{
- BUG_ON(!vmware_platform());
- return __vmware_get_tsc_khz();
-}
-
/*
* VMware hypervisor takes care of exporting a reliable TSC to the guest.
* Still, due to timing difference when running on virtual cpus, the TSC can
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 48e8e6558b2..7ef24a79699 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -177,7 +177,7 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier =
.notifier_call = cpuid_class_cpu_callback,
};
-static char *cpuid_nodename(struct device *dev)
+static char *cpuid_devnode(struct device *dev, mode_t *mode)
{
return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
}
@@ -198,7 +198,7 @@ static int __init cpuid_init(void)
err = PTR_ERR(cpuid_class);
goto out_chrdev;
}
- cpuid_class->nodename = cpuid_nodename;
+ cpuid_class->devnode = cpuid_devnode;
for_each_online_cpu(i) {
err = cpuid_device_create(i);
if (err != 0)
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 5e409dc298a..a4849c10a77 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -27,8 +27,7 @@
#include <asm/cpu.h>
#include <asm/reboot.h>
#include <asm/virtext.h>
-#include <asm/iommu.h>
-
+#include <asm/x86_init.h>
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
@@ -106,7 +105,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
#endif
#ifdef CONFIG_X86_64
- pci_iommu_shutdown();
+ x86_platform.iommu_shutdown();
#endif
crash_save_cpu(regs, safe_smp_processor_id());
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index f7cdb3b457a..cd97ce18c29 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -16,6 +16,22 @@ static void *kdump_buf_page;
/* Stores the physical address of elf header of crash image. */
unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+static inline bool is_crashed_pfn_valid(unsigned long pfn)
+{
+#ifndef CONFIG_X86_PAE
+ /*
+ * non-PAE kdump kernel executed from a PAE one will crop high pte
+ * bits and poke unwanted space counting again from address 0, we
+ * don't want that. pte must fit into unsigned long. In fact the
+ * test checks high 12 bits for being zero (pfn will be shifted left
+ * by PAGE_SHIFT).
+ */
+ return pte_pfn(pfn_pte(pfn, __pgprot(0))) == pfn;
+#else
+ return true;
+#endif
+}
+
/**
* copy_oldmem_page - copy one page from "oldmem"
* @pfn: page frame number to be copied
@@ -41,6 +57,9 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
if (!csize)
return 0;
+ if (!is_crashed_pfn_valid(pfn))
+ return -EFAULT;
+
vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
if (!userbuf) {
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 2d8a371d433..b8ce165dde5 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -268,11 +268,12 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
show_registers(regs);
#ifdef CONFIG_X86_32
- sp = (unsigned long) (&regs->sp);
- savesegment(ss, ss);
- if (user_mode(regs)) {
+ if (user_mode_vm(regs)) {
sp = regs->sp;
ss = regs->ss & 0xffff;
+ } else {
+ sp = kernel_stack_pointer(regs);
+ savesegment(ss, ss);
}
printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
print_symbol("%s", regs->ip);
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index bca5fba91c9..f7dd2a7c3bf 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -5,7 +5,6 @@
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
-#include <linux/utsname.h>
#include <linux/hardirq.h>
#include <linux/kdebug.h>
#include <linux/module.h>
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 54b0a327676..a071e6be177 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -5,7 +5,6 @@
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
-#include <linux/utsname.h>
#include <linux/hardirq.h>
#include <linux/kdebug.h>
#include <linux/module.h>
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 147005a1cc3..d17d482a04f 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1331,7 +1331,7 @@ void __init e820_reserve_resources(void)
struct resource *res;
u64 end;
- res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
+ res = alloc_bootmem(sizeof(struct resource) * e820.nr_map);
e820_res = res;
for (i = 0; i < e820.nr_map; i++) {
end = e820.map[i].addr + e820.map[i].size - 1;
@@ -1378,8 +1378,8 @@ static unsigned long ram_alignment(resource_size_t pos)
if (mb < 16)
return 1024*1024;
- /* To 32MB for anything above that */
- return 32*1024*1024;
+ /* To 64MB for anything above that */
+ return 64*1024*1024;
}
#define MAX_RESOURCE_SIZE ((resource_size_t)-1)
@@ -1455,28 +1455,11 @@ char *__init default_machine_specific_memory_setup(void)
return who;
}
-char *__init __attribute__((weak)) machine_specific_memory_setup(void)
-{
- if (x86_quirks->arch_memory_setup) {
- char *who = x86_quirks->arch_memory_setup();
-
- if (who)
- return who;
- }
- return default_machine_specific_memory_setup();
-}
-
-/* Overridden in paravirt.c if CONFIG_PARAVIRT */
-char * __init __attribute__((weak)) memory_setup(void)
-{
- return machine_specific_memory_setup();
-}
-
void __init setup_memory_map(void)
{
char *who;
- who = memory_setup();
+ who = x86_init.resources.memory_setup();
memcpy(&e820_saved, &e820, sizeof(struct e820map));
printk(KERN_INFO "BIOS-provided physical RAM map:\n");
e820_print_map(who);
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 335f049d110..b9c830c12b4 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -160,721 +160,6 @@ static struct console early_serial_console = {
.index = -1,
};
-#ifdef CONFIG_EARLY_PRINTK_DBGP
-
-static struct ehci_caps __iomem *ehci_caps;
-static struct ehci_regs __iomem *ehci_regs;
-static struct ehci_dbg_port __iomem *ehci_debug;
-static unsigned int dbgp_endpoint_out;
-
-struct ehci_dev {
- u32 bus;
- u32 slot;
- u32 func;
-};
-
-static struct ehci_dev ehci_dev;
-
-#define USB_DEBUG_DEVNUM 127
-
-#define DBGP_DATA_TOGGLE 0x8800
-
-static inline u32 dbgp_pid_update(u32 x, u32 tok)
-{
- return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff);
-}
-
-static inline u32 dbgp_len_update(u32 x, u32 len)
-{
- return (x & ~0x0f) | (len & 0x0f);
-}
-
-/*
- * USB Packet IDs (PIDs)
- */
-
-/* token */
-#define USB_PID_OUT 0xe1
-#define USB_PID_IN 0x69
-#define USB_PID_SOF 0xa5
-#define USB_PID_SETUP 0x2d
-/* handshake */
-#define USB_PID_ACK 0xd2
-#define USB_PID_NAK 0x5a
-#define USB_PID_STALL 0x1e
-#define USB_PID_NYET 0x96
-/* data */
-#define USB_PID_DATA0 0xc3
-#define USB_PID_DATA1 0x4b
-#define USB_PID_DATA2 0x87
-#define USB_PID_MDATA 0x0f
-/* Special */
-#define USB_PID_PREAMBLE 0x3c
-#define USB_PID_ERR 0x3c
-#define USB_PID_SPLIT 0x78
-#define USB_PID_PING 0xb4
-#define USB_PID_UNDEF_0 0xf0
-
-#define USB_PID_DATA_TOGGLE 0x88
-#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE)
-
-#define PCI_CAP_ID_EHCI_DEBUG 0xa
-
-#define HUB_ROOT_RESET_TIME 50 /* times are in msec */
-#define HUB_SHORT_RESET_TIME 10
-#define HUB_LONG_RESET_TIME 200
-#define HUB_RESET_TIMEOUT 500
-
-#define DBGP_MAX_PACKET 8
-
-static int dbgp_wait_until_complete(void)
-{
- u32 ctrl;
- int loop = 0x100000;
-
- do {
- ctrl = readl(&ehci_debug->control);
- /* Stop when the transaction is finished */
- if (ctrl & DBGP_DONE)
- break;
- } while (--loop > 0);
-
- if (!loop)
- return -1;
-
- /*
- * Now that we have observed the completed transaction,
- * clear the done bit.
- */
- writel(ctrl | DBGP_DONE, &ehci_debug->control);
- return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
-}
-
-static void __init dbgp_mdelay(int ms)
-{
- int i;
-
- while (ms--) {
- for (i = 0; i < 1000; i++)
- outb(0x1, 0x80);
- }
-}
-
-static void dbgp_breath(void)
-{
- /* Sleep to give the debug port a chance to breathe */
-}
-
-static int dbgp_wait_until_done(unsigned ctrl)
-{
- u32 pids, lpid;
- int ret;
- int loop = 3;
-
-retry:
- writel(ctrl | DBGP_GO, &ehci_debug->control);
- ret = dbgp_wait_until_complete();
- pids = readl(&ehci_debug->pids);
- lpid = DBGP_PID_GET(pids);
-
- if (ret < 0)
- return ret;
-
- /*
- * If the port is getting full or it has dropped data
- * start pacing ourselves, not necessary but it's friendly.
- */
- if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET))
- dbgp_breath();
-
- /* If I get a NACK reissue the transmission */
- if (lpid == USB_PID_NAK) {
- if (--loop > 0)
- goto retry;
- }
-
- return ret;
-}
-
-static void dbgp_set_data(const void *buf, int size)
-{
- const unsigned char *bytes = buf;
- u32 lo, hi;
- int i;
-
- lo = hi = 0;
- for (i = 0; i < 4 && i < size; i++)
- lo |= bytes[i] << (8*i);
- for (; i < 8 && i < size; i++)
- hi |= bytes[i] << (8*(i - 4));
- writel(lo, &ehci_debug->data03);
- writel(hi, &ehci_debug->data47);
-}
-
-static void __init dbgp_get_data(void *buf, int size)
-{
- unsigned char *bytes = buf;
- u32 lo, hi;
- int i;
-
- lo = readl(&ehci_debug->data03);
- hi = readl(&ehci_debug->data47);
- for (i = 0; i < 4 && i < size; i++)
- bytes[i] = (lo >> (8*i)) & 0xff;
- for (; i < 8 && i < size; i++)
- bytes[i] = (hi >> (8*(i - 4))) & 0xff;
-}
-
-static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
- const char *bytes, int size)
-{
- u32 pids, addr, ctrl;
- int ret;
-
- if (size > DBGP_MAX_PACKET)
- return -1;
-
- addr = DBGP_EPADDR(devnum, endpoint);
-
- pids = readl(&ehci_debug->pids);
- pids = dbgp_pid_update(pids, USB_PID_OUT);
-
- ctrl = readl(&ehci_debug->control);
- ctrl = dbgp_len_update(ctrl, size);
- ctrl |= DBGP_OUT;
- ctrl |= DBGP_GO;
-
- dbgp_set_data(bytes, size);
- writel(addr, &ehci_debug->address);
- writel(pids, &ehci_debug->pids);
-
- ret = dbgp_wait_until_done(ctrl);
- if (ret < 0)
- return ret;
-
- return ret;
-}
-
-static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
- int size)
-{
- u32 pids, addr, ctrl;
- int ret;
-
- if (size > DBGP_MAX_PACKET)
- return -1;
-
- addr = DBGP_EPADDR(devnum, endpoint);
-
- pids = readl(&ehci_debug->pids);
- pids = dbgp_pid_update(pids, USB_PID_IN);
-
- ctrl = readl(&ehci_debug->control);
- ctrl = dbgp_len_update(ctrl, size);
- ctrl &= ~DBGP_OUT;
- ctrl |= DBGP_GO;
-
- writel(addr, &ehci_debug->address);
- writel(pids, &ehci_debug->pids);
- ret = dbgp_wait_until_done(ctrl);
- if (ret < 0)
- return ret;
-
- if (size > ret)
- size = ret;
- dbgp_get_data(data, size);
- return ret;
-}
-
-static int __init dbgp_control_msg(unsigned devnum, int requesttype,
- int request, int value, int index, void *data, int size)
-{
- u32 pids, addr, ctrl;
- struct usb_ctrlrequest req;
- int read;
- int ret;
-
- read = (requesttype & USB_DIR_IN) != 0;
- if (size > (read ? DBGP_MAX_PACKET:0))
- return -1;
-
- /* Compute the control message */
- req.bRequestType = requesttype;
- req.bRequest = request;
- req.wValue = cpu_to_le16(value);
- req.wIndex = cpu_to_le16(index);
- req.wLength = cpu_to_le16(size);
-
- pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP);
- addr = DBGP_EPADDR(devnum, 0);
-
- ctrl = readl(&ehci_debug->control);
- ctrl = dbgp_len_update(ctrl, sizeof(req));
- ctrl |= DBGP_OUT;
- ctrl |= DBGP_GO;
-
- /* Send the setup message */
- dbgp_set_data(&req, sizeof(req));
- writel(addr, &ehci_debug->address);
- writel(pids, &ehci_debug->pids);
- ret = dbgp_wait_until_done(ctrl);
- if (ret < 0)
- return ret;
-
- /* Read the result */
- return dbgp_bulk_read(devnum, 0, data, size);
-}
-
-
-/* Find a PCI capability */
-static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap)
-{
- u8 pos;
- int bytes;
-
- if (!(read_pci_config_16(num, slot, func, PCI_STATUS) &
- PCI_STATUS_CAP_LIST))
- return 0;
-
- pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST);
- for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
- u8 id;
-
- pos &= ~3;
- id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID);
- if (id == 0xff)
- break;
- if (id == cap)
- return pos;
-
- pos = read_pci_config_byte(num, slot, func,
- pos+PCI_CAP_LIST_NEXT);
- }
- return 0;
-}
-
-static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func)
-{
- u32 class;
-
- class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION);
- if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI)
- return 0;
-
- return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG);
-}
-
-static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
-{
- u32 bus, slot, func;
-
- for (bus = 0; bus < 256; bus++) {
- for (slot = 0; slot < 32; slot++) {
- for (func = 0; func < 8; func++) {
- unsigned cap;
-
- cap = __find_dbgp(bus, slot, func);
-
- if (!cap)
- continue;
- if (ehci_num-- != 0)
- continue;
- *rbus = bus;
- *rslot = slot;
- *rfunc = func;
- return cap;
- }
- }
- }
- return 0;
-}
-
-static int __init ehci_reset_port(int port)
-{
- u32 portsc;
- u32 delay_time, delay;
- int loop;
-
- /* Reset the usb debug port */
- portsc = readl(&ehci_regs->port_status[port - 1]);
- portsc &= ~PORT_PE;
- portsc |= PORT_RESET;
- writel(portsc, &ehci_regs->port_status[port - 1]);
-
- delay = HUB_ROOT_RESET_TIME;
- for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT;
- delay_time += delay) {
- dbgp_mdelay(delay);
-
- portsc = readl(&ehci_regs->port_status[port - 1]);
- if (portsc & PORT_RESET) {
- /* force reset to complete */
- loop = 2;
- writel(portsc & ~(PORT_RWC_BITS | PORT_RESET),
- &ehci_regs->port_status[port - 1]);
- do {
- portsc = readl(&ehci_regs->port_status[port-1]);
- } while ((portsc & PORT_RESET) && (--loop > 0));
- }
-
- /* Device went away? */
- if (!(portsc & PORT_CONNECT))
- return -ENOTCONN;
-
- /* bomb out completely if something weird happend */
- if ((portsc & PORT_CSC))
- return -EINVAL;
-
- /* If we've finished resetting, then break out of the loop */
- if (!(portsc & PORT_RESET) && (portsc & PORT_PE))
- return 0;
- }
- return -EBUSY;
-}
-
-static int __init ehci_wait_for_port(int port)
-{
- u32 status;
- int ret, reps;
-
- for (reps = 0; reps < 3; reps++) {
- dbgp_mdelay(100);
- status = readl(&ehci_regs->status);
- if (status & STS_PCD) {
- ret = ehci_reset_port(port);
- if (ret == 0)
- return 0;
- }
- }
- return -ENOTCONN;
-}
-
-#ifdef DBGP_DEBUG
-# define dbgp_printk early_printk
-#else
-static inline void dbgp_printk(const char *fmt, ...) { }
-#endif
-
-typedef void (*set_debug_port_t)(int port);
-
-static void __init default_set_debug_port(int port)
-{
-}
-
-static set_debug_port_t __initdata set_debug_port = default_set_debug_port;
-
-static void __init nvidia_set_debug_port(int port)
-{
- u32 dword;
- dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
- 0x74);
- dword &= ~(0x0f<<12);
- dword |= ((port & 0x0f)<<12);
- write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74,
- dword);
- dbgp_printk("set debug port to %d\n", port);
-}
-
-static void __init detect_set_debug_port(void)
-{
- u32 vendorid;
-
- vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
- 0x00);
-
- if ((vendorid & 0xffff) == 0x10de) {
- dbgp_printk("using nvidia set_debug_port\n");
- set_debug_port = nvidia_set_debug_port;
- }
-}
-
-static int __init ehci_setup(void)
-{
- struct usb_debug_descriptor dbgp_desc;
- u32 cmd, ctrl, status, portsc, hcs_params;
- u32 debug_port, new_debug_port = 0, n_ports;
- u32 devnum;
- int ret, i;
- int loop;
- int port_map_tried;
- int playtimes = 3;
-
-try_next_time:
- port_map_tried = 0;
-
-try_next_port:
-
- hcs_params = readl(&ehci_caps->hcs_params);
- debug_port = HCS_DEBUG_PORT(hcs_params);
- n_ports = HCS_N_PORTS(hcs_params);
-
- dbgp_printk("debug_port: %d\n", debug_port);
- dbgp_printk("n_ports: %d\n", n_ports);
-
- for (i = 1; i <= n_ports; i++) {
- portsc = readl(&ehci_regs->port_status[i-1]);
- dbgp_printk("portstatus%d: %08x\n", i, portsc);
- }
-
- if (port_map_tried && (new_debug_port != debug_port)) {
- if (--playtimes) {
- set_debug_port(new_debug_port);
- goto try_next_time;
- }
- return -1;
- }
-
- loop = 10;
- /* Reset the EHCI controller */
- cmd = readl(&ehci_regs->command);
- cmd |= CMD_RESET;
- writel(cmd, &ehci_regs->command);
- do {
- cmd = readl(&ehci_regs->command);
- } while ((cmd & CMD_RESET) && (--loop > 0));
-
- if (!loop) {
- dbgp_printk("can not reset ehci\n");
- return -1;
- }
- dbgp_printk("ehci reset done\n");
-
- /* Claim ownership, but do not enable yet */
- ctrl = readl(&ehci_debug->control);
- ctrl |= DBGP_OWNER;
- ctrl &= ~(DBGP_ENABLED | DBGP_INUSE);
- writel(ctrl, &ehci_debug->control);
-
- /* Start the ehci running */
- cmd = readl(&ehci_regs->command);
- cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET);
- cmd |= CMD_RUN;
- writel(cmd, &ehci_regs->command);
-
- /* Ensure everything is routed to the EHCI */
- writel(FLAG_CF, &ehci_regs->configured_flag);
-
- /* Wait until the controller is no longer halted */
- loop = 10;
- do {
- status = readl(&ehci_regs->status);
- } while ((status & STS_HALT) && (--loop > 0));
-
- if (!loop) {
- dbgp_printk("ehci can be started\n");
- return -1;
- }
- dbgp_printk("ehci started\n");
-
- /* Wait for a device to show up in the debug port */
- ret = ehci_wait_for_port(debug_port);
- if (ret < 0) {
- dbgp_printk("No device found in debug port\n");
- goto next_debug_port;
- }
- dbgp_printk("ehci wait for port done\n");
-
- /* Enable the debug port */
- ctrl = readl(&ehci_debug->control);
- ctrl |= DBGP_CLAIM;
- writel(ctrl, &ehci_debug->control);
- ctrl = readl(&ehci_debug->control);
- if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) {
- dbgp_printk("No device in debug port\n");
- writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control);
- goto err;
- }
- dbgp_printk("debug ported enabled\n");
-
- /* Completely transfer the debug device to the debug controller */
- portsc = readl(&ehci_regs->port_status[debug_port - 1]);
- portsc &= ~PORT_PE;
- writel(portsc, &ehci_regs->port_status[debug_port - 1]);
-
- dbgp_mdelay(100);
-
- /* Find the debug device and make it device number 127 */
- for (devnum = 0; devnum <= 127; devnum++) {
- ret = dbgp_control_msg(devnum,
- USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
- USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0,
- &dbgp_desc, sizeof(dbgp_desc));
- if (ret > 0)
- break;
- }
- if (devnum > 127) {
- dbgp_printk("Could not find attached debug device\n");
- goto err;
- }
- if (ret < 0) {
- dbgp_printk("Attached device is not a debug device\n");
- goto err;
- }
- dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint;
-
- /* Move the device to 127 if it isn't already there */
- if (devnum != USB_DEBUG_DEVNUM) {
- ret = dbgp_control_msg(devnum,
- USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
- USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0);
- if (ret < 0) {
- dbgp_printk("Could not move attached device to %d\n",
- USB_DEBUG_DEVNUM);
- goto err;
- }
- devnum = USB_DEBUG_DEVNUM;
- dbgp_printk("debug device renamed to 127\n");
- }
-
- /* Enable the debug interface */
- ret = dbgp_control_msg(USB_DEBUG_DEVNUM,
- USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
- USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0);
- if (ret < 0) {
- dbgp_printk(" Could not enable the debug device\n");
- goto err;
- }
- dbgp_printk("debug interface enabled\n");
-
- /* Perform a small write to get the even/odd data state in sync
- */
- ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1);
- if (ret < 0) {
- dbgp_printk("dbgp_bulk_write failed: %d\n", ret);
- goto err;
- }
- dbgp_printk("small write doned\n");
-
- return 0;
-err:
- /* Things didn't work so remove my claim */
- ctrl = readl(&ehci_debug->control);
- ctrl &= ~(DBGP_CLAIM | DBGP_OUT);
- writel(ctrl, &ehci_debug->control);
- return -1;
-
-next_debug_port:
- port_map_tried |= (1<<(debug_port - 1));
- new_debug_port = ((debug_port-1+1)%n_ports) + 1;
- if (port_map_tried != ((1<<n_ports) - 1)) {
- set_debug_port(new_debug_port);
- goto try_next_port;
- }
- if (--playtimes) {
- set_debug_port(new_debug_port);
- goto try_next_time;
- }
-
- return -1;
-}
-
-static int __init early_dbgp_init(char *s)
-{
- u32 debug_port, bar, offset;
- u32 bus, slot, func, cap;
- void __iomem *ehci_bar;
- u32 dbgp_num;
- u32 bar_val;
- char *e;
- int ret;
- u8 byte;
-
- if (!early_pci_allowed())
- return -1;
-
- dbgp_num = 0;
- if (*s)
- dbgp_num = simple_strtoul(s, &e, 10);
- dbgp_printk("dbgp_num: %d\n", dbgp_num);
-
- cap = find_dbgp(dbgp_num, &bus, &slot, &func);
- if (!cap)
- return -1;
-
- dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot,
- func);
-
- debug_port = read_pci_config(bus, slot, func, cap);
- bar = (debug_port >> 29) & 0x7;
- bar = (bar * 4) + 0xc;
- offset = (debug_port >> 16) & 0xfff;
- dbgp_printk("bar: %02x offset: %03x\n", bar, offset);
- if (bar != PCI_BASE_ADDRESS_0) {
- dbgp_printk("only debug ports on bar 1 handled.\n");
-
- return -1;
- }
-
- bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
- dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset);
- if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) {
- dbgp_printk("only simple 32bit mmio bars supported\n");
-
- return -1;
- }
-
- /* double check if the mem space is enabled */
- byte = read_pci_config_byte(bus, slot, func, 0x04);
- if (!(byte & 0x2)) {
- byte |= 0x02;
- write_pci_config_byte(bus, slot, func, 0x04, byte);
- dbgp_printk("mmio for ehci enabled\n");
- }
-
- /*
- * FIXME I don't have the bar size so just guess PAGE_SIZE is more
- * than enough. 1K is the biggest I have seen.
- */
- set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK);
- ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE);
- ehci_bar += bar_val & ~PAGE_MASK;
- dbgp_printk("ehci_bar: %p\n", ehci_bar);
-
- ehci_caps = ehci_bar;
- ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase));
- ehci_debug = ehci_bar + offset;
- ehci_dev.bus = bus;
- ehci_dev.slot = slot;
- ehci_dev.func = func;
-
- detect_set_debug_port();
-
- ret = ehci_setup();
- if (ret < 0) {
- dbgp_printk("ehci_setup failed\n");
- ehci_debug = NULL;
-
- return -1;
- }
-
- return 0;
-}
-
-static void early_dbgp_write(struct console *con, const char *str, u32 n)
-{
- int chunk, ret;
-
- if (!ehci_debug)
- return;
- while (n > 0) {
- chunk = n;
- if (chunk > DBGP_MAX_PACKET)
- chunk = DBGP_MAX_PACKET;
- ret = dbgp_bulk_write(USB_DEBUG_DEVNUM,
- dbgp_endpoint_out, str, chunk);
- str += chunk;
- n -= chunk;
- }
-}
-
-static struct console early_dbgp_console = {
- .name = "earlydbg",
- .write = early_dbgp_write,
- .flags = CON_PRINTBUFFER,
- .index = -1,
-};
-#endif
-
/* Direct interface for emergencies */
static struct console *early_console = &early_vga_console;
static int __initdata early_console_initialized;
@@ -891,10 +176,24 @@ asmlinkage void early_printk(const char *fmt, ...)
va_end(ap);
}
+static inline void early_console_register(struct console *con, int keep_early)
+{
+ if (early_console->index != -1) {
+ printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n",
+ con->name);
+ return;
+ }
+ early_console = con;
+ if (keep_early)
+ early_console->flags &= ~CON_BOOT;
+ else
+ early_console->flags |= CON_BOOT;
+ register_console(early_console);
+}
static int __init setup_early_printk(char *buf)
{
- int keep_early;
+ int keep;
if (!buf)
return 0;
@@ -903,42 +202,37 @@ static int __init setup_early_printk(char *buf)
return 0;
early_console_initialized = 1;
- keep_early = (strstr(buf, "keep") != NULL);
-
- if (!strncmp(buf, "serial", 6)) {
- early_serial_init(buf + 6);
- early_console = &early_serial_console;
- } else if (!strncmp(buf, "ttyS", 4)) {
- early_serial_init(buf);
- early_console = &early_serial_console;
- } else if (!strncmp(buf, "vga", 3)
- && boot_params.screen_info.orig_video_isVGA == 1) {
- max_xpos = boot_params.screen_info.orig_video_cols;
- max_ypos = boot_params.screen_info.orig_video_lines;
- current_ypos = boot_params.screen_info.orig_y;
- early_console = &early_vga_console;
+ keep = (strstr(buf, "keep") != NULL);
+
+ while (*buf != '\0') {
+ if (!strncmp(buf, "serial", 6)) {
+ buf += 6;
+ early_serial_init(buf);
+ early_console_register(&early_serial_console, keep);
+ if (!strncmp(buf, ",ttyS", 5))
+ buf += 5;
+ }
+ if (!strncmp(buf, "ttyS", 4)) {
+ early_serial_init(buf + 4);
+ early_console_register(&early_serial_console, keep);
+ }
+ if (!strncmp(buf, "vga", 3) &&
+ boot_params.screen_info.orig_video_isVGA == 1) {
+ max_xpos = boot_params.screen_info.orig_video_cols;
+ max_ypos = boot_params.screen_info.orig_video_lines;
+ current_ypos = boot_params.screen_info.orig_y;
+ early_console_register(&early_vga_console, keep);
+ }
#ifdef CONFIG_EARLY_PRINTK_DBGP
- } else if (!strncmp(buf, "dbgp", 4)) {
- if (early_dbgp_init(buf+4) < 0)
- return 0;
- early_console = &early_dbgp_console;
- /*
- * usb subsys will reset ehci controller, so don't keep
- * that early console
- */
- keep_early = 0;
+ if (!strncmp(buf, "dbgp", 4) && !early_dbgp_init(buf + 4))
+ early_console_register(&early_dbgp_console, keep);
#endif
#ifdef CONFIG_HVC_XEN
- } else if (!strncmp(buf, "xen", 3)) {
- early_console = &xenboot_console;
+ if (!strncmp(buf, "xen", 3))
+ early_console_register(&xenboot_console, keep);
#endif
+ buf++;
}
-
- if (keep_early)
- early_console->flags &= ~CON_BOOT;
- else
- early_console->flags |= CON_BOOT;
- register_console(early_console);
return 0;
}
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index fe26ba3e345..cdcfb122f25 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -42,6 +42,7 @@
#include <asm/time.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
+#include <asm/x86_init.h>
#define EFI_DEBUG 1
#define PFX "EFI: "
@@ -453,6 +454,11 @@ void __init efi_init(void)
if (add_efi_memmap)
do_add_efi_memmap();
+#ifdef CONFIG_X86_32
+ x86_platform.get_wallclock = efi_get_time;
+ x86_platform.set_wallclock = efi_set_rtc_mmss;
+#endif
+
/* Setup for EFI runtime service */
reboot_type = BOOT_EFI;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c097e7d607c..50b9c220e12 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -334,6 +334,10 @@ ENTRY(ret_from_fork)
END(ret_from_fork)
/*
+ * Interrupt exit functions should be protected against kprobes
+ */
+ .pushsection .kprobes.text, "ax"
+/*
* Return to user mode is not as complex as all this looks,
* but we want the default path for a system call return to
* go as quickly as possible which is why some of this is
@@ -383,6 +387,10 @@ need_resched:
END(resume_kernel)
#endif
CFI_ENDPROC
+/*
+ * End of kprobes section
+ */
+ .popsection
/* SYSENTER_RETURN points to after the "sysenter" instruction in
the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
@@ -513,6 +521,10 @@ sysexit_audit:
PTGS_TO_GS_EX
ENDPROC(ia32_sysenter_target)
+/*
+ * syscall stub including irq exit should be protected against kprobes
+ */
+ .pushsection .kprobes.text, "ax"
# system call handler stub
ENTRY(system_call)
RING0_INT_FRAME # can't unwind into user space anyway
@@ -705,6 +717,10 @@ syscall_badsys:
jmp resume_userspace
END(syscall_badsys)
CFI_ENDPROC
+/*
+ * End of kprobes section
+ */
+ .popsection
/*
* System calls that need a pt_regs pointer.
@@ -814,6 +830,10 @@ common_interrupt:
ENDPROC(common_interrupt)
CFI_ENDPROC
+/*
+ * Irq entries should be protected against kprobes
+ */
+ .pushsection .kprobes.text, "ax"
#define BUILD_INTERRUPT3(name, nr, fn) \
ENTRY(name) \
RING0_INT_FRAME; \
@@ -980,6 +1000,10 @@ ENTRY(spurious_interrupt_bug)
jmp error_code
CFI_ENDPROC
END(spurious_interrupt_bug)
+/*
+ * End of kprobes section
+ */
+ .popsection
ENTRY(kernel_thread_helper)
pushl $0 # fake return address for unwinder
@@ -1185,17 +1209,14 @@ END(ftrace_graph_caller)
.globl return_to_handler
return_to_handler:
- pushl $0
pushl %eax
- pushl %ecx
pushl %edx
movl %ebp, %eax
call ftrace_return_to_handler
- movl %eax, 0xc(%esp)
+ movl %eax, %ecx
popl %edx
- popl %ecx
popl %eax
- ret
+ jmp *%ecx
#endif
.section .rodata,"a"
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c251be74510..4deb8fc849d 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -146,7 +146,7 @@ ENTRY(ftrace_graph_caller)
END(ftrace_graph_caller)
GLOBAL(return_to_handler)
- subq $80, %rsp
+ subq $24, %rsp
/* Save the return values */
movq %rax, (%rsp)
@@ -155,11 +155,11 @@ GLOBAL(return_to_handler)
call ftrace_return_to_handler
- movq %rax, 72(%rsp)
+ movq %rax, %rdi
movq 8(%rsp), %rdx
movq (%rsp), %rax
- addq $72, %rsp
- retq
+ addq $24, %rsp
+ jmp *%rdi
#endif
@@ -536,20 +536,13 @@ sysret_signal:
bt $TIF_SYSCALL_AUDIT,%edx
jc sysret_audit
#endif
- /* edx: work flags (arg3) */
- leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
- xorl %esi,%esi # oldset -> arg2
- SAVE_REST
- FIXUP_TOP_OF_STACK %r11
- call do_notify_resume
- RESTORE_TOP_OF_STACK %r11
- RESTORE_REST
- movl $_TIF_WORK_MASK,%edi
- /* Use IRET because user could have changed frame. This
- works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- jmp int_with_check
+ /*
+ * We have a signal, or exit tracing or single-step.
+ * These all wind up with the iret return path anyway,
+ * so just join that path right now.
+ */
+ FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
+ jmp int_check_syscall_exit_work
badsys:
movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
@@ -654,6 +647,7 @@ int_careful:
int_very_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
+int_check_syscall_exit_work:
SAVE_REST
/* Check for syscall exit trace */
testl $_TIF_WORK_SYSCALL_EXIT,%edx
@@ -809,6 +803,10 @@ END(interrupt)
call \func
.endm
+/*
+ * Interrupt entry/exit should be protected against kprobes
+ */
+ .pushsection .kprobes.text, "ax"
/*
* The interrupt stubs push (~vector+0x80) onto the stack and
* then jump to common_interrupt.
@@ -947,6 +945,10 @@ ENTRY(retint_kernel)
CFI_ENDPROC
END(common_interrupt)
+/*
+ * End of kprobes section
+ */
+ .popsection
/*
* APIC interrupts.
@@ -1021,7 +1023,7 @@ apicinterrupt ERROR_APIC_VECTOR \
apicinterrupt SPURIOUS_APIC_VECTOR \
spurious_interrupt smp_spurious_interrupt
-#ifdef CONFIG_PERF_COUNTERS
+#ifdef CONFIG_PERF_EVENTS
apicinterrupt LOCAL_PENDING_VECTOR \
perf_pending_interrupt smp_perf_pending_interrupt
#endif
@@ -1497,12 +1499,17 @@ error_kernelspace:
leaq irq_return(%rip),%rcx
cmpq %rcx,RIP+8(%rsp)
je error_swapgs
- movl %ecx,%ecx /* zero extend */
- cmpq %rcx,RIP+8(%rsp)
- je error_swapgs
+ movl %ecx,%eax /* zero extend */
+ cmpq %rax,RIP+8(%rsp)
+ je bstep_iret
cmpq $gs_change,RIP+8(%rsp)
je error_swapgs
jmp error_sti
+
+bstep_iret:
+ /* Fix truncated RIP */
+ movq %rcx,RIP+8(%rsp)
+ jmp error_swapgs
END(error_entry)
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 9dbb527e165..5a1b9758fd6 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -9,6 +9,8 @@
* the dangers of modifying code on the run.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/spinlock.h>
#include <linux/hardirq.h>
#include <linux/uaccess.h>
@@ -336,15 +338,15 @@ int __init ftrace_dyn_arch_init(void *data)
switch (faulted) {
case 0:
- pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n");
+ pr_info("converting mcount calls to 0f 1f 44 00 00\n");
memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE);
break;
case 1:
- pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n");
+ pr_info("converting mcount calls to 66 66 66 66 90\n");
memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE);
break;
case 2:
- pr_info("ftrace: converting mcount calls to jmp . + 5\n");
+ pr_info("converting mcount calls to jmp . + 5\n");
memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE);
break;
}
@@ -468,82 +470,10 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
#ifdef CONFIG_FTRACE_SYSCALLS
-extern unsigned long __start_syscalls_metadata[];
-extern unsigned long __stop_syscalls_metadata[];
extern unsigned long *sys_call_table;
-static struct syscall_metadata **syscalls_metadata;
-
-static struct syscall_metadata *find_syscall_meta(unsigned long *syscall)
-{
- struct syscall_metadata *start;
- struct syscall_metadata *stop;
- char str[KSYM_SYMBOL_LEN];
-
-
- start = (struct syscall_metadata *)__start_syscalls_metadata;
- stop = (struct syscall_metadata *)__stop_syscalls_metadata;
- kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str);
-
- for ( ; start < stop; start++) {
- if (start->name && !strcmp(start->name, str))
- return start;
- }
- return NULL;
-}
-
-struct syscall_metadata *syscall_nr_to_meta(int nr)
-{
- if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
- return NULL;
-
- return syscalls_metadata[nr];
-}
-
-int syscall_name_to_nr(char *name)
+unsigned long __init arch_syscall_addr(int nr)
{
- int i;
-
- if (!syscalls_metadata)
- return -1;
-
- for (i = 0; i < NR_syscalls; i++) {
- if (syscalls_metadata[i]) {
- if (!strcmp(syscalls_metadata[i]->name, name))
- return i;
- }
- }
- return -1;
-}
-
-void set_syscall_enter_id(int num, int id)
-{
- syscalls_metadata[num]->enter_id = id;
-}
-
-void set_syscall_exit_id(int num, int id)
-{
- syscalls_metadata[num]->exit_id = id;
-}
-
-static int __init arch_init_ftrace_syscalls(void)
-{
- int i;
- struct syscall_metadata *meta;
- unsigned long **psys_syscall_table = &sys_call_table;
-
- syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
- NR_syscalls, GFP_KERNEL);
- if (!syscalls_metadata) {
- WARN_ON(1);
- return -ENOMEM;
- }
-
- for (i = 0; i < NR_syscalls; i++) {
- meta = find_syscall_meta(psys_syscall_table[i]);
- syscalls_metadata[i] = meta;
- }
- return 0;
+ return (unsigned long)(&sys_call_table)[nr];
}
-arch_initcall(arch_init_ftrace_syscalls);
#endif
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 3f8579f8d42..4f8e2507e8f 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -11,8 +11,21 @@
#include <asm/setup.h>
#include <asm/sections.h>
#include <asm/e820.h>
-#include <asm/bios_ebda.h>
+#include <asm/page.h>
#include <asm/trampoline.h>
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/bios_ebda.h>
+
+static void __init i386_default_early_setup(void)
+{
+ /* Initilize 32bit specific setup functions */
+ x86_init.resources.probe_roms = probe_roms;
+ x86_init.resources.reserve_resources = i386_reserve_resources;
+ x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
+
+ reserve_ebda_region();
+}
void __init i386_start_kernel(void)
{
@@ -29,7 +42,16 @@ void __init i386_start_kernel(void)
reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
}
#endif
- reserve_ebda_region();
+
+ /* Call the subarch specific early setup function */
+ switch (boot_params.hdr.hardware_subarch) {
+ case X86_SUBARCH_MRST:
+ x86_mrst_early_setup();
+ break;
+ default:
+ i386_default_early_setup();
+ break;
+ }
/*
* At this point everything still needed from the boot loader
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 70eaa852c73..0b06cd778fd 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -23,8 +23,8 @@
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/e820.h>
-#include <asm/bios_ebda.h>
#include <asm/trampoline.h>
+#include <asm/bios_ebda.h>
static void __init zap_identity_mappings(void)
{
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 7ffec6b3b33..050c278481b 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -79,7 +79,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE)
* any particular GDT layout, because we load our own as soon as we
* can.
*/
-.section .text.head,"ax",@progbits
+__HEAD
ENTRY(startup_32)
/* test KEEP_SEGMENTS flag to see if the bootloader is asking
us to not reload segments */
@@ -157,6 +157,7 @@ subarch_entries:
.long default_entry /* normal x86/PC */
.long lguest_entry /* lguest hypervisor */
.long xen_entry /* Xen hypervisor */
+ .long default_entry /* Moorestown MID */
num_subarch_entries = (. - subarch_entries) / 4
.previous
#endif /* CONFIG_PARAVIRT */
@@ -607,7 +608,7 @@ ENTRY(initial_code)
/*
* BSS section
*/
-.section ".bss.page_aligned","wa"
+__PAGE_ALIGNED_BSS
.align PAGE_SIZE_asm
#ifdef CONFIG_X86_PAE
swapper_pg_pmd:
@@ -625,7 +626,7 @@ ENTRY(empty_zero_page)
* This starts the data section.
*/
#ifdef CONFIG_X86_PAE
-.section ".data.page_aligned","wa"
+__PAGE_ALIGNED_DATA
/* Page-aligned for the benefit of paravirt? */
.align PAGE_SIZE_asm
ENTRY(swapper_pg_dir)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index fa54f78e2a0..22db86a3764 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -40,7 +40,7 @@ L4_START_KERNEL = pgd_index(__START_KERNEL_map)
L3_START_KERNEL = pud_index(__START_KERNEL_map)
.text
- .section .text.head
+ __HEAD
.code64
.globl startup_64
startup_64:
@@ -212,8 +212,8 @@ ENTRY(secondary_startup_64)
*/
lgdt early_gdt_descr(%rip)
- /* set up data segments. actually 0 would do too */
- movl $__KERNEL_DS,%eax
+ /* set up data segments */
+ xorl %eax,%eax
movl %eax,%ds
movl %eax,%ss
movl %eax,%es
@@ -418,7 +418,7 @@ ENTRY(phys_base)
ENTRY(idt_table)
.skip IDT_ENTRIES * 16
- .section .bss.page_aligned, "aw", @nobits
+ __PAGE_ALIGNED_BSS
.align PAGE_SIZE
ENTRY(empty_zero_page)
.skip PAGE_SIZE
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
new file mode 100644
index 00000000000..d42f65ac492
--- /dev/null
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -0,0 +1,555 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) 2009 IBM Corporation
+ * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ * Authors: Alan Stern <stern@rowland.harvard.edu>
+ * K.Prasad <prasad@linux.vnet.ibm.com>
+ * Frederic Weisbecker <fweisbec@gmail.com>
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/irqflags.h>
+#include <linux/notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/percpu.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+#include <asm/debugreg.h>
+
+/* Per cpu debug control register value */
+DEFINE_PER_CPU(unsigned long, cpu_dr7);
+EXPORT_PER_CPU_SYMBOL(cpu_dr7);
+
+/* Per cpu debug address registers values */
+static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]);
+
+/*
+ * Stores the breakpoints currently in use on each breakpoint address
+ * register for each cpus
+ */
+static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]);
+
+
+static inline unsigned long
+__encode_dr7(int drnum, unsigned int len, unsigned int type)
+{
+ unsigned long bp_info;
+
+ bp_info = (len | type) & 0xf;
+ bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
+ bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE));
+
+ return bp_info;
+}
+
+/*
+ * Encode the length, type, Exact, and Enable bits for a particular breakpoint
+ * as stored in debug register 7.
+ */
+unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
+{
+ return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN;
+}
+
+/*
+ * Decode the length and type bits for a particular breakpoint as
+ * stored in debug register 7. Return the "enabled" status.
+ */
+int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type)
+{
+ int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
+
+ *len = (bp_info & 0xc) | 0x40;
+ *type = (bp_info & 0x3) | 0x80;
+
+ return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
+}
+
+/*
+ * Install a perf counter breakpoint.
+ *
+ * We seek a free debug address register and use it for this
+ * breakpoint. Eventually we enable it in the debug control register.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
+ */
+int arch_install_hw_breakpoint(struct perf_event *bp)
+{
+ struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+ unsigned long *dr7;
+ int i;
+
+ for (i = 0; i < HBP_NUM; i++) {
+ struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
+
+ if (!*slot) {
+ *slot = bp;
+ break;
+ }
+ }
+
+ if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
+ return -EBUSY;
+
+ set_debugreg(info->address, i);
+ __get_cpu_var(cpu_debugreg[i]) = info->address;
+
+ dr7 = &__get_cpu_var(cpu_dr7);
+ *dr7 |= encode_dr7(i, info->len, info->type);
+
+ set_debugreg(*dr7, 7);
+
+ return 0;
+}
+
+/*
+ * Uninstall the breakpoint contained in the given counter.
+ *
+ * First we search the debug address register it uses and then we disable
+ * it.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
+ */
+void arch_uninstall_hw_breakpoint(struct perf_event *bp)
+{
+ struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+ unsigned long *dr7;
+ int i;
+
+ for (i = 0; i < HBP_NUM; i++) {
+ struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
+
+ if (*slot == bp) {
+ *slot = NULL;
+ break;
+ }
+ }
+
+ if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
+ return;
+
+ dr7 = &__get_cpu_var(cpu_dr7);
+ *dr7 &= ~__encode_dr7(i, info->len, info->type);
+
+ set_debugreg(*dr7, 7);
+}
+
+static int get_hbp_len(u8 hbp_len)
+{
+ unsigned int len_in_bytes = 0;
+
+ switch (hbp_len) {
+ case X86_BREAKPOINT_LEN_1:
+ len_in_bytes = 1;
+ break;
+ case X86_BREAKPOINT_LEN_2:
+ len_in_bytes = 2;
+ break;
+ case X86_BREAKPOINT_LEN_4:
+ len_in_bytes = 4;
+ break;
+#ifdef CONFIG_X86_64
+ case X86_BREAKPOINT_LEN_8:
+ len_in_bytes = 8;
+ break;
+#endif
+ }
+ return len_in_bytes;
+}
+
+/*
+ * Check for virtual address in user space.
+ */
+int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
+{
+ unsigned int len;
+
+ len = get_hbp_len(hbp_len);
+
+ return (va <= TASK_SIZE - len);
+}
+
+/*
+ * Check for virtual address in kernel space.
+ */
+static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
+{
+ unsigned int len;
+
+ len = get_hbp_len(hbp_len);
+
+ return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
+}
+
+/*
+ * Store a breakpoint's encoded address, length, and type.
+ */
+static int arch_store_info(struct perf_event *bp)
+{
+ struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+ /*
+ * For kernel-addresses, either the address or symbol name can be
+ * specified.
+ */
+ if (info->name)
+ info->address = (unsigned long)
+ kallsyms_lookup_name(info->name);
+ if (info->address)
+ return 0;
+
+ return -EINVAL;
+}
+
+int arch_bp_generic_fields(int x86_len, int x86_type,
+ int *gen_len, int *gen_type)
+{
+ /* Len */
+ switch (x86_len) {
+ case X86_BREAKPOINT_LEN_1:
+ *gen_len = HW_BREAKPOINT_LEN_1;
+ break;
+ case X86_BREAKPOINT_LEN_2:
+ *gen_len = HW_BREAKPOINT_LEN_2;
+ break;
+ case X86_BREAKPOINT_LEN_4:
+ *gen_len = HW_BREAKPOINT_LEN_4;
+ break;
+#ifdef CONFIG_X86_64
+ case X86_BREAKPOINT_LEN_8:
+ *gen_len = HW_BREAKPOINT_LEN_8;
+ break;
+#endif
+ default:
+ return -EINVAL;
+ }
+
+ /* Type */
+ switch (x86_type) {
+ case X86_BREAKPOINT_EXECUTE:
+ *gen_type = HW_BREAKPOINT_X;
+ break;
+ case X86_BREAKPOINT_WRITE:
+ *gen_type = HW_BREAKPOINT_W;
+ break;
+ case X86_BREAKPOINT_RW:
+ *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+
+static int arch_build_bp_info(struct perf_event *bp)
+{
+ struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+
+ info->address = bp->attr.bp_addr;
+
+ /* Len */
+ switch (bp->attr.bp_len) {
+ case HW_BREAKPOINT_LEN_1:
+ info->len = X86_BREAKPOINT_LEN_1;
+ break;
+ case HW_BREAKPOINT_LEN_2:
+ info->len = X86_BREAKPOINT_LEN_2;
+ break;
+ case HW_BREAKPOINT_LEN_4:
+ info->len = X86_BREAKPOINT_LEN_4;
+ break;
+#ifdef CONFIG_X86_64
+ case HW_BREAKPOINT_LEN_8:
+ info->len = X86_BREAKPOINT_LEN_8;
+ break;
+#endif
+ default:
+ return -EINVAL;
+ }
+
+ /* Type */
+ switch (bp->attr.bp_type) {
+ case HW_BREAKPOINT_W:
+ info->type = X86_BREAKPOINT_WRITE;
+ break;
+ case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
+ info->type = X86_BREAKPOINT_RW;
+ break;
+ case HW_BREAKPOINT_X:
+ info->type = X86_BREAKPOINT_EXECUTE;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+/*
+ * Validate the arch-specific HW Breakpoint register settings
+ */
+int arch_validate_hwbkpt_settings(struct perf_event *bp,
+ struct task_struct *tsk)
+{
+ struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+ unsigned int align;
+ int ret;
+
+
+ ret = arch_build_bp_info(bp);
+ if (ret)
+ return ret;
+
+ ret = -EINVAL;
+
+ if (info->type == X86_BREAKPOINT_EXECUTE)
+ /*
+ * Ptrace-refactoring code
+ * For now, we'll allow instruction breakpoint only for user-space
+ * addresses
+ */
+ if ((!arch_check_va_in_userspace(info->address, info->len)) &&
+ info->len != X86_BREAKPOINT_EXECUTE)
+ return ret;
+
+ switch (info->len) {
+ case X86_BREAKPOINT_LEN_1:
+ align = 0;
+ break;
+ case X86_BREAKPOINT_LEN_2:
+ align = 1;
+ break;
+ case X86_BREAKPOINT_LEN_4:
+ align = 3;
+ break;
+#ifdef CONFIG_X86_64
+ case X86_BREAKPOINT_LEN_8:
+ align = 7;
+ break;
+#endif
+ default:
+ return ret;
+ }
+
+ if (bp->callback)
+ ret = arch_store_info(bp);
+
+ if (ret < 0)
+ return ret;
+ /*
+ * Check that the low-order bits of the address are appropriate
+ * for the alignment implied by len.
+ */
+ if (info->address & align)
+ return -EINVAL;
+
+ /* Check that the virtual address is in the proper range */
+ if (tsk) {
+ if (!arch_check_va_in_userspace(info->address, info->len))
+ return -EFAULT;
+ } else {
+ if (!arch_check_va_in_kernelspace(info->address, info->len))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/*
+ * Dump the debug register contents to the user.
+ * We can't dump our per cpu values because it
+ * may contain cpu wide breakpoint, something that
+ * doesn't belong to the current task.
+ *
+ * TODO: include non-ptrace user breakpoints (perf)
+ */
+void aout_dump_debugregs(struct user *dump)
+{
+ int i;
+ int dr7 = 0;
+ struct perf_event *bp;
+ struct arch_hw_breakpoint *info;
+ struct thread_struct *thread = &current->thread;
+
+ for (i = 0; i < HBP_NUM; i++) {
+ bp = thread->ptrace_bps[i];
+
+ if (bp && !bp->attr.disabled) {
+ dump->u_debugreg[i] = bp->attr.bp_addr;
+ info = counter_arch_bp(bp);
+ dr7 |= encode_dr7(i, info->len, info->type);
+ } else {
+ dump->u_debugreg[i] = 0;
+ }
+ }
+
+ dump->u_debugreg[4] = 0;
+ dump->u_debugreg[5] = 0;
+ dump->u_debugreg[6] = current->thread.debugreg6;
+
+ dump->u_debugreg[7] = dr7;
+}
+EXPORT_SYMBOL_GPL(aout_dump_debugregs);
+
+/*
+ * Release the user breakpoints used by ptrace
+ */
+void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
+{
+ int i;
+ struct thread_struct *t = &tsk->thread;
+
+ for (i = 0; i < HBP_NUM; i++) {
+ unregister_hw_breakpoint(t->ptrace_bps[i]);
+ t->ptrace_bps[i] = NULL;
+ }
+}
+
+void hw_breakpoint_restore(void)
+{
+ set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0);
+ set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1);
+ set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2);
+ set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3);
+ set_debugreg(current->thread.debugreg6, 6);
+ set_debugreg(__get_cpu_var(cpu_dr7), 7);
+}
+EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
+
+/*
+ * Handle debug exception notifications.
+ *
+ * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below.
+ *
+ * NOTIFY_DONE returned if one of the following conditions is true.
+ * i) When the causative address is from user-space and the exception
+ * is a valid one, i.e. not triggered as a result of lazy debug register
+ * switching
+ * ii) When there are more bits than trap<n> set in DR6 register (such
+ * as BD, BS or BT) indicating that more than one debug condition is
+ * met and requires some more action in do_debug().
+ *
+ * NOTIFY_STOP returned for all other cases
+ *
+ */
+static int __kprobes hw_breakpoint_handler(struct die_args *args)
+{
+ int i, cpu, rc = NOTIFY_STOP;
+ struct perf_event *bp;
+ unsigned long dr7, dr6;
+ unsigned long *dr6_p;
+
+ /* The DR6 value is pointed by args->err */
+ dr6_p = (unsigned long *)ERR_PTR(args->err);
+ dr6 = *dr6_p;
+
+ /* Do an early return if no trap bits are set in DR6 */
+ if ((dr6 & DR_TRAP_BITS) == 0)
+ return NOTIFY_DONE;
+
+ get_debugreg(dr7, 7);
+ /* Disable breakpoints during exception handling */
+ set_debugreg(0UL, 7);
+ /*
+ * Assert that local interrupts are disabled
+ * Reset the DRn bits in the virtualized register value.
+ * The ptrace trigger routine will add in whatever is needed.
+ */
+ current->thread.debugreg6 &= ~DR_TRAP_BITS;
+ cpu = get_cpu();
+
+ /* Handle all the breakpoints that were triggered */
+ for (i = 0; i < HBP_NUM; ++i) {
+ if (likely(!(dr6 & (DR_TRAP0 << i))))
+ continue;
+
+ /*
+ * The counter may be concurrently released but that can only
+ * occur from a call_rcu() path. We can then safely fetch
+ * the breakpoint, use its callback, touch its counter
+ * while we are in an rcu_read_lock() path.
+ */
+ rcu_read_lock();
+
+ bp = per_cpu(bp_per_reg[i], cpu);
+ if (bp)
+ rc = NOTIFY_DONE;
+ /*
+ * Reset the 'i'th TRAP bit in dr6 to denote completion of
+ * exception handling
+ */
+ (*dr6_p) &= ~(DR_TRAP0 << i);
+ /*
+ * bp can be NULL due to lazy debug register switching
+ * or due to concurrent perf counter removing.
+ */
+ if (!bp) {
+ rcu_read_unlock();
+ break;
+ }
+
+ (bp->callback)(bp, args->regs);
+
+ rcu_read_unlock();
+ }
+ if (dr6 & (~DR_TRAP_BITS))
+ rc = NOTIFY_DONE;
+
+ set_debugreg(dr7, 7);
+ put_cpu();
+
+ return rc;
+}
+
+/*
+ * Handle debug exception notifications.
+ */
+int __kprobes hw_breakpoint_exceptions_notify(
+ struct notifier_block *unused, unsigned long val, void *data)
+{
+ if (val != DIE_DEBUG)
+ return NOTIFY_DONE;
+
+ return hw_breakpoint_handler(data);
+}
+
+void hw_breakpoint_pmu_read(struct perf_event *bp)
+{
+ /* TODO */
+}
+
+void hw_breakpoint_pmu_unthrottle(struct perf_event *bp)
+{
+ /* TODO */
+}
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 43cec6bdda6..9c3bd4a2050 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -10,6 +10,16 @@
EXPORT_SYMBOL(mcount);
#endif
+/*
+ * Note, this is a prototype to get at the symbol for
+ * the export, but dont use it from C code, it is used
+ * by assembly code and is not using C calling convention!
+ */
+#ifndef CONFIG_X86_CMPXCHG64
+extern void cmpxchg8b_emu(void);
+EXPORT_SYMBOL(cmpxchg8b_emu);
+#endif
+
/* Networking helper routines. */
EXPORT_SYMBOL(csum_partial_copy_generic);
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 5cf36c053ac..23c167925a5 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -19,12 +19,6 @@
DEFINE_SPINLOCK(i8253_lock);
EXPORT_SYMBOL(i8253_lock);
-#ifdef CONFIG_X86_32
-static void pit_disable_clocksource(void);
-#else
-static inline void pit_disable_clocksource(void) { }
-#endif
-
/*
* HPET replaces the PIT, when enabled. So we need to know, which of
* the two timers is used
@@ -57,12 +51,10 @@ static void init_pit_timer(enum clock_event_mode mode,
outb_pit(0, PIT_CH0);
outb_pit(0, PIT_CH0);
}
- pit_disable_clocksource();
break;
case CLOCK_EVT_MODE_ONESHOT:
/* One shot setup */
- pit_disable_clocksource();
outb_pit(0x38, PIT_MODE);
break;
@@ -200,17 +192,6 @@ static struct clocksource pit_cs = {
.shift = 20,
};
-static void pit_disable_clocksource(void)
-{
- /*
- * Use mult to check whether it is registered or not
- */
- if (pit_cs.mult) {
- clocksource_unregister(&pit_cs);
- pit_cs.mult = 0;
- }
-}
-
static int __init init_pit_clocksource(void)
{
/*
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index 270ff83efc1..3a54dcb9cd0 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -20,9 +20,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
* way process stacks are handled. This is done by having a special
* "init_task" linker map entry..
*/
-union thread_union init_thread_union
- __attribute__((__section__(".data.init_task"))) =
- { INIT_THREAD_INFO(init_task) };
+union thread_union init_thread_union __init_task_data =
+ { INIT_THREAD_INFO(init_task) };
/*
* Initial task structure.
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index b0cdde6932f..fee6cc2b207 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -63,10 +63,10 @@ static int show_other_interrupts(struct seq_file *p, int prec)
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
seq_printf(p, " Spurious interrupts\n");
- seq_printf(p, "%*s: ", prec, "CNT");
+ seq_printf(p, "%*s: ", prec, "PMI");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
- seq_printf(p, " Performance counter interrupts\n");
+ seq_printf(p, " Performance monitoring interrupts\n");
seq_printf(p, "%*s: ", prec, "PND");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
@@ -92,19 +92,19 @@ static int show_other_interrupts(struct seq_file *p, int prec)
seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
seq_printf(p, " TLB shootdowns\n");
#endif
-#ifdef CONFIG_X86_MCE
+#ifdef CONFIG_X86_THERMAL_VECTOR
seq_printf(p, "%*s: ", prec, "TRM");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
seq_printf(p, " Thermal event interrupts\n");
-# ifdef CONFIG_X86_MCE_THRESHOLD
+#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
seq_printf(p, "%*s: ", prec, "THR");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
seq_printf(p, " Threshold APIC interrupts\n");
-# endif
#endif
-#ifdef CONFIG_X86_NEW_MCE
+#ifdef CONFIG_X86_MCE
seq_printf(p, "%*s: ", prec, "MCE");
for_each_online_cpu(j)
seq_printf(p, "%10u ", per_cpu(mce_exception_count, j));
@@ -194,13 +194,13 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
sum += irq_stats(cpu)->irq_call_count;
sum += irq_stats(cpu)->irq_tlb_count;
#endif
-#ifdef CONFIG_X86_MCE
+#ifdef CONFIG_X86_THERMAL_VECTOR
sum += irq_stats(cpu)->irq_thermal_count;
-# ifdef CONFIG_X86_MCE_THRESHOLD
+#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
sum += irq_stats(cpu)->irq_threshold_count;
-# endif
#endif
-#ifdef CONFIG_X86_NEW_MCE
+#ifdef CONFIG_X86_MCE
sum += per_cpu(mce_exception_count, cpu);
sum += per_cpu(mce_poll_count, cpu);
#endif
@@ -274,3 +274,93 @@ void smp_generic_interrupt(struct pt_regs *regs)
}
EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
+
+#ifdef CONFIG_HOTPLUG_CPU
+/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
+void fixup_irqs(void)
+{
+ unsigned int irq, vector;
+ static int warned;
+ struct irq_desc *desc;
+
+ for_each_irq_desc(irq, desc) {
+ int break_affinity = 0;
+ int set_affinity = 1;
+ const struct cpumask *affinity;
+
+ if (!desc)
+ continue;
+ if (irq == 2)
+ continue;
+
+ /* interrupt's are disabled at this point */
+ spin_lock(&desc->lock);
+
+ affinity = desc->affinity;
+ if (!irq_has_action(irq) ||
+ cpumask_equal(affinity, cpu_online_mask)) {
+ spin_unlock(&desc->lock);
+ continue;
+ }
+
+ /*
+ * Complete the irq move. This cpu is going down and for
+ * non intr-remapping case, we can't wait till this interrupt
+ * arrives at this cpu before completing the irq move.
+ */
+ irq_force_complete_move(irq);
+
+ if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
+ break_affinity = 1;
+ affinity = cpu_all_mask;
+ }
+
+ if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask)
+ desc->chip->mask(irq);
+
+ if (desc->chip->set_affinity)
+ desc->chip->set_affinity(irq, affinity);
+ else if (!(warned++))
+ set_affinity = 0;
+
+ if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask)
+ desc->chip->unmask(irq);
+
+ spin_unlock(&desc->lock);
+
+ if (break_affinity && set_affinity)
+ printk("Broke affinity for irq %i\n", irq);
+ else if (!set_affinity)
+ printk("Cannot set affinity for irq %i\n", irq);
+ }
+
+ /*
+ * We can remove mdelay() and then send spuriuous interrupts to
+ * new cpu targets for all the irqs that were handled previously by
+ * this cpu. While it works, I have seen spurious interrupt messages
+ * (nothing wrong but still...).
+ *
+ * So for now, retain mdelay(1) and check the IRR and then send those
+ * interrupts to new targets as this cpu is already offlined...
+ */
+ mdelay(1);
+
+ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
+ unsigned int irr;
+
+ if (__get_cpu_var(vector_irq)[vector] < 0)
+ continue;
+
+ irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+ if (irr & (1 << (vector % 32))) {
+ irq = __get_cpu_var(vector_irq)[vector];
+
+ desc = irq_to_desc(irq);
+ spin_lock(&desc->lock);
+ if (desc->chip->retrigger)
+ desc->chip->retrigger(irq);
+ spin_unlock(&desc->lock);
+ }
+ }
+}
+#endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 7d35d0fe232..10709f29d16 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -211,48 +211,3 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
return true;
}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
-void fixup_irqs(void)
-{
- unsigned int irq;
- struct irq_desc *desc;
-
- for_each_irq_desc(irq, desc) {
- const struct cpumask *affinity;
-
- if (!desc)
- continue;
- if (irq == 2)
- continue;
-
- affinity = desc->affinity;
- if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
- printk("Breaking affinity for irq %i\n", irq);
- affinity = cpu_all_mask;
- }
- if (desc->chip->set_affinity)
- desc->chip->set_affinity(irq, affinity);
- else if (desc->action)
- printk_once("Cannot set affinity for irq %i\n", irq);
- }
-
-#if 0
- barrier();
- /* Ingo Molnar says: "after the IO-APIC masks have been redirected
- [note the nop - the interrupt-enable boundary on x86 is two
- instructions from sti] - to flush out pending hardirqs and
- IPIs. After this point nothing is supposed to reach this CPU." */
- __asm__ __volatile__("sti; nop; cli");
- barrier();
-#else
- /* That doesn't seem sufficient. Give it 1ms. */
- local_irq_enable();
- mdelay(1);
- local_irq_disable();
-#endif
-}
-#endif
-
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 977d8b43a0d..acf8fbf8fbd 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -62,64 +62,6 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
return true;
}
-#ifdef CONFIG_HOTPLUG_CPU
-/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
-void fixup_irqs(void)
-{
- unsigned int irq;
- static int warned;
- struct irq_desc *desc;
-
- for_each_irq_desc(irq, desc) {
- int break_affinity = 0;
- int set_affinity = 1;
- const struct cpumask *affinity;
-
- if (!desc)
- continue;
- if (irq == 2)
- continue;
-
- /* interrupt's are disabled at this point */
- spin_lock(&desc->lock);
-
- affinity = desc->affinity;
- if (!irq_has_action(irq) ||
- cpumask_equal(affinity, cpu_online_mask)) {
- spin_unlock(&desc->lock);
- continue;
- }
-
- if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
- break_affinity = 1;
- affinity = cpu_all_mask;
- }
-
- if (desc->chip->mask)
- desc->chip->mask(irq);
-
- if (desc->chip->set_affinity)
- desc->chip->set_affinity(irq, affinity);
- else if (!(warned++))
- set_affinity = 0;
-
- if (desc->chip->unmask)
- desc->chip->unmask(irq);
-
- spin_unlock(&desc->lock);
-
- if (break_affinity && set_affinity)
- printk("Broke affinity for irq %i\n", irq);
- else if (!set_affinity)
- printk("Cannot set affinity for irq %i\n", irq);
- }
-
- /* That doesn't seem sufficient. Give it 1ms. */
- local_irq_enable();
- mdelay(1);
- local_irq_disable();
-}
-#endif
extern void call_softirq(void);
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 92b7703d3d5..40f30773fb2 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -116,7 +116,7 @@ int vector_used_by_percpu_irq(unsigned int vector)
return 0;
}
-static void __init init_ISA_irqs(void)
+void __init init_ISA_irqs(void)
{
int i;
@@ -140,8 +140,10 @@ static void __init init_ISA_irqs(void)
}
}
-/* Overridden in paravirt.c */
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+void __init init_IRQ(void)
+{
+ x86_init.irqs.intr_init();
+}
static void __init smp_intr_init(void)
{
@@ -190,7 +192,7 @@ static void __init apic_intr_init(void)
#ifdef CONFIG_X86_MCE_THRESHOLD
alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
#endif
-#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC)
+#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC)
alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
#endif
@@ -206,39 +208,19 @@ static void __init apic_intr_init(void)
alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
/* Performance monitoring interrupts: */
-# ifdef CONFIG_PERF_COUNTERS
+# ifdef CONFIG_PERF_EVENTS
alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
# endif
#endif
}
-/**
- * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
- *
- * Description:
- * Perform any necessary interrupt initialisation prior to setting up
- * the "ordinary" interrupt call gates. For legacy reasons, the ISA
- * interrupts should be initialised here if the machine emulates a PC
- * in any way.
- **/
-static void __init x86_quirk_pre_intr_init(void)
-{
-#ifdef CONFIG_X86_32
- if (x86_quirks->arch_pre_intr_init) {
- if (x86_quirks->arch_pre_intr_init())
- return;
- }
-#endif
- init_ISA_irqs();
-}
-
void __init native_init_IRQ(void)
{
int i;
/* Execute any quirks before the call gates are initialised: */
- x86_quirk_pre_intr_init();
+ x86_init.irqs.pre_vector_init();
apic_intr_init();
@@ -258,12 +240,6 @@ void __init native_init_IRQ(void)
#ifdef CONFIG_X86_32
/*
- * Call quirks after call gates are initialised (usually add in
- * the architecture specific gates):
- */
- x86_quirk_intr_init();
-
- /*
* External FPU? Set up irq13 if so, for
* original braindamaged IBM FERR coupling.
*/
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 8d82a77a3f3..20a5b368946 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -43,6 +43,7 @@
#include <linux/smp.h>
#include <linux/nmi.h>
+#include <asm/debugreg.h>
#include <asm/apicdef.h>
#include <asm/system.h>
@@ -88,7 +89,6 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
gdb_regs[GDB_SS] = __KERNEL_DS;
gdb_regs[GDB_FS] = 0xFFFF;
gdb_regs[GDB_GS] = 0xFFFF;
- gdb_regs[GDB_SP] = (int)&regs->sp;
#else
gdb_regs[GDB_R8] = regs->r8;
gdb_regs[GDB_R9] = regs->r9;
@@ -101,8 +101,8 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
gdb_regs32[GDB_PS] = regs->flags;
gdb_regs32[GDB_CS] = regs->cs;
gdb_regs32[GDB_SS] = regs->ss;
- gdb_regs[GDB_SP] = regs->sp;
#endif
+ gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
}
/**
@@ -434,6 +434,11 @@ single_step_cont(struct pt_regs *regs, struct die_args *args)
"resuming...\n");
kgdb_arch_handle_exception(args->trapnr, args->signr,
args->err, "c", "", regs);
+ /*
+ * Reset the BS bit in dr6 (pointed by args->err) to
+ * denote completion of processing
+ */
+ (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
return NOTIFY_STOP;
}
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 7b5169d2b00..1f3186ce213 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -48,31 +48,22 @@
#include <linux/preempt.h>
#include <linux/module.h>
#include <linux/kdebug.h>
+#include <linux/kallsyms.h>
#include <asm/cacheflush.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/uaccess.h>
#include <asm/alternative.h>
+#include <asm/insn.h>
+#include <asm/debugreg.h>
void jprobe_return_end(void);
DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
-#ifdef CONFIG_X86_64
-#define stack_addr(regs) ((unsigned long *)regs->sp)
-#else
-/*
- * "&regs->sp" looks wrong, but it's correct for x86_32. x86_32 CPUs
- * don't save the ss and esp registers if the CPU is already in kernel
- * mode when it traps. So for kprobes, regs->sp and regs->ss are not
- * the [nonexistent] saved stack pointer and ss register, but rather
- * the top 8 bytes of the pre-int3 stack. So &regs->sp happens to
- * point to the top of the pre-int3 stack.
- */
-#define stack_addr(regs) ((unsigned long *)&regs->sp)
-#endif
+#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs))
#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
(((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
@@ -106,50 +97,6 @@ static const u32 twobyte_is_boostable[256 / 32] = {
/* ----------------------------------------------- */
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
};
-static const u32 onebyte_has_modrm[256 / 32] = {
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
- /* ----------------------------------------------- */
- W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */
- W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */
- W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */
- W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */
- W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
- W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
- W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */
- W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */
- W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
- W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */
- W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */
- W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */
- W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */
- W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
- W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */
- W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */
- /* ----------------------------------------------- */
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
-};
-static const u32 twobyte_has_modrm[256 / 32] = {
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
- /* ----------------------------------------------- */
- W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */
- W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */
- W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */
- W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */
- W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */
- W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */
- W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */
- W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */
- W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */
- W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */
- W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */
- W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */
- W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */
- W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */
- W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */
- W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */
- /* ----------------------------------------------- */
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
-};
#undef W
struct kretprobe_blackpoint kretprobe_blacklist[] = {
@@ -244,6 +191,75 @@ retry:
}
}
+/* Recover the probed instruction at addr for further analysis. */
+static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
+{
+ struct kprobe *kp;
+ kp = get_kprobe((void *)addr);
+ if (!kp)
+ return -EINVAL;
+
+ /*
+ * Basically, kp->ainsn.insn has an original instruction.
+ * However, RIP-relative instruction can not do single-stepping
+ * at different place, fix_riprel() tweaks the displacement of
+ * that instruction. In that case, we can't recover the instruction
+ * from the kp->ainsn.insn.
+ *
+ * On the other hand, kp->opcode has a copy of the first byte of
+ * the probed instruction, which is overwritten by int3. And
+ * the instruction at kp->addr is not modified by kprobes except
+ * for the first byte, we can recover the original instruction
+ * from it and kp->opcode.
+ */
+ memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+ buf[0] = kp->opcode;
+ return 0;
+}
+
+/* Dummy buffers for kallsyms_lookup */
+static char __dummy_buf[KSYM_NAME_LEN];
+
+/* Check if paddr is at an instruction boundary */
+static int __kprobes can_probe(unsigned long paddr)
+{
+ int ret;
+ unsigned long addr, offset = 0;
+ struct insn insn;
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+
+ if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf))
+ return 0;
+
+ /* Decode instructions */
+ addr = paddr - offset;
+ while (addr < paddr) {
+ kernel_insn_init(&insn, (void *)addr);
+ insn_get_opcode(&insn);
+
+ /*
+ * Check if the instruction has been modified by another
+ * kprobe, in which case we replace the breakpoint by the
+ * original instruction in our buffer.
+ */
+ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
+ ret = recover_probed_instruction(buf, addr);
+ if (ret)
+ /*
+ * Another debugging subsystem might insert
+ * this breakpoint. In that case, we can't
+ * recover it.
+ */
+ return 0;
+ kernel_insn_init(&insn, buf);
+ }
+ insn_get_length(&insn);
+ addr += insn.length;
+ }
+
+ return (addr == paddr);
+}
+
/*
* Returns non-zero if opcode modifies the interrupt flag.
*/
@@ -277,68 +293,30 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
static void __kprobes fix_riprel(struct kprobe *p)
{
#ifdef CONFIG_X86_64
- u8 *insn = p->ainsn.insn;
- s64 disp;
- int need_modrm;
-
- /* Skip legacy instruction prefixes. */
- while (1) {
- switch (*insn) {
- case 0x66:
- case 0x67:
- case 0x2e:
- case 0x3e:
- case 0x26:
- case 0x64:
- case 0x65:
- case 0x36:
- case 0xf0:
- case 0xf3:
- case 0xf2:
- ++insn;
- continue;
- }
- break;
- }
+ struct insn insn;
+ kernel_insn_init(&insn, p->ainsn.insn);
- /* Skip REX instruction prefix. */
- if (is_REX_prefix(insn))
- ++insn;
-
- if (*insn == 0x0f) {
- /* Two-byte opcode. */
- ++insn;
- need_modrm = test_bit(*insn,
- (unsigned long *)twobyte_has_modrm);
- } else
- /* One-byte opcode. */
- need_modrm = test_bit(*insn,
- (unsigned long *)onebyte_has_modrm);
-
- if (need_modrm) {
- u8 modrm = *++insn;
- if ((modrm & 0xc7) == 0x05) {
- /* %rip+disp32 addressing mode */
- /* Displacement follows ModRM byte. */
- ++insn;
- /*
- * The copied instruction uses the %rip-relative
- * addressing mode. Adjust the displacement for the
- * difference between the original location of this
- * instruction and the location of the copy that will
- * actually be run. The tricky bit here is making sure
- * that the sign extension happens correctly in this
- * calculation, since we need a signed 32-bit result to
- * be sign-extended to 64 bits when it's added to the
- * %rip value and yield the same 64-bit result that the
- * sign-extension of the original signed 32-bit
- * displacement would have given.
- */
- disp = (u8 *) p->addr + *((s32 *) insn) -
- (u8 *) p->ainsn.insn;
- BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
- *(s32 *)insn = (s32) disp;
- }
+ if (insn_rip_relative(&insn)) {
+ s64 newdisp;
+ u8 *disp;
+ insn_get_displacement(&insn);
+ /*
+ * The copied instruction uses the %rip-relative addressing
+ * mode. Adjust the displacement for the difference between
+ * the original location of this instruction and the location
+ * of the copy that will actually be run. The tricky bit here
+ * is making sure that the sign extension happens correctly in
+ * this calculation, since we need a signed 32-bit result to
+ * be sign-extended to 64 bits when it's added to the %rip
+ * value and yield the same 64-bit result that the sign-
+ * extension of the original signed 32-bit displacement would
+ * have given.
+ */
+ newdisp = (u8 *) p->addr + (s64) insn.displacement.value -
+ (u8 *) p->ainsn.insn;
+ BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
+ disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn);
+ *(s32 *) disp = (s32) newdisp;
}
#endif
}
@@ -359,6 +337,8 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p)
int __kprobes arch_prepare_kprobe(struct kprobe *p)
{
+ if (!can_probe((unsigned long)p->addr))
+ return -EILSEQ;
/* insn: must be on special executable page on x86. */
p->ainsn.insn = get_insn_slot();
if (!p->ainsn.insn)
@@ -472,17 +452,6 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
{
switch (kcb->kprobe_status) {
case KPROBE_HIT_SSDONE:
-#ifdef CONFIG_X86_64
- /* TODO: Provide re-entrancy from post_kprobes_handler() and
- * avoid exception stack corruption while single-stepping on
- * the instruction of the new probe.
- */
- arch_disarm_kprobe(p);
- regs->ip = (unsigned long)p->addr;
- reset_current_kprobe();
- preempt_enable_no_resched();
- break;
-#endif
case KPROBE_HIT_ACTIVE:
save_previous_kprobe(kcb);
set_current_kprobe(p, regs, kcb);
@@ -491,18 +460,16 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
kcb->kprobe_status = KPROBE_REENTER;
break;
case KPROBE_HIT_SS:
- if (p == kprobe_running()) {
- regs->flags &= ~X86_EFLAGS_TF;
- regs->flags |= kcb->kprobe_saved_flags;
- return 0;
- } else {
- /* A probe has been hit in the codepath leading up
- * to, or just after, single-stepping of a probed
- * instruction. This entire codepath should strictly
- * reside in .kprobes.text section. Raise a warning
- * to highlight this peculiar case.
- */
- }
+ /* A probe has been hit in the codepath leading up to, or just
+ * after, single-stepping of a probed instruction. This entire
+ * codepath should strictly reside in .kprobes.text section.
+ * Raise a BUG or we'll continue in an endless reentering loop
+ * and eventually a stack overflow.
+ */
+ printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n",
+ p->addr);
+ dump_kprobe(p);
+ BUG();
default:
/* impossible cases */
WARN_ON(1);
@@ -967,8 +934,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
ret = NOTIFY_STOP;
break;
case DIE_DEBUG:
- if (post_kprobe_handler(args->regs))
+ if (post_kprobe_handler(args->regs)) {
+ /*
+ * Reset the BS bit in dr6 (pointed by args->err) to
+ * denote completion of processing
+ */
+ (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
ret = NOTIFY_STOP;
+ }
break;
case DIE_GPF:
/*
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index c664d515f61..63b0ec8d3d4 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -34,7 +34,6 @@
struct kvm_para_state {
u8 mmu_queue[MMU_QUEUE_SIZE];
int mmu_queue_len;
- enum paravirt_lazy_mode mode;
};
static DEFINE_PER_CPU(struct kvm_para_state, para_state);
@@ -77,7 +76,7 @@ static void kvm_deferred_mmu_op(void *buffer, int len)
{
struct kvm_para_state *state = kvm_para_state();
- if (state->mode != PARAVIRT_LAZY_MMU) {
+ if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) {
kvm_mmu_op(buffer, len);
return;
}
@@ -185,10 +184,7 @@ static void kvm_release_pt(unsigned long pfn)
static void kvm_enter_lazy_mmu(void)
{
- struct kvm_para_state *state = kvm_para_state();
-
paravirt_enter_lazy_mmu();
- state->mode = paravirt_get_lazy_mode();
}
static void kvm_leave_lazy_mmu(void)
@@ -197,7 +193,6 @@ static void kvm_leave_lazy_mmu(void)
mmu_queue_flush(state);
paravirt_leave_lazy_mmu();
- state->mode = paravirt_get_lazy_mode();
}
static void __init paravirt_ops_setup(void)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 223af43f152..feaeb0d3aa4 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -22,6 +22,8 @@
#include <asm/msr.h>
#include <asm/apic.h>
#include <linux/percpu.h>
+
+#include <asm/x86_init.h>
#include <asm/reboot.h>
#define KVM_SCALE 22
@@ -50,8 +52,8 @@ static unsigned long kvm_get_wallclock(void)
struct timespec ts;
int low, high;
- low = (int)__pa(&wall_clock);
- high = ((u64)__pa(&wall_clock) >> 32);
+ low = (int)__pa_symbol(&wall_clock);
+ high = ((u64)__pa_symbol(&wall_clock) >> 32);
native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
vcpu_time = &get_cpu_var(hv_clock);
@@ -182,12 +184,13 @@ void __init kvmclock_init(void)
if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
if (kvm_register_clock("boot clock"))
return;
- pv_time_ops.get_wallclock = kvm_get_wallclock;
- pv_time_ops.set_wallclock = kvm_set_wallclock;
pv_time_ops.sched_clock = kvm_clock_read;
- pv_time_ops.get_tsc_khz = kvm_get_tsc_khz;
+ x86_platform.calibrate_tsc = kvm_get_tsc_khz;
+ x86_platform.get_wallclock = kvm_get_wallclock;
+ x86_platform.set_wallclock = kvm_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC
- pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
+ x86_cpuinit.setup_percpu_clockev =
+ kvm_setup_secondary_clock;
#endif
#ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 71f1d99a635..ec6ef60cbd1 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -67,8 +67,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
#ifdef CONFIG_SMP
preempt_disable();
load_LDT(pc);
- if (!cpus_equal(current->mm->cpu_vm_mask,
- cpumask_of_cpu(smp_processor_id())))
+ if (!cpumask_equal(mm_cpumask(current->mm),
+ cpumask_of(smp_processor_id())))
smp_call_function(flush_ldt, current->mm, 1);
preempt_enable();
#else
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index c1c429d0013..c843f8406da 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -25,6 +25,7 @@
#include <asm/desc.h>
#include <asm/system.h>
#include <asm/cacheflush.h>
+#include <asm/debugreg.h>
static void set_idt(void *newidt, __u16 limit)
{
@@ -202,6 +203,7 @@ void machine_kexec(struct kimage *image)
/* Interrupts aren't acceptable while we reboot */
local_irq_disable();
+ hw_breakpoint_disable();
if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 84c3bf209e9..4a8bb82248a 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -18,6 +18,7 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
+#include <asm/debugreg.h>
static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
unsigned long addr)
@@ -282,6 +283,7 @@ void machine_kexec(struct kimage *image)
/* Interrupts aren't acceptable while we reboot */
local_irq_disable();
+ hw_breakpoint_disable();
if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 366baa17991..f4c538b681c 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -317,6 +317,12 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
return UCODE_NFOUND;
}
+ if (*(u32 *)firmware->data != UCODE_MAGIC) {
+ printk(KERN_ERR "microcode: invalid UCODE_MAGIC (0x%08x)\n",
+ *(u32 *)firmware->data);
+ return UCODE_ERROR;
+ }
+
ret = generic_load_microcode(cpu, firmware->data, firmware->size);
release_firmware(firmware);
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 9371448290a..2bcad3926ed 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -73,7 +73,6 @@
#include <linux/platform_device.h>
#include <linux/miscdevice.h>
#include <linux/capability.h>
-#include <linux/smp_lock.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mutex.h>
@@ -201,7 +200,6 @@ static int do_microcode_update(const void __user *buf, size_t size)
static int microcode_open(struct inode *unused1, struct file *unused2)
{
- cycle_kernel_lock();
return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
}
@@ -210,8 +208,8 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
{
ssize_t ret = -EINVAL;
- if ((len >> PAGE_SHIFT) > num_physpages) {
- pr_err("microcode: too much data (max %ld pages)\n", num_physpages);
+ if ((len >> PAGE_SHIFT) > totalram_pages) {
+ pr_err("microcode: too much data (max %ld pages)\n", totalram_pages);
return ret;
}
@@ -236,7 +234,7 @@ static const struct file_operations microcode_fops = {
static struct miscdevice microcode_dev = {
.minor = MICROCODE_MINOR,
.name = "microcode",
- .devnode = "cpu/microcode",
+ .nodename = "cpu/microcode",
.fops = &microcode_fops,
};
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index fcd513bf284..5be95ef4ffe 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -45,6 +45,11 @@ static int __init mpf_checksum(unsigned char *mp, int len)
return sum & 0xFF;
}
+int __init default_mpc_apic_id(struct mpc_cpu *m)
+{
+ return m->apicid;
+}
+
static void __init MP_processor_info(struct mpc_cpu *m)
{
int apicid;
@@ -55,10 +60,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
return;
}
- if (x86_quirks->mpc_apic_id)
- apicid = x86_quirks->mpc_apic_id(m);
- else
- apicid = m->apicid;
+ apicid = x86_init.mpparse.mpc_apic_id(m);
if (m->cpuflag & CPU_BOOTPROCESSOR) {
bootup_cpu = " (Bootup-CPU)";
@@ -70,16 +72,18 @@ static void __init MP_processor_info(struct mpc_cpu *m)
}
#ifdef CONFIG_X86_IO_APIC
-static void __init MP_bus_info(struct mpc_bus *m)
+void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str)
{
- char str[7];
memcpy(str, m->bustype, 6);
str[6] = 0;
+ apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str);
+}
- if (x86_quirks->mpc_oem_bus_info)
- x86_quirks->mpc_oem_bus_info(m, str);
- else
- apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str);
+static void __init MP_bus_info(struct mpc_bus *m)
+{
+ char str[7];
+
+ x86_init.mpparse.mpc_oem_bus_info(m, str);
#if MAX_MP_BUSSES < 256
if (m->busid >= MAX_MP_BUSSES) {
@@ -96,8 +100,8 @@ static void __init MP_bus_info(struct mpc_bus *m)
mp_bus_id_to_type[m->busid] = MP_BUS_ISA;
#endif
} else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
- if (x86_quirks->mpc_oem_pci_bus)
- x86_quirks->mpc_oem_pci_bus(m);
+ if (x86_init.mpparse.mpc_oem_pci_bus)
+ x86_init.mpparse.mpc_oem_pci_bus(m);
clear_bit(m->busid, mp_bus_not_pci);
#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
@@ -291,6 +295,8 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
1, mpc, mpc->length, 1);
}
+void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { }
+
static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
{
char str[16];
@@ -312,16 +318,13 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
if (early)
return 1;
- if (mpc->oemptr && x86_quirks->smp_read_mpc_oem) {
- struct mpc_oemtable *oem_table = (void *)(long)mpc->oemptr;
- x86_quirks->smp_read_mpc_oem(oem_table, mpc->oemsize);
- }
+ if (mpc->oemptr)
+ x86_init.mpparse.smp_read_mpc_oem(mpc);
/*
* Now process the configuration blocks.
*/
- if (x86_quirks->mpc_record)
- *x86_quirks->mpc_record = 0;
+ x86_init.mpparse.mpc_record(0);
while (count < mpc->length) {
switch (*mpt) {
@@ -353,8 +356,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
count = mpc->length;
break;
}
- if (x86_quirks->mpc_record)
- (*x86_quirks->mpc_record)++;
+ x86_init.mpparse.mpc_record(1);
}
#ifdef CONFIG_X86_BIGSMP
@@ -608,7 +610,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
/*
* Scan the memory blocks for an SMP configuration block.
*/
-static void __init __get_smp_config(unsigned int early)
+void __init default_get_smp_config(unsigned int early)
{
struct mpf_intel *mpf = mpf_found;
@@ -625,11 +627,6 @@ static void __init __get_smp_config(unsigned int early)
if (acpi_lapic && acpi_ioapic)
return;
- if (x86_quirks->mach_get_smp_config) {
- if (x86_quirks->mach_get_smp_config(early))
- return;
- }
-
printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
mpf->specification);
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
@@ -670,16 +667,6 @@ static void __init __get_smp_config(unsigned int early)
*/
}
-void __init early_get_smp_config(void)
-{
- __get_smp_config(1);
-}
-
-void __init get_smp_config(void)
-{
- __get_smp_config(0);
-}
-
static void __init smp_reserve_bootmem(struct mpf_intel *mpf)
{
unsigned long size = get_mpc_size(mpf->physptr);
@@ -745,14 +732,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
return 0;
}
-static void __init __find_smp_config(unsigned int reserve)
+void __init default_find_smp_config(unsigned int reserve)
{
unsigned int address;
- if (x86_quirks->mach_find_smp_config) {
- if (x86_quirks->mach_find_smp_config(reserve))
- return;
- }
/*
* FIXME: Linux assumes you have 640K of base ram..
* this continues the error...
@@ -787,16 +770,6 @@ static void __init __find_smp_config(unsigned int reserve)
smp_scan_config(address, 0x400, reserve);
}
-void __init early_find_smp_config(void)
-{
- __find_smp_config(0);
-}
-
-void __init find_smp_config(void)
-{
- __find_smp_config(1);
-}
-
#ifdef CONFIG_X86_IO_APIC
static u8 __initdata irq_used[MAX_IRQ_SOURCES];
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
new file mode 100644
index 00000000000..3b7078abc87
--- /dev/null
+++ b/arch/x86/kernel/mrst.c
@@ -0,0 +1,24 @@
+/*
+ * mrst.c: Intel Moorestown platform specific setup code
+ *
+ * (C) Copyright 2008 Intel Corporation
+ * Author: Jacob Pan (jacob.jun.pan@intel.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/init.h>
+
+#include <asm/setup.h>
+
+/*
+ * Moorestown specific x86_init function overrides and early setup
+ * calls.
+ */
+void __init x86_mrst_early_setup(void)
+{
+ x86_init.resources.probe_roms = x86_init_noop;
+ x86_init.resources.reserve_resources = x86_init_noop;
+}
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index c0061096323..553449951b8 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -237,7 +237,7 @@ static struct notifier_block __refdata msr_class_cpu_notifier = {
.notifier_call = msr_class_cpu_callback,
};
-static char *msr_nodename(struct device *dev)
+static char *msr_devnode(struct device *dev, mode_t *mode)
{
return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
}
@@ -258,7 +258,7 @@ static int __init msr_init(void)
err = PTR_ERR(msr_class);
goto out_chrdev;
}
- msr_class->nodename = msr_nodename;
+ msr_class->devnode = msr_devnode;
for_each_online_cpu(i) {
err = msr_device_create(i);
if (err != 0)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index f5b0b4a01fb..1b1739d1631 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -54,17 +54,12 @@ u64 _paravirt_ident_64(u64 x)
return x;
}
-static void __init default_banner(void)
+void __init default_banner(void)
{
printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
pv_info.name);
}
-char *memory_setup(void)
-{
- return pv_init_ops.memory_setup();
-}
-
/* Simple instruction patching code. */
#define DEF_NATIVE(ops, name, code) \
extern const char start_##ops##_##name[], end_##ops##_##name[]; \
@@ -188,11 +183,6 @@ unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
return insn_len;
}
-void init_IRQ(void)
-{
- pv_irq_ops.init_IRQ();
-}
-
static void native_flush_tlb(void)
{
__native_flush_tlb();
@@ -218,13 +208,6 @@ extern void native_irq_enable_sysexit(void);
extern void native_usergs_sysret32(void);
extern void native_usergs_sysret64(void);
-static int __init print_banner(void)
-{
- pv_init_ops.banner();
- return 0;
-}
-core_initcall(print_banner);
-
static struct resource reserve_ioports = {
.start = 0,
.end = IO_SPACE_LIMIT,
@@ -320,21 +303,13 @@ struct pv_info pv_info = {
struct pv_init_ops pv_init_ops = {
.patch = native_patch,
- .banner = default_banner,
- .arch_setup = paravirt_nop,
- .memory_setup = machine_specific_memory_setup,
};
struct pv_time_ops pv_time_ops = {
- .time_init = hpet_time_init,
- .get_wallclock = native_get_wallclock,
- .set_wallclock = native_set_wallclock,
.sched_clock = native_sched_clock,
- .get_tsc_khz = native_calibrate_tsc,
};
struct pv_irq_ops pv_irq_ops = {
- .init_IRQ = native_init_IRQ,
.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
@@ -409,8 +384,6 @@ struct pv_cpu_ops pv_cpu_ops = {
struct pv_apic_ops pv_apic_ops = {
#ifdef CONFIG_X86_LOCAL_APIC
- .setup_boot_clock = setup_boot_APIC_clock,
- .setup_secondary_clock = setup_secondary_APIC_clock,
.startup_ipi_hook = paravirt_nop,
#endif
};
@@ -424,13 +397,6 @@ struct pv_apic_ops pv_apic_ops = {
#endif
struct pv_mmu_ops pv_mmu_ops = {
-#ifndef CONFIG_X86_64
- .pagetable_setup_start = native_pagetable_setup_start,
- .pagetable_setup_done = native_pagetable_setup_done,
-#else
- .pagetable_setup_start = paravirt_nop,
- .pagetable_setup_done = paravirt_nop,
-#endif
.read_cr2 = native_read_cr2,
.write_cr2 = native_write_cr2,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 971a3bec47a..c563e4c8ff3 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -46,6 +46,7 @@
#include <asm/dma.h>
#include <asm/rio.h>
#include <asm/bios_ebda.h>
+#include <asm/x86_init.h>
#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
int use_calgary __read_mostly = 1;
@@ -244,7 +245,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
if (panic_on_overflow)
panic("Calgary: fix the allocator.\n");
else
- return bad_dma_address;
+ return DMA_ERROR_CODE;
}
}
@@ -260,12 +261,15 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
void *vaddr, unsigned int npages, int direction)
{
unsigned long entry;
- dma_addr_t ret = bad_dma_address;
+ dma_addr_t ret;
entry = iommu_range_alloc(dev, tbl, npages);
- if (unlikely(entry == bad_dma_address))
- goto error;
+ if (unlikely(entry == DMA_ERROR_CODE)) {
+ printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
+ "iommu %p\n", npages, tbl);
+ return DMA_ERROR_CODE;
+ }
/* set the return dma address */
ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK);
@@ -273,13 +277,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
/* put the TCEs in the HW table */
tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK,
direction);
-
return ret;
-
-error:
- printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
- "iommu %p\n", npages, tbl);
- return bad_dma_address;
}
static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
@@ -290,8 +288,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
unsigned long flags;
/* were we called with bad_dma_address? */
- badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE);
- if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) {
+ badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE);
+ if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) {
WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
"address 0x%Lx\n", dma_addr);
return;
@@ -318,13 +316,15 @@ static inline struct iommu_table *find_iommu_table(struct device *dev)
pdev = to_pci_dev(dev);
+ /* search up the device tree for an iommu */
pbus = pdev->bus;
-
- /* is the device behind a bridge? Look for the root bus */
- while (pbus->parent)
+ do {
+ tbl = pci_iommu(pbus);
+ if (tbl && tbl->it_busno == pbus->number)
+ break;
+ tbl = NULL;
pbus = pbus->parent;
-
- tbl = pci_iommu(pbus);
+ } while (pbus);
BUG_ON(tbl && (tbl->it_busno != pbus->number));
@@ -373,7 +373,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
entry = iommu_range_alloc(dev, tbl, npages);
- if (entry == bad_dma_address) {
+ if (entry == DMA_ERROR_CODE) {
/* makes sure unmap knows to stop */
s->dma_length = 0;
goto error;
@@ -391,7 +391,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
error:
calgary_unmap_sg(dev, sg, nelems, dir, NULL);
for_each_sg(sg, s, nelems, i) {
- sg->dma_address = bad_dma_address;
+ sg->dma_address = DMA_ERROR_CODE;
sg->dma_length = 0;
}
return 0;
@@ -446,7 +446,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
/* set up tces to cover the allocated range */
mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
- if (mapping == bad_dma_address)
+ if (mapping == DMA_ERROR_CODE)
goto free;
*dma_handle = mapping;
return ret;
@@ -727,7 +727,7 @@ static void __init calgary_reserve_regions(struct pci_dev *dev)
struct iommu_table *tbl = pci_iommu(dev->bus);
/* reserve EMERGENCY_PAGES from bad_dma_address and up */
- iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES);
+ iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES);
/* avoid the BIOS/VGA first 640KB-1MB region */
/* for CalIOC2 - avoid the entire first MB */
@@ -1344,6 +1344,23 @@ static void __init get_tce_space_from_tar(void)
return;
}
+static int __init calgary_iommu_init(void)
+{
+ int ret;
+
+ /* ok, we're trying to use Calgary - let's roll */
+ printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
+
+ ret = calgary_init();
+ if (ret) {
+ printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
+ "falling back to no_iommu\n", ret);
+ return ret;
+ }
+
+ return 0;
+}
+
void __init detect_calgary(void)
{
int bus;
@@ -1357,7 +1374,7 @@ void __init detect_calgary(void)
* if the user specified iommu=off or iommu=soft or we found
* another HW IOMMU already, bail out.
*/
- if (swiotlb || no_iommu || iommu_detected)
+ if (no_iommu || iommu_detected)
return;
if (!use_calgary)
@@ -1442,9 +1459,7 @@ void __init detect_calgary(void)
printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
specified_table_size);
- /* swiotlb for devices that aren't behind the Calgary. */
- if (max_pfn > MAX_DMA32_PFN)
- swiotlb = 1;
+ x86_init.iommu.iommu_init = calgary_iommu_init;
}
return;
@@ -1457,35 +1472,6 @@ cleanup:
}
}
-int __init calgary_iommu_init(void)
-{
- int ret;
-
- if (no_iommu || (swiotlb && !calgary_detected))
- return -ENODEV;
-
- if (!calgary_detected)
- return -ENODEV;
-
- /* ok, we're trying to use Calgary - let's roll */
- printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
-
- ret = calgary_init();
- if (ret) {
- printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
- "falling back to no_iommu\n", ret);
- return ret;
- }
-
- force_iommu = 1;
- bad_dma_address = 0x0;
- /* dma_ops is set to swiotlb or nommu */
- if (!dma_ops)
- dma_ops = &nommu_dma_ops;
-
- return 0;
-}
-
static int __init calgary_parse_options(char *p)
{
unsigned int bridge;
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index d71c8655905..afcc58b69c7 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -11,10 +11,11 @@
#include <asm/gart.h>
#include <asm/calgary.h>
#include <asm/amd_iommu.h>
+#include <asm/x86_init.h>
static int forbid_dac __read_mostly;
-struct dma_map_ops *dma_ops;
+struct dma_map_ops *dma_ops = &nommu_dma_ops;
EXPORT_SYMBOL(dma_ops);
static int iommu_sac_force __read_mostly;
@@ -35,22 +36,17 @@ int iommu_detected __read_mostly = 0;
/*
* This variable becomes 1 if iommu=pt is passed on the kernel command line.
- * If this variable is 1, IOMMU implementations do no DMA ranslation for
+ * If this variable is 1, IOMMU implementations do no DMA translation for
* devices and allow every device to access to whole physical memory. This is
* useful if a user want to use an IOMMU only for KVM device assignment to
* guests and not for driver dma translation.
*/
int iommu_pass_through __read_mostly;
-dma_addr_t bad_dma_address __read_mostly = 0;
-EXPORT_SYMBOL(bad_dma_address);
-
-/* Dummy device used for NULL arguments (normally ISA). Better would
- be probably a smaller DMA mask, but this is bug-to-bug compatible
- to older i386. */
+/* Dummy device used for NULL arguments (normally ISA). */
struct device x86_dma_fallback_dev = {
.init_name = "fallback device",
- .coherent_dma_mask = DMA_BIT_MASK(32),
+ .coherent_dma_mask = ISA_DMA_BIT_MASK,
.dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
};
EXPORT_SYMBOL(x86_dma_fallback_dev);
@@ -128,20 +124,17 @@ void __init pci_iommu_alloc(void)
/* free the range so iommu could get some range less than 4G */
dma32_free_bootmem();
#endif
+ if (pci_swiotlb_init())
+ return;
- /*
- * The order of these functions is important for
- * fall-back/fail-over reasons
- */
gart_iommu_hole_init();
detect_calgary();
detect_intel_iommu();
+ /* needs to be called after gart_iommu_hole_init */
amd_iommu_detect();
-
- pci_swiotlb_init();
}
void *dma_generic_alloc_coherent(struct device *dev, size_t size,
@@ -216,7 +209,7 @@ static __init int iommu_setup(char *p)
if (!strncmp(p, "allowdac", 8))
forbid_dac = 0;
if (!strncmp(p, "nodac", 5))
- forbid_dac = -1;
+ forbid_dac = 1;
if (!strncmp(p, "usedac", 6)) {
forbid_dac = -1;
return 1;
@@ -225,10 +218,8 @@ static __init int iommu_setup(char *p)
if (!strncmp(p, "soft", 4))
swiotlb = 1;
#endif
- if (!strncmp(p, "pt", 2)) {
+ if (!strncmp(p, "pt", 2))
iommu_pass_through = 1;
- return 1;
- }
gart_parse_options(p);
@@ -293,27 +284,19 @@ static int __init pci_iommu_init(void)
#ifdef CONFIG_PCI
dma_debug_add_bus(&pci_bus_type);
#endif
+ x86_init.iommu.iommu_init();
- calgary_iommu_init();
+ if (swiotlb) {
+ printk(KERN_INFO "PCI-DMA: "
+ "Using software bounce buffering for IO (SWIOTLB)\n");
+ swiotlb_print_info();
+ } else
+ swiotlb_free();
- intel_iommu_init();
-
- amd_iommu_init();
-
- gart_iommu_init();
-
- no_iommu_init();
return 0;
}
-
-void pci_iommu_shutdown(void)
-{
- gart_iommu_shutdown();
-
- amd_iommu_shutdown();
-}
/* Must execute after PCI subsystem */
-fs_initcall(pci_iommu_init);
+rootfs_initcall(pci_iommu_init);
#ifdef CONFIG_PCI
/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 98a827ee9ed..e6a0d402f17 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -16,6 +16,7 @@
#include <linux/agp_backend.h>
#include <linux/init.h>
#include <linux/mm.h>
+#include <linux/sched.h>
#include <linux/string.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
@@ -38,6 +39,7 @@
#include <asm/swiotlb.h>
#include <asm/dma.h>
#include <asm/k8.h>
+#include <asm/x86_init.h>
static unsigned long iommu_bus_base; /* GART remapping area (physical) */
static unsigned long iommu_size; /* size of remapping area bytes */
@@ -45,6 +47,8 @@ static unsigned long iommu_pages; /* .. and in pages */
static u32 *iommu_gatt_base; /* Remapping table */
+static dma_addr_t bad_dma_addr;
+
/*
* If this is disabled the IOMMU will use an optimized flushing strategy
* of only flushing when an mapping is reused. With it true the GART is
@@ -91,7 +95,7 @@ static unsigned long alloc_iommu(struct device *dev, int size,
base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
PAGE_SIZE) >> PAGE_SHIFT;
- boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1,
+ boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1,
PAGE_SIZE) >> PAGE_SHIFT;
spin_lock_irqsave(&iommu_bitmap_lock, flags);
@@ -215,7 +219,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
if (panic_on_overflow)
panic("dma_map_area overflow %lu bytes\n", size);
iommu_full(dev, size, dir);
- return bad_dma_address;
+ return bad_dma_addr;
}
for (i = 0; i < npages; i++) {
@@ -293,7 +297,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
int i;
#ifdef CONFIG_IOMMU_DEBUG
- printk(KERN_DEBUG "dma_map_sg overflow\n");
+ pr_debug("dma_map_sg overflow\n");
#endif
for_each_sg(sg, s, nents, i) {
@@ -301,7 +305,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
if (nonforced_iommu(dev, addr, s->length)) {
addr = dma_map_area(dev, addr, s->length, dir, 0);
- if (addr == bad_dma_address) {
+ if (addr == bad_dma_addr) {
if (i > 0)
gart_unmap_sg(dev, sg, i, dir, NULL);
nents = 0;
@@ -388,12 +392,14 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
if (!dev)
dev = &x86_dma_fallback_dev;
- out = 0;
- start = 0;
- start_sg = sgmap = sg;
- seg_size = 0;
- max_seg_size = dma_get_max_seg_size(dev);
- ps = NULL; /* shut up gcc */
+ out = 0;
+ start = 0;
+ start_sg = sg;
+ sgmap = sg;
+ seg_size = 0;
+ max_seg_size = dma_get_max_seg_size(dev);
+ ps = NULL; /* shut up gcc */
+
for_each_sg(sg, s, nents, i) {
dma_addr_t addr = sg_phys(s);
@@ -416,11 +422,12 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
sgmap, pages, need) < 0)
goto error;
out++;
- seg_size = 0;
- sgmap = sg_next(sgmap);
- pages = 0;
- start = i;
- start_sg = s;
+
+ seg_size = 0;
+ sgmap = sg_next(sgmap);
+ pages = 0;
+ start = i;
+ start_sg = s;
}
}
@@ -454,7 +461,7 @@ error:
iommu_full(dev, pages << PAGE_SHIFT, dir);
for_each_sg(sg, s, nents, i)
- s->dma_address = bad_dma_address;
+ s->dma_address = bad_dma_addr;
return 0;
}
@@ -478,7 +485,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
DMA_BIDIRECTIONAL, align_mask);
flush_gart();
- if (paddr != bad_dma_address) {
+ if (paddr != bad_dma_addr) {
*dma_addr = paddr;
return page_address(page);
}
@@ -498,6 +505,11 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr,
free_pages((unsigned long)vaddr, get_order(size));
}
+static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+ return (dma_addr == bad_dma_addr);
+}
+
static int no_agp;
static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
@@ -514,7 +526,7 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
iommu_size -= round_up(a, PMD_PAGE_SIZE) - a;
if (iommu_size < 64*1024*1024) {
- printk(KERN_WARNING
+ pr_warning(
"PCI-DMA: Warning: Small IOMMU %luMB."
" Consider increasing the AGP aperture in BIOS\n",
iommu_size >> 20);
@@ -569,28 +581,32 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
aperture_alloc = aper_alloc;
}
-static int gart_resume(struct sys_device *dev)
+static void gart_fixup_northbridges(struct sys_device *dev)
{
- printk(KERN_INFO "PCI-DMA: Resuming GART IOMMU\n");
+ int i;
- if (fix_up_north_bridges) {
- int i;
+ if (!fix_up_north_bridges)
+ return;
- printk(KERN_INFO "PCI-DMA: Restoring GART aperture settings\n");
+ pr_info("PCI-DMA: Restoring GART aperture settings\n");
- for (i = 0; i < num_k8_northbridges; i++) {
- struct pci_dev *dev = k8_northbridges[i];
+ for (i = 0; i < num_k8_northbridges; i++) {
+ struct pci_dev *dev = k8_northbridges[i];
- /*
- * Don't enable translations just yet. That is the next
- * step. Restore the pre-suspend aperture settings.
- */
- pci_write_config_dword(dev, AMD64_GARTAPERTURECTL,
- aperture_order << 1);
- pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE,
- aperture_alloc >> 25);
- }
+ /*
+ * Don't enable translations just yet. That is the next
+ * step. Restore the pre-suspend aperture settings.
+ */
+ pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1);
+ pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
}
+}
+
+static int gart_resume(struct sys_device *dev)
+{
+ pr_info("PCI-DMA: Resuming GART IOMMU\n");
+
+ gart_fixup_northbridges(dev);
enable_gart_translations();
@@ -603,15 +619,14 @@ static int gart_suspend(struct sys_device *dev, pm_message_t state)
}
static struct sysdev_class gart_sysdev_class = {
- .name = "gart",
- .suspend = gart_suspend,
- .resume = gart_resume,
+ .name = "gart",
+ .suspend = gart_suspend,
+ .resume = gart_resume,
};
static struct sys_device device_gart = {
- .id = 0,
- .cls = &gart_sysdev_class,
+ .cls = &gart_sysdev_class,
};
/*
@@ -626,7 +641,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
void *gatt;
int i, error;
- printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
+ pr_info("PCI-DMA: Disabling AGP.\n");
+
aper_size = aper_base = info->aper_size = 0;
dev = NULL;
for (i = 0; i < num_k8_northbridges; i++) {
@@ -644,6 +660,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
}
if (!aper_base)
goto nommu;
+
info->aper_base = aper_base;
info->aper_size = aper_size >> 20;
@@ -666,14 +683,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
flush_gart();
- printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
+ pr_info("PCI-DMA: aperture base @ %x size %u KB\n",
aper_base, aper_size>>10);
return 0;
nommu:
/* Should not happen anymore */
- printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
+ pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n"
"falling back to iommu=soft.\n");
return -1;
}
@@ -685,14 +702,15 @@ static struct dma_map_ops gart_dma_ops = {
.unmap_page = gart_unmap_page,
.alloc_coherent = gart_alloc_coherent,
.free_coherent = gart_free_coherent,
+ .mapping_error = gart_mapping_error,
};
-void gart_iommu_shutdown(void)
+static void gart_iommu_shutdown(void)
{
struct pci_dev *dev;
int i;
- if (no_agp && (dma_ops != &gart_dma_ops))
+ if (no_agp)
return;
for (i = 0; i < num_k8_northbridges; i++) {
@@ -707,7 +725,7 @@ void gart_iommu_shutdown(void)
}
}
-void __init gart_iommu_init(void)
+int __init gart_iommu_init(void)
{
struct agp_kern_info info;
unsigned long iommu_start;
@@ -717,7 +735,7 @@ void __init gart_iommu_init(void)
long i;
if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0)
- return;
+ return 0;
#ifndef CONFIG_AGP_AMD64
no_agp = 1;
@@ -729,35 +747,28 @@ void __init gart_iommu_init(void)
(agp_copy_info(agp_bridge, &info) < 0);
#endif
- if (swiotlb)
- return;
-
- /* Did we detect a different HW IOMMU? */
- if (iommu_detected && !gart_iommu_aperture)
- return;
-
if (no_iommu ||
(!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
!gart_iommu_aperture ||
(no_agp && init_k8_gatt(&info) < 0)) {
if (max_pfn > MAX_DMA32_PFN) {
- printk(KERN_WARNING "More than 4GB of memory "
- "but GART IOMMU not available.\n");
- printk(KERN_WARNING "falling back to iommu=soft.\n");
+ pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
+ pr_warning("falling back to iommu=soft.\n");
}
- return;
+ return 0;
}
/* need to map that range */
- aper_size = info.aper_size << 20;
- aper_base = info.aper_base;
- end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
+ aper_size = info.aper_size << 20;
+ aper_base = info.aper_base;
+ end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
+
if (end_pfn > max_low_pfn_mapped) {
start_pfn = (aper_base>>PAGE_SHIFT);
init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
}
- printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
+ pr_info("PCI-DMA: using GART IOMMU.\n");
iommu_size = check_iommu_size(info.aper_base, aper_size);
iommu_pages = iommu_size >> PAGE_SHIFT;
@@ -772,8 +783,7 @@ void __init gart_iommu_init(void)
ret = dma_debug_resize_entries(iommu_pages);
if (ret)
- printk(KERN_DEBUG
- "PCI-DMA: Cannot trace all the entries\n");
+ pr_debug("PCI-DMA: Cannot trace all the entries\n");
}
#endif
@@ -783,15 +793,14 @@ void __init gart_iommu_init(void)
*/
iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
- agp_memory_reserved = iommu_size;
- printk(KERN_INFO
- "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
+ pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
iommu_size >> 20);
- iommu_start = aper_size - iommu_size;
- iommu_bus_base = info.aper_base + iommu_start;
- bad_dma_address = iommu_bus_base;
- iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
+ agp_memory_reserved = iommu_size;
+ iommu_start = aper_size - iommu_size;
+ iommu_bus_base = info.aper_base + iommu_start;
+ bad_dma_addr = iommu_bus_base;
+ iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
/*
* Unmap the IOMMU part of the GART. The alias of the page is
@@ -813,7 +822,7 @@ void __init gart_iommu_init(void)
* the pages as Not-Present:
*/
wbinvd();
-
+
/*
* Now all caches are flushed and we can safely enable
* GART hardware. Doing it early leaves the possibility
@@ -837,6 +846,10 @@ void __init gart_iommu_init(void)
flush_gart();
dma_ops = &gart_dma_ops;
+ x86_platform.iommu_shutdown = gart_iommu_shutdown;
+ swiotlb = 0;
+
+ return 0;
}
void __init gart_parse_options(char *p)
@@ -855,7 +868,7 @@ void __init gart_parse_options(char *p)
#endif
if (isdigit(*p) && get_option(&p, &arg))
iommu_size = arg;
- if (!strncmp(p, "fullflush", 8))
+ if (!strncmp(p, "fullflush", 9))
iommu_fullflush = 1;
if (!strncmp(p, "nofullflush", 11))
iommu_fullflush = 0;
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index a3933d4330c..22be12b60a8 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -33,7 +33,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
dma_addr_t bus = page_to_phys(page) + offset;
WARN_ON(size == 0);
if (!check_addr("map_single", dev, bus, size))
- return bad_dma_address;
+ return DMA_ERROR_CODE;
flush_write_buffers();
return bus;
}
@@ -103,12 +103,3 @@ struct dma_map_ops nommu_dma_ops = {
.sync_sg_for_device = nommu_sync_sg_for_device,
.is_phys = 1,
};
-
-void __init no_iommu_init(void)
-{
- if (dma_ops)
- return;
-
- force_iommu = 0; /* no HW IOMMU */
- dma_ops = &nommu_dma_ops;
-}
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index e8a35016115..e3c0a66b9e7 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -42,19 +42,28 @@ static struct dma_map_ops swiotlb_dma_ops = {
.dma_supported = NULL,
};
-void __init pci_swiotlb_init(void)
+/*
+ * pci_swiotlb_init - initialize swiotlb if necessary
+ *
+ * This returns non-zero if we are forced to use swiotlb (by the boot
+ * option).
+ */
+int __init pci_swiotlb_init(void)
{
+ int use_swiotlb = swiotlb | swiotlb_force;
+
/* don't initialize swiotlb if iommu=off (no_iommu=1) */
#ifdef CONFIG_X86_64
- if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) ||
- iommu_pass_through)
- swiotlb = 1;
+ if (!no_iommu && max_pfn > MAX_DMA32_PFN)
+ swiotlb = 1;
#endif
if (swiotlb_force)
swiotlb = 1;
+
if (swiotlb) {
- printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
- swiotlb_init();
+ swiotlb_init(0);
dma_ops = &swiotlb_dma_ops;
}
+
+ return use_swiotlb;
}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 071166a4ba8..744508e7cfd 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,7 +9,8 @@
#include <linux/pm.h>
#include <linux/clockchips.h>
#include <linux/random.h>
-#include <trace/power.h>
+#include <trace/events/power.h>
+#include <linux/hw_breakpoint.h>
#include <asm/system.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
@@ -17,6 +18,7 @@
#include <asm/uaccess.h>
#include <asm/i387.h>
#include <asm/ds.h>
+#include <asm/debugreg.h>
unsigned long idle_halt;
EXPORT_SYMBOL(idle_halt);
@@ -25,9 +27,6 @@ EXPORT_SYMBOL(idle_nomwait);
struct kmem_cache *task_xstate_cachep;
-DEFINE_TRACE(power_start);
-DEFINE_TRACE(power_end);
-
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
*dst = *src;
@@ -106,14 +105,7 @@ void flush_thread(void)
}
#endif
- clear_tsk_thread_flag(tsk, TIF_DEBUG);
-
- tsk->thread.debugreg0 = 0;
- tsk->thread.debugreg1 = 0;
- tsk->thread.debugreg2 = 0;
- tsk->thread.debugreg3 = 0;
- tsk->thread.debugreg6 = 0;
- tsk->thread.debugreg7 = 0;
+ flush_ptrace_hw_breakpoint(tsk);
memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
/*
* Forget coprocessor state..
@@ -195,16 +187,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
else if (next->debugctlmsr != prev->debugctlmsr)
update_debugctlmsr(next->debugctlmsr);
- if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
- set_debugreg(next->debugreg0, 0);
- set_debugreg(next->debugreg1, 1);
- set_debugreg(next->debugreg2, 2);
- set_debugreg(next->debugreg3, 3);
- /* no 4 and 5 */
- set_debugreg(next->debugreg6, 6);
- set_debugreg(next->debugreg7, 7);
- }
-
if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
test_tsk_thread_flag(next_p, TIF_NOTSC)) {
/* prev and next are different */
@@ -299,9 +281,7 @@ static inline int hlt_use_halt(void)
void default_idle(void)
{
if (hlt_use_halt()) {
- struct power_trace it;
-
- trace_power_start(&it, POWER_CSTATE, 1);
+ trace_power_start(POWER_CSTATE, 1);
current_thread_info()->status &= ~TS_POLLING;
/*
* TS_POLLING-cleared state must be visible before we
@@ -314,7 +294,6 @@ void default_idle(void)
else
local_irq_enable();
current_thread_info()->status |= TS_POLLING;
- trace_power_end(&it);
} else {
local_irq_enable();
/* loop is done by the caller */
@@ -372,9 +351,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
*/
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
- struct power_trace it;
-
- trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
+ trace_power_start(POWER_CSTATE, (ax>>4)+1);
if (!need_resched()) {
if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
@@ -384,15 +361,13 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
if (!need_resched())
__mwait(ax, cx);
}
- trace_power_end(&it);
}
/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
- struct power_trace it;
if (!need_resched()) {
- trace_power_start(&it, POWER_CSTATE, 1);
+ trace_power_start(POWER_CSTATE, 1);
if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
@@ -402,7 +377,6 @@ static void mwait_idle(void)
__sti_mwait(0, 0);
else
local_irq_enable();
- trace_power_end(&it);
} else
local_irq_enable();
}
@@ -414,13 +388,11 @@ static void mwait_idle(void)
*/
static void poll_idle(void)
{
- struct power_trace it;
-
- trace_power_start(&it, POWER_CSTATE, 0);
+ trace_power_start(POWER_CSTATE, 0);
local_irq_enable();
while (!need_resched())
cpu_relax();
- trace_power_end(&it);
+ trace_power_end(0);
}
/*
@@ -568,10 +540,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
void __init init_c1e_mask(void)
{
/* If we're using c1e_idle, we need to allocate c1e_mask. */
- if (pm_idle == c1e_idle) {
- alloc_cpumask_var(&c1e_mask, GFP_KERNEL);
- cpumask_clear(c1e_mask);
- }
+ if (pm_idle == c1e_idle)
+ zalloc_cpumask_var(&c1e_mask, GFP_KERNEL);
}
static int __init idle_setup(char *str)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4cf79567cda..540140284f6 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -58,6 +58,7 @@
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/ds.h>
+#include <asm/debugreg.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
@@ -134,7 +135,7 @@ void __show_regs(struct pt_regs *regs, int all)
ss = regs->ss & 0xffff;
gs = get_user_gs(regs);
} else {
- sp = (unsigned long) (&regs->sp);
+ sp = kernel_stack_pointer(regs);
savesegment(ss, ss);
savesegment(gs, gs);
}
@@ -259,7 +260,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
task_user_gs(p) = get_user_gs(regs);
+ p->thread.io_bitmap_ptr = NULL;
tsk = current;
+ err = -ENOMEM;
+
+ memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+
if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
IO_BITMAP_BYTES, GFP_KERNEL);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ad535b68317..70cf15873f3 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -52,6 +52,7 @@
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/ds.h>
+#include <asm/debugreg.h>
asmlinkage extern void ret_from_fork(void);
@@ -297,12 +298,16 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
p->thread.fs = me->thread.fs;
p->thread.gs = me->thread.gs;
+ p->thread.io_bitmap_ptr = NULL;
savesegment(gs, p->thread.gsindex);
savesegment(fs, p->thread.fsindex);
savesegment(es, p->thread.es);
savesegment(ds, p->thread.ds);
+ err = -ENOMEM;
+ memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+
if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
if (!p->thread.io_bitmap_ptr) {
@@ -341,6 +346,7 @@ out:
kfree(p->thread.io_bitmap_ptr);
p->thread.io_bitmap_max = 0;
}
+
return err;
}
@@ -495,6 +501,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
if (preload_fpu)
__math_state_restore();
+
return prev_p;
}
@@ -664,3 +671,8 @@ long sys_arch_prctl(int code, unsigned long addr)
return do_arch_prctl(current, code, addr);
}
+unsigned long KSTK_ESP(struct task_struct *task)
+{
+ return (test_tsk_thread_flag(task, TIF_IA32)) ?
+ (task_pt_regs(task)->sp) : ((task)->thread.usersp);
+}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 8d7d5c9c1be..04d182a7cfd 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -22,6 +22,8 @@
#include <linux/seccomp.h>
#include <linux/signal.h>
#include <linux/workqueue.h>
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -34,6 +36,7 @@
#include <asm/prctl.h>
#include <asm/proto.h>
#include <asm/ds.h>
+#include <asm/hw_breakpoint.h>
#include "tls.h"
@@ -49,6 +52,118 @@ enum x86_regset {
REGSET_IOPERM32,
};
+struct pt_regs_offset {
+ const char *name;
+ int offset;
+};
+
+#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
+#define REG_OFFSET_END {.name = NULL, .offset = 0}
+
+static const struct pt_regs_offset regoffset_table[] = {
+#ifdef CONFIG_X86_64
+ REG_OFFSET_NAME(r15),
+ REG_OFFSET_NAME(r14),
+ REG_OFFSET_NAME(r13),
+ REG_OFFSET_NAME(r12),
+ REG_OFFSET_NAME(r11),
+ REG_OFFSET_NAME(r10),
+ REG_OFFSET_NAME(r9),
+ REG_OFFSET_NAME(r8),
+#endif
+ REG_OFFSET_NAME(bx),
+ REG_OFFSET_NAME(cx),
+ REG_OFFSET_NAME(dx),
+ REG_OFFSET_NAME(si),
+ REG_OFFSET_NAME(di),
+ REG_OFFSET_NAME(bp),
+ REG_OFFSET_NAME(ax),
+#ifdef CONFIG_X86_32
+ REG_OFFSET_NAME(ds),
+ REG_OFFSET_NAME(es),
+ REG_OFFSET_NAME(fs),
+ REG_OFFSET_NAME(gs),
+#endif
+ REG_OFFSET_NAME(orig_ax),
+ REG_OFFSET_NAME(ip),
+ REG_OFFSET_NAME(cs),
+ REG_OFFSET_NAME(flags),
+ REG_OFFSET_NAME(sp),
+ REG_OFFSET_NAME(ss),
+ REG_OFFSET_END,
+};
+
+/**
+ * regs_query_register_offset() - query register offset from its name
+ * @name: the name of a register
+ *
+ * regs_query_register_offset() returns the offset of a register in struct
+ * pt_regs from its name. If the name is invalid, this returns -EINVAL;
+ */
+int regs_query_register_offset(const char *name)
+{
+ const struct pt_regs_offset *roff;
+ for (roff = regoffset_table; roff->name != NULL; roff++)
+ if (!strcmp(roff->name, name))
+ return roff->offset;
+ return -EINVAL;
+}
+
+/**
+ * regs_query_register_name() - query register name from its offset
+ * @offset: the offset of a register in struct pt_regs.
+ *
+ * regs_query_register_name() returns the name of a register from its
+ * offset in struct pt_regs. If the @offset is invalid, this returns NULL;
+ */
+const char *regs_query_register_name(unsigned int offset)
+{
+ const struct pt_regs_offset *roff;
+ for (roff = regoffset_table; roff->name != NULL; roff++)
+ if (roff->offset == offset)
+ return roff->name;
+ return NULL;
+}
+
+static const int arg_offs_table[] = {
+#ifdef CONFIG_X86_32
+ [0] = offsetof(struct pt_regs, ax),
+ [1] = offsetof(struct pt_regs, dx),
+ [2] = offsetof(struct pt_regs, cx)
+#else /* CONFIG_X86_64 */
+ [0] = offsetof(struct pt_regs, di),
+ [1] = offsetof(struct pt_regs, si),
+ [2] = offsetof(struct pt_regs, dx),
+ [3] = offsetof(struct pt_regs, cx),
+ [4] = offsetof(struct pt_regs, r8),
+ [5] = offsetof(struct pt_regs, r9)
+#endif
+};
+
+/**
+ * regs_get_argument_nth() - get Nth argument at function call
+ * @regs: pt_regs which contains registers at function entry.
+ * @n: argument number.
+ *
+ * regs_get_argument_nth() returns @n th argument of a function call.
+ * Since usually the kernel stack will be changed right after function entry,
+ * you must use this at function entry. If the @n th entry is NOT in the
+ * kernel stack or pt_regs, this returns 0.
+ */
+unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n)
+{
+ if (n < ARRAY_SIZE(arg_offs_table))
+ return *(unsigned long *)((char *)regs + arg_offs_table[n]);
+ else {
+ /*
+ * The typical case: arg n is on the stack.
+ * (Note: stack[0] = return address, so skip it)
+ */
+ n -= ARRAY_SIZE(arg_offs_table);
+ return regs_get_kernel_stack_nth(regs, 1 + n);
+ }
+}
+
/*
* does not yet catch signals sent when the child dies.
* in exit.c or in signal.c.
@@ -137,11 +252,6 @@ static int set_segment_reg(struct task_struct *task,
return 0;
}
-static unsigned long debugreg_addr_limit(struct task_struct *task)
-{
- return TASK_SIZE - 3;
-}
-
#else /* CONFIG_X86_64 */
#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT)
@@ -266,15 +376,6 @@ static int set_segment_reg(struct task_struct *task,
return 0;
}
-static unsigned long debugreg_addr_limit(struct task_struct *task)
-{
-#ifdef CONFIG_IA32_EMULATION
- if (test_tsk_thread_flag(task, TIF_IA32))
- return IA32_PAGE_OFFSET - 3;
-#endif
- return TASK_SIZE_MAX - 7;
-}
-
#endif /* CONFIG_X86_32 */
static unsigned long get_flags(struct task_struct *task)
@@ -325,16 +426,6 @@ static int putreg(struct task_struct *child,
return set_flags(child, value);
#ifdef CONFIG_X86_64
- /*
- * Orig_ax is really just a flag with small positive and
- * negative values, so make sure to always sign-extend it
- * from 32 bits so that it works correctly regardless of
- * whether we come from a 32-bit environment or not.
- */
- case offsetof(struct user_regs_struct, orig_ax):
- value = (long) (s32) value;
- break;
-
case offsetof(struct user_regs_struct,fs_base):
if (value >= TASK_SIZE_OF(child))
return -EIO;
@@ -464,99 +555,239 @@ static int genregs_set(struct task_struct *target,
return ret;
}
+static void ptrace_triggered(struct perf_event *bp, void *data)
+{
+ int i;
+ struct thread_struct *thread = &(current->thread);
+
+ /*
+ * Store in the virtual DR6 register the fact that the breakpoint
+ * was hit so the thread's debugger will see it.
+ */
+ for (i = 0; i < HBP_NUM; i++) {
+ if (thread->ptrace_bps[i] == bp)
+ break;
+ }
+
+ thread->debugreg6 |= (DR_TRAP0 << i);
+}
+
/*
- * This function is trivial and will be inlined by the compiler.
- * Having it separates the implementation details of debug
- * registers from the interface details of ptrace.
+ * Walk through every ptrace breakpoints for this thread and
+ * build the dr7 value on top of their attributes.
+ *
*/
-static unsigned long ptrace_get_debugreg(struct task_struct *child, int n)
+static unsigned long ptrace_get_dr7(struct perf_event *bp[])
{
- switch (n) {
- case 0: return child->thread.debugreg0;
- case 1: return child->thread.debugreg1;
- case 2: return child->thread.debugreg2;
- case 3: return child->thread.debugreg3;
- case 6: return child->thread.debugreg6;
- case 7: return child->thread.debugreg7;
+ int i;
+ int dr7 = 0;
+ struct arch_hw_breakpoint *info;
+
+ for (i = 0; i < HBP_NUM; i++) {
+ if (bp[i] && !bp[i]->attr.disabled) {
+ info = counter_arch_bp(bp[i]);
+ dr7 |= encode_dr7(i, info->len, info->type);
+ }
}
- return 0;
+
+ return dr7;
}
-static int ptrace_set_debugreg(struct task_struct *child,
- int n, unsigned long data)
+static struct perf_event *
+ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
+ struct task_struct *tsk, int disabled)
{
- int i;
+ int err;
+ int gen_len, gen_type;
+ DEFINE_BREAKPOINT_ATTR(attr);
- if (unlikely(n == 4 || n == 5))
- return -EIO;
+ /*
+ * We shoud have at least an inactive breakpoint at this
+ * slot. It means the user is writing dr7 without having
+ * written the address register first
+ */
+ if (!bp)
+ return ERR_PTR(-EINVAL);
- if (n < 4 && unlikely(data >= debugreg_addr_limit(child)))
- return -EIO;
+ err = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
+ if (err)
+ return ERR_PTR(err);
- switch (n) {
- case 0: child->thread.debugreg0 = data; break;
- case 1: child->thread.debugreg1 = data; break;
- case 2: child->thread.debugreg2 = data; break;
- case 3: child->thread.debugreg3 = data; break;
+ attr = bp->attr;
+ attr.bp_len = gen_len;
+ attr.bp_type = gen_type;
+ attr.disabled = disabled;
- case 6:
- if ((data & ~0xffffffffUL) != 0)
- return -EIO;
- child->thread.debugreg6 = data;
- break;
+ return modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk);
+}
+
+/*
+ * Handle ptrace writes to debug register 7.
+ */
+static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
+{
+ struct thread_struct *thread = &(tsk->thread);
+ unsigned long old_dr7;
+ int i, orig_ret = 0, rc = 0;
+ int enabled, second_pass = 0;
+ unsigned len, type;
+ struct perf_event *bp;
+
+ data &= ~DR_CONTROL_RESERVED;
+ old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
+restore:
+ /*
+ * Loop through all the hardware breakpoints, making the
+ * appropriate changes to each.
+ */
+ for (i = 0; i < HBP_NUM; i++) {
+ enabled = decode_dr7(data, i, &len, &type);
+ bp = thread->ptrace_bps[i];
+
+ if (!enabled) {
+ if (bp) {
+ /*
+ * Don't unregister the breakpoints right-away,
+ * unless all register_user_hw_breakpoint()
+ * requests have succeeded. This prevents
+ * any window of opportunity for debug
+ * register grabbing by other users.
+ */
+ if (!second_pass)
+ continue;
+
+ thread->ptrace_bps[i] = NULL;
+ bp = ptrace_modify_breakpoint(bp, len, type,
+ tsk, 1);
+ if (IS_ERR(bp)) {
+ rc = PTR_ERR(bp);
+ thread->ptrace_bps[i] = NULL;
+ break;
+ }
+ thread->ptrace_bps[i] = bp;
+ }
+ continue;
+ }
- case 7:
+ bp = ptrace_modify_breakpoint(bp, len, type, tsk, 0);
+
+ /* Incorrect bp, or we have a bug in bp API */
+ if (IS_ERR(bp)) {
+ rc = PTR_ERR(bp);
+ thread->ptrace_bps[i] = NULL;
+ break;
+ }
+ thread->ptrace_bps[i] = bp;
+ }
+ /*
+ * Make a second pass to free the remaining unused breakpoints
+ * or to restore the original breakpoints if an error occurred.
+ */
+ if (!second_pass) {
+ second_pass = 1;
+ if (rc < 0) {
+ orig_ret = rc;
+ data = old_dr7;
+ }
+ goto restore;
+ }
+ return ((orig_ret < 0) ? orig_ret : rc);
+}
+
+/*
+ * Handle PTRACE_PEEKUSR calls for the debug register area.
+ */
+static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
+{
+ struct thread_struct *thread = &(tsk->thread);
+ unsigned long val = 0;
+
+ if (n < HBP_NUM) {
+ struct perf_event *bp;
+ bp = thread->ptrace_bps[n];
+ if (!bp)
+ return 0;
+ val = bp->hw.info.address;
+ } else if (n == 6) {
+ val = thread->debugreg6;
+ } else if (n == 7) {
+ val = ptrace_get_dr7(thread->ptrace_bps);
+ }
+ return val;
+}
+
+static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
+ unsigned long addr)
+{
+ struct perf_event *bp;
+ struct thread_struct *t = &tsk->thread;
+ DEFINE_BREAKPOINT_ATTR(attr);
+
+ if (!t->ptrace_bps[nr]) {
/*
- * Sanity-check data. Take one half-byte at once with
- * check = (val >> (16 + 4*i)) & 0xf. It contains the
- * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
- * 2 and 3 are LENi. Given a list of invalid values,
- * we do mask |= 1 << invalid_value, so that
- * (mask >> check) & 1 is a correct test for invalid
- * values.
- *
- * R/Wi contains the type of the breakpoint /
- * watchpoint, LENi contains the length of the watched
- * data in the watchpoint case.
- *
- * The invalid values are:
- * - LENi == 0x10 (undefined), so mask |= 0x0f00. [32-bit]
- * - R/Wi == 0x10 (break on I/O reads or writes), so
- * mask |= 0x4444.
- * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
- * 0x1110.
- *
- * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
- *
- * See the Intel Manual "System Programming Guide",
- * 15.2.4
- *
- * Note that LENi == 0x10 is defined on x86_64 in long
- * mode (i.e. even for 32-bit userspace software, but
- * 64-bit kernel), so the x86_64 mask value is 0x5454.
- * See the AMD manual no. 24593 (AMD64 System Programming)
+ * Put stub len and type to register (reserve) an inactive but
+ * correct bp
*/
-#ifdef CONFIG_X86_32
-#define DR7_MASK 0x5f54
-#else
-#define DR7_MASK 0x5554
-#endif
- data &= ~DR_CONTROL_RESERVED;
- for (i = 0; i < 4; i++)
- if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1)
- return -EIO;
- child->thread.debugreg7 = data;
- if (data)
- set_tsk_thread_flag(child, TIF_DEBUG);
- else
- clear_tsk_thread_flag(child, TIF_DEBUG);
- break;
+ attr.bp_addr = addr;
+ attr.bp_len = HW_BREAKPOINT_LEN_1;
+ attr.bp_type = HW_BREAKPOINT_W;
+ attr.disabled = 1;
+
+ bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk);
+ } else {
+ bp = t->ptrace_bps[nr];
+ t->ptrace_bps[nr] = NULL;
+
+ attr = bp->attr;
+ attr.bp_addr = addr;
+ bp = modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk);
}
+ /*
+ * CHECKME: the previous code returned -EIO if the addr wasn't a
+ * valid task virtual addr. The new one will return -EINVAL in this
+ * case.
+ * -EINVAL may be what we want for in-kernel breakpoints users, but
+ * -EIO looks better for ptrace, since we refuse a register writing
+ * for the user. And anyway this is the previous behaviour.
+ */
+ if (IS_ERR(bp))
+ return PTR_ERR(bp);
+
+ t->ptrace_bps[nr] = bp;
return 0;
}
/*
+ * Handle PTRACE_POKEUSR calls for the debug register area.
+ */
+int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
+{
+ struct thread_struct *thread = &(tsk->thread);
+ int rc = 0;
+
+ /* There are no DR4 or DR5 registers */
+ if (n == 4 || n == 5)
+ return -EIO;
+
+ if (n == 6) {
+ thread->debugreg6 = val;
+ goto ret_path;
+ }
+ if (n < HBP_NUM) {
+ rc = ptrace_set_breakpoint_addr(tsk, n, val);
+ if (rc)
+ return rc;
+ }
+ /* All that's left is DR7 */
+ if (n == 7)
+ rc = ptrace_write_dr7(tsk, val);
+
+ret_path:
+ return rc;
+}
+
+/*
* These access the current or another (stopped) task's io permission
* bitmap for debugging or core dump.
*/
@@ -1126,10 +1357,15 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)
case offsetof(struct user32, regs.orig_eax):
/*
- * Sign-extend the value so that orig_eax = -1
- * causes (long)orig_ax < 0 tests to fire correctly.
+ * A 32-bit debugger setting orig_eax means to restore
+ * the state of the task restarting a 32-bit syscall.
+ * Make sure we interpret the -ERESTART* codes correctly
+ * in case the task is not actually still sitting at the
+ * exit from a 32-bit syscall with TS_COMPAT still set.
*/
- regs->orig_ax = (long) (s32) value;
+ regs->orig_ax = value;
+ if (syscall_get_nr(child, regs) >= 0)
+ task_thread_info(child)->status |= TS_COMPAT;
break;
case offsetof(struct user32, regs.eflags):
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index af71d06624b..6c3b2c6fd77 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -508,7 +508,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
pci_read_config_dword(nb_ht, 0x60, &val);
set_dev_node(&dev->dev, val & 7);
- pci_dev_put(dev);
+ pci_dev_put(nb_ht);
}
DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index a06e8d10184..2b97fc5b124 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -4,6 +4,8 @@
#include <linux/pm.h>
#include <linux/efi.h>
#include <linux/dmi.h>
+#include <linux/sched.h>
+#include <linux/tboot.h>
#include <acpi/reboot.h>
#include <asm/io.h>
#include <asm/apic.h>
@@ -21,7 +23,7 @@
# include <linux/ctype.h>
# include <linux/mc146818rtc.h>
#else
-# include <asm/iommu.h>
+# include <asm/x86_init.h>
#endif
/*
@@ -434,6 +436,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),
},
},
+ { /* Handle problems with rebooting on Apple Macmini3,1 */
+ .callback = set_pci_reboot,
+ .ident = "Apple Macmini3,1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"),
+ },
+ },
{ }
};
@@ -508,6 +518,8 @@ static void native_machine_emergency_restart(void)
if (reboot_emergency)
emergency_vmx_disable_all();
+ tboot_shutdown(TB_SHUTDOWN_REBOOT);
+
/* Tell the BIOS if we want cold or warm reboot */
*((unsigned short *)__va(0x472)) = reboot_mode;
@@ -610,7 +622,7 @@ void native_machine_shutdown(void)
#endif
#ifdef CONFIG_X86_64
- pci_iommu_shutdown();
+ x86_platform.iommu_shutdown();
#endif
}
@@ -634,6 +646,8 @@ static void native_machine_halt(void)
/* stop other cpus and apics */
machine_shutdown();
+ tboot_shutdown(TB_SHUTDOWN_HALT);
+
/* stop this cpu */
stop_this_cpu(NULL);
}
@@ -645,6 +659,8 @@ static void native_machine_power_off(void)
machine_shutdown();
pm_power_off();
}
+ /* a fallback in case there is no PM info available */
+ tboot_shutdown(TB_SHUTDOWN_HALT);
}
struct machine_ops machine_ops = {
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 5d465b207e7..1cfbbfc3ae2 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -8,6 +8,7 @@
#include <linux/pnp.h>
#include <asm/vsyscall.h>
+#include <asm/x86_init.h>
#include <asm/time.h>
#ifdef CONFIG_X86_32
@@ -165,33 +166,29 @@ void rtc_cmos_write(unsigned char val, unsigned char addr)
}
EXPORT_SYMBOL(rtc_cmos_write);
-static int set_rtc_mmss(unsigned long nowtime)
+int update_persistent_clock(struct timespec now)
{
unsigned long flags;
int retval;
spin_lock_irqsave(&rtc_lock, flags);
- retval = set_wallclock(nowtime);
+ retval = x86_platform.set_wallclock(now.tv_sec);
spin_unlock_irqrestore(&rtc_lock, flags);
return retval;
}
/* not static: needed by APM */
-unsigned long read_persistent_clock(void)
+void read_persistent_clock(struct timespec *ts)
{
unsigned long retval, flags;
spin_lock_irqsave(&rtc_lock, flags);
- retval = get_wallclock();
+ retval = x86_platform.get_wallclock();
spin_unlock_irqrestore(&rtc_lock, flags);
- return retval;
-}
-
-int update_persistent_clock(struct timespec now)
-{
- return set_rtc_mmss(now.tv_sec);
+ ts->tv_sec = retval;
+ ts->tv_nsec = 0;
}
unsigned long long native_read_tsc(void)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 63f32d220ef..82e88cdda9b 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -27,6 +27,7 @@
#include <linux/screen_info.h>
#include <linux/ioport.h>
#include <linux/acpi.h>
+#include <linux/sfi.h>
#include <linux/apm_bios.h>
#include <linux/initrd.h>
#include <linux/bootmem.h>
@@ -66,6 +67,7 @@
#include <linux/percpu.h>
#include <linux/crash_dump.h>
+#include <linux/tboot.h>
#include <video/edid.h>
@@ -107,10 +109,7 @@
#ifdef CONFIG_X86_64
#include <asm/numa_64.h>
#endif
-
-#ifndef ARCH_SETUP
-#define ARCH_SETUP
-#endif
+#include <asm/mce.h>
/*
* end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -133,9 +132,9 @@ int default_cpu_present_to_apicid(int mps_cpu)
return __default_cpu_present_to_apicid(mps_cpu);
}
-int default_check_phys_apicid_present(int boot_cpu_physical_apicid)
+int default_check_phys_apicid_present(int phys_apicid)
{
- return __default_check_phys_apicid_present(boot_cpu_physical_apicid);
+ return __default_check_phys_apicid_present(phys_apicid);
}
#endif
@@ -171,13 +170,6 @@ static struct resource bss_resource = {
#ifdef CONFIG_X86_32
-static struct resource video_ram_resource = {
- .name = "Video RAM area",
- .start = 0xa0000,
- .end = 0xbffff,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
/* cpu data as detected by the assembly code in head.S */
struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
/* common cpu data for all cpus */
@@ -256,7 +248,7 @@ EXPORT_SYMBOL(edd);
* from boot_params into a safe place.
*
*/
-static inline void copy_edd(void)
+static inline void __init copy_edd(void)
{
memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
sizeof(edd.mbr_signature));
@@ -265,7 +257,7 @@ static inline void copy_edd(void)
edd.edd_info_nr = boot_params.eddbuf_entries;
}
#else
-static inline void copy_edd(void)
+static inline void __init copy_edd(void)
{
}
#endif
@@ -605,7 +597,7 @@ static struct resource standard_io_resources[] = {
.flags = IORESOURCE_BUSY | IORESOURCE_IO }
};
-static void __init reserve_standard_io_resources(void)
+void __init reserve_standard_io_resources(void)
{
int i;
@@ -637,10 +629,6 @@ static int __init setup_elfcorehdr(char *arg)
early_param("elfcorehdr", setup_elfcorehdr);
#endif
-static struct x86_quirks default_x86_quirks __initdata;
-
-struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
-
#ifdef CONFIG_X86_RESERVE_LOW_64K
static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
{
@@ -673,6 +661,13 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
},
},
{
+ .callback = dmi_low_memory_corruption,
+ .ident = "Phoenix/MSC BIOS",
+ .matches = {
+ DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"),
+ },
+ },
+ {
/*
* AMI BIOS with low memory corruption was found on Intel DG45ID board.
* It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
@@ -757,7 +752,7 @@ void __init setup_arch(char **cmdline_p)
}
#endif
- ARCH_SETUP
+ x86_init.oem.arch_setup();
setup_memory_map();
parse_setup_data();
@@ -796,6 +791,16 @@ void __init setup_arch(char **cmdline_p)
strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
*cmdline_p = command_line;
+#ifdef CONFIG_X86_64
+ /*
+ * Must call this twice: Once just to detect whether hardware doesn't
+ * support NX (so that the early EHCI debug console setup can safely
+ * call set_fixmap(), and then again after parsing early parameters to
+ * honor the respective command line option.
+ */
+ check_efer();
+#endif
+
parse_early_param();
#ifdef CONFIG_X86_64
@@ -833,11 +838,9 @@ void __init setup_arch(char **cmdline_p)
* VMware detection requires dmi to be available, so this
* needs to be done after dmi_scan_machine, for the BP.
*/
- init_hypervisor(&boot_cpu_data);
+ init_hypervisor_platform();
-#ifdef CONFIG_X86_32
- probe_roms();
-#endif
+ x86_init.resources.probe_roms();
/* after parse_early_param, so could debug it */
insert_resource(&iomem_resource, &code_resource);
@@ -972,10 +975,11 @@ void __init setup_arch(char **cmdline_p)
kvmclock_init();
#endif
- paravirt_pagetable_setup_start(swapper_pg_dir);
+ x86_init.paging.pagetable_setup_start(swapper_pg_dir);
paging_init();
- paravirt_pagetable_setup_done(swapper_pg_dir);
- paravirt_post_allocator_init();
+ x86_init.paging.pagetable_setup_done(swapper_pg_dir);
+
+ tboot_probe();
#ifdef CONFIG_X86_64
map_vsyscall();
@@ -990,13 +994,13 @@ void __init setup_arch(char **cmdline_p)
*/
acpi_boot_init();
-#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
+ sfi_init();
+
/*
* get boot-time SMP configuration:
*/
if (smp_found_config)
get_smp_config();
-#endif
prefill_possible_map();
@@ -1015,10 +1019,7 @@ void __init setup_arch(char **cmdline_p)
e820_reserve_resources();
e820_mark_nosave_regions(max_low_pfn);
-#ifdef CONFIG_X86_32
- request_resource(&iomem_resource, &video_ram_resource);
-#endif
- reserve_standard_io_resources();
+ x86_init.resources.reserve_resources();
e820_setup_gap();
@@ -1030,78 +1031,24 @@ void __init setup_arch(char **cmdline_p)
conswitchp = &dummy_con;
#endif
#endif
-}
+ x86_init.oem.banner();
-#ifdef CONFIG_X86_32
-
-/**
- * x86_quirk_intr_init - post gate setup interrupt initialisation
- *
- * Description:
- * Fill in any interrupts that may have been left out by the general
- * init_IRQ() routine. interrupts having to do with the machine rather
- * than the devices on the I/O bus (like APIC interrupts in intel MP
- * systems) are started here.
- **/
-void __init x86_quirk_intr_init(void)
-{
- if (x86_quirks->arch_intr_init) {
- if (x86_quirks->arch_intr_init())
- return;
- }
+ mcheck_init();
}
-/**
- * x86_quirk_trap_init - initialise system specific traps
- *
- * Description:
- * Called as the final act of trap_init(). Used in VISWS to initialise
- * the various board specific APIC traps.
- **/
-void __init x86_quirk_trap_init(void)
-{
- if (x86_quirks->arch_trap_init) {
- if (x86_quirks->arch_trap_init())
- return;
- }
-}
+#ifdef CONFIG_X86_32
-static struct irqaction irq0 = {
- .handler = timer_interrupt,
- .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
- .name = "timer"
+static struct resource video_ram_resource = {
+ .name = "Video RAM area",
+ .start = 0xa0000,
+ .end = 0xbffff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
};
-/**
- * x86_quirk_pre_time_init - do any specific initialisations before.
- *
- **/
-void __init x86_quirk_pre_time_init(void)
+void __init i386_reserve_resources(void)
{
- if (x86_quirks->arch_pre_time_init)
- x86_quirks->arch_pre_time_init();
+ request_resource(&iomem_resource, &video_ram_resource);
+ reserve_standard_io_resources();
}
-/**
- * x86_quirk_time_init - do any specific initialisations for the system timer.
- *
- * Description:
- * Must plug the system timer interrupt source at HZ into the IRQ listed
- * in irq_vectors.h:TIMER_IRQ
- **/
-void __init x86_quirk_time_init(void)
-{
- if (x86_quirks->arch_time_init) {
- /*
- * A nonzero return code does not mean failure, it means
- * that the architecture quirk does not want any
- * generic (timer) setup to be performed after this:
- */
- if (x86_quirks->arch_time_init())
- return;
- }
-
- irq0.mask = cpumask_of_cpu(0);
- setup_irq(0, &irq0);
-}
#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 07d81916f21..d559af913e1 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -55,6 +55,7 @@ EXPORT_SYMBOL(__per_cpu_offset);
#define PERCPU_FIRST_CHUNK_RESERVE 0
#endif
+#ifdef CONFIG_X86_32
/**
* pcpu_need_numa - determine percpu allocation needs to consider NUMA
*
@@ -83,6 +84,7 @@ static bool __init pcpu_need_numa(void)
#endif
return false;
}
+#endif
/**
* pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
@@ -124,308 +126,35 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
}
/*
- * Large page remap allocator
- *
- * This allocator uses PMD page as unit. A PMD page is allocated for
- * each cpu and each is remapped into vmalloc area using PMD mapping.
- * As PMD page is quite large, only part of it is used for the first
- * chunk. Unused part is returned to the bootmem allocator.
- *
- * So, the PMD pages are mapped twice - once to the physical mapping
- * and to the vmalloc area for the first percpu chunk. The double
- * mapping does add one more PMD TLB entry pressure but still is much
- * better than only using 4k mappings while still being NUMA friendly.
+ * Helpers for first chunk memory allocation
*/
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-struct pcpul_ent {
- unsigned int cpu;
- void *ptr;
-};
-
-static size_t pcpul_size;
-static struct pcpul_ent *pcpul_map;
-static struct vm_struct pcpul_vm;
-
-static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
+static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
{
- size_t off = (size_t)pageno << PAGE_SHIFT;
-
- if (off >= pcpul_size)
- return NULL;
-
- return virt_to_page(pcpul_map[cpu].ptr + off);
+ return pcpu_alloc_bootmem(cpu, size, align);
}
-static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
+static void __init pcpu_fc_free(void *ptr, size_t size)
{
- size_t map_size, dyn_size;
- unsigned int cpu;
- int i, j;
- ssize_t ret;
-
- if (!chosen) {
- size_t vm_size = VMALLOC_END - VMALLOC_START;
- size_t tot_size = nr_cpu_ids * PMD_SIZE;
-
- /* on non-NUMA, embedding is better */
- if (!pcpu_need_numa())
- return -EINVAL;
-
- /* don't consume more than 20% of vmalloc area */
- if (tot_size > vm_size / 5) {
- pr_info("PERCPU: too large chunk size %zuMB for "
- "large page remap\n", tot_size >> 20);
- return -EINVAL;
- }
- }
-
- /* need PSE */
- if (!cpu_has_pse) {
- pr_warning("PERCPU: lpage allocator requires PSE\n");
- return -EINVAL;
- }
-
- /*
- * Currently supports only single page. Supporting multiple
- * pages won't be too difficult if it ever becomes necessary.
- */
- pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
- PERCPU_DYNAMIC_RESERVE);
- if (pcpul_size > PMD_SIZE) {
- pr_warning("PERCPU: static data is larger than large page, "
- "can't use large page\n");
- return -EINVAL;
- }
- dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
-
- /* allocate pointer array and alloc large pages */
- map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0]));
- pcpul_map = alloc_bootmem(map_size);
-
- for_each_possible_cpu(cpu) {
- pcpul_map[cpu].cpu = cpu;
- pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
- PMD_SIZE);
- if (!pcpul_map[cpu].ptr) {
- pr_warning("PERCPU: failed to allocate large page "
- "for cpu%u\n", cpu);
- goto enomem;
- }
-
- /*
- * Only use pcpul_size bytes and give back the rest.
- *
- * Ingo: The 2MB up-rounding bootmem is needed to make
- * sure the partial 2MB page is still fully RAM - it's
- * not well-specified to have a PAT-incompatible area
- * (unmapped RAM, device memory, etc.) in that hole.
- */
- free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size),
- PMD_SIZE - pcpul_size);
-
- memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size);
- }
-
- /* allocate address and map */
- pcpul_vm.flags = VM_ALLOC;
- pcpul_vm.size = nr_cpu_ids * PMD_SIZE;
- vm_area_register_early(&pcpul_vm, PMD_SIZE);
-
- for_each_possible_cpu(cpu) {
- pmd_t *pmd, pmd_v;
-
- pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr +
- cpu * PMD_SIZE);
- pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)),
- PAGE_KERNEL_LARGE);
- set_pmd(pmd, pmd_v);
- }
-
- /* we're ready, commit */
- pr_info("PERCPU: Remapped at %p with large pages, static data "
- "%zu bytes\n", pcpul_vm.addr, static_size);
-
- ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
- PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
- PMD_SIZE, pcpul_vm.addr, NULL);
-
- /* sort pcpul_map array for pcpu_lpage_remapped() */
- for (i = 0; i < nr_cpu_ids - 1; i++)
- for (j = i + 1; j < nr_cpu_ids; j++)
- if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
- struct pcpul_ent tmp = pcpul_map[i];
- pcpul_map[i] = pcpul_map[j];
- pcpul_map[j] = tmp;
- }
-
- return ret;
-
-enomem:
- for_each_possible_cpu(cpu)
- if (pcpul_map[cpu].ptr)
- free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size);
- free_bootmem(__pa(pcpul_map), map_size);
- return -ENOMEM;
+ free_bootmem(__pa(ptr), size);
}
-/**
- * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
- * @kaddr: the kernel address in question
- *
- * Determine whether @kaddr falls in the pcpul recycled area. This is
- * used by pageattr to detect VM aliases and break up the pcpu PMD
- * mapping such that the same physical page is not mapped under
- * different attributes.
- *
- * The recycled area is always at the tail of a partially used PMD
- * page.
- *
- * RETURNS:
- * Address of corresponding remapped pcpu address if match is found;
- * otherwise, NULL.
- */
-void *pcpu_lpage_remapped(void *kaddr)
+static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
{
- void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK);
- unsigned long offset = (unsigned long)kaddr & ~PMD_MASK;
- int left = 0, right = nr_cpu_ids - 1;
- int pos;
-
- /* pcpul in use at all? */
- if (!pcpul_map)
- return NULL;
-
- /* okay, perform binary search */
- while (left <= right) {
- pos = (left + right) / 2;
-
- if (pcpul_map[pos].ptr < pmd_addr)
- left = pos + 1;
- else if (pcpul_map[pos].ptr > pmd_addr)
- right = pos - 1;
- else {
- /* it shouldn't be in the area for the first chunk */
- WARN_ON(offset < pcpul_size);
-
- return pcpul_vm.addr +
- pcpul_map[pos].cpu * PMD_SIZE + offset;
- }
- }
-
- return NULL;
-}
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+ if (early_cpu_to_node(from) == early_cpu_to_node(to))
+ return LOCAL_DISTANCE;
+ else
+ return REMOTE_DISTANCE;
#else
-static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
-{
- return -EINVAL;
-}
+ return LOCAL_DISTANCE;
#endif
-
-/*
- * Embedding allocator
- *
- * The first chunk is sized to just contain the static area plus
- * module and dynamic reserves and embedded into linear physical
- * mapping so that it can use PMD mapping without additional TLB
- * pressure.
- */
-static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen)
-{
- size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
-
- /*
- * If large page isn't supported, there's no benefit in doing
- * this. Also, embedding allocation doesn't play well with
- * NUMA.
- */
- if (!chosen && (!cpu_has_pse || pcpu_need_numa()))
- return -EINVAL;
-
- return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
- reserve - PERCPU_FIRST_CHUNK_RESERVE, -1);
}
-/*
- * 4k page allocator
- *
- * This is the basic allocator. Static percpu area is allocated
- * page-by-page and most of initialization is done by the generic
- * setup function.
- */
-static struct page **pcpu4k_pages __initdata;
-static int pcpu4k_nr_static_pages __initdata;
-
-static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
-{
- if (pageno < pcpu4k_nr_static_pages)
- return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
- return NULL;
-}
-
-static void __init pcpu4k_populate_pte(unsigned long addr)
+static void __init pcpup_populate_pte(unsigned long addr)
{
populate_extra_pte(addr);
}
-static ssize_t __init setup_pcpu_4k(size_t static_size)
-{
- size_t pages_size;
- unsigned int cpu;
- int i, j;
- ssize_t ret;
-
- pcpu4k_nr_static_pages = PFN_UP(static_size);
-
- /* unaligned allocations can't be freed, round up to page size */
- pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids
- * sizeof(pcpu4k_pages[0]));
- pcpu4k_pages = alloc_bootmem(pages_size);
-
- /* allocate and copy */
- j = 0;
- for_each_possible_cpu(cpu)
- for (i = 0; i < pcpu4k_nr_static_pages; i++) {
- void *ptr;
-
- ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
- if (!ptr) {
- pr_warning("PERCPU: failed to allocate "
- "4k page for cpu%u\n", cpu);
- goto enomem;
- }
-
- memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
- pcpu4k_pages[j++] = virt_to_page(ptr);
- }
-
- /* we're ready, commit */
- pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
- pcpu4k_nr_static_pages, static_size);
-
- ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
- PERCPU_FIRST_CHUNK_RESERVE, -1,
- -1, NULL, pcpu4k_populate_pte);
- goto out_free_ar;
-
-enomem:
- while (--j >= 0)
- free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
- ret = -ENOMEM;
-out_free_ar:
- free_bootmem(__pa(pcpu4k_pages), pages_size);
- return ret;
-}
-
-/* for explicit first chunk allocator selection */
-static char pcpu_chosen_alloc[16] __initdata;
-
-static int __init percpu_alloc_setup(char *str)
-{
- strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1);
- return 0;
-}
-early_param("percpu_alloc", percpu_alloc_setup);
-
static inline void setup_percpu_segment(int cpu)
{
#ifdef CONFIG_X86_32
@@ -441,52 +170,49 @@ static inline void setup_percpu_segment(int cpu)
void __init setup_per_cpu_areas(void)
{
- size_t static_size = __per_cpu_end - __per_cpu_start;
unsigned int cpu;
unsigned long delta;
- size_t pcpu_unit_size;
- ssize_t ret;
+ int rc;
pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
/*
- * Allocate percpu area. If PSE is supported, try to make use
- * of large page mappings. Please read comments on top of
- * each allocator for details.
+ * Allocate percpu area. Embedding allocator is our favorite;
+ * however, on NUMA configurations, it can result in very
+ * sparse unit mapping and vmalloc area isn't spacious enough
+ * on 32bit. Use page in that case.
*/
- ret = -EINVAL;
- if (strlen(pcpu_chosen_alloc)) {
- if (strcmp(pcpu_chosen_alloc, "4k")) {
- if (!strcmp(pcpu_chosen_alloc, "lpage"))
- ret = setup_pcpu_lpage(static_size, true);
- else if (!strcmp(pcpu_chosen_alloc, "embed"))
- ret = setup_pcpu_embed(static_size, true);
- else
- pr_warning("PERCPU: unknown allocator %s "
- "specified\n", pcpu_chosen_alloc);
- if (ret < 0)
- pr_warning("PERCPU: %s allocator failed (%zd), "
- "falling back to 4k\n",
- pcpu_chosen_alloc, ret);
- }
- } else {
- ret = setup_pcpu_lpage(static_size, false);
- if (ret < 0)
- ret = setup_pcpu_embed(static_size, false);
+#ifdef CONFIG_X86_32
+ if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
+ pcpu_chosen_fc = PCPU_FC_PAGE;
+#endif
+ rc = -EINVAL;
+ if (pcpu_chosen_fc != PCPU_FC_PAGE) {
+ const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
+ const size_t dyn_size = PERCPU_MODULE_RESERVE +
+ PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
+
+ rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
+ dyn_size, atom_size,
+ pcpu_cpu_distance,
+ pcpu_fc_alloc, pcpu_fc_free);
+ if (rc < 0)
+ pr_warning("PERCPU: %s allocator failed (%d), "
+ "falling back to page size\n",
+ pcpu_fc_names[pcpu_chosen_fc], rc);
}
- if (ret < 0)
- ret = setup_pcpu_4k(static_size);
- if (ret < 0)
- panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
- static_size, ret);
-
- pcpu_unit_size = ret;
+ if (rc < 0)
+ rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
+ pcpu_fc_alloc, pcpu_fc_free,
+ pcpup_populate_pte);
+ if (rc < 0)
+ panic("cannot initialize percpu area (err=%d)", rc);
/* alrighty, percpu areas up and running */
delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
for_each_possible_cpu(cpu) {
- per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
+ per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
per_cpu(cpu_number, cpu) = cpu;
setup_percpu_segment(cpu);
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c
new file mode 100644
index 00000000000..34e09938265
--- /dev/null
+++ b/arch/x86/kernel/sfi.c
@@ -0,0 +1,122 @@
+/*
+ * sfi.c - x86 architecture SFI support.
+ *
+ * Copyright (c) 2009, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#define KMSG_COMPONENT "SFI"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/acpi.h>
+#include <linux/init.h>
+#include <linux/sfi.h>
+#include <linux/io.h>
+
+#include <asm/io_apic.h>
+#include <asm/mpspec.h>
+#include <asm/setup.h>
+#include <asm/apic.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
+
+void __init mp_sfi_register_lapic_address(unsigned long address)
+{
+ mp_lapic_addr = address;
+
+ set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
+ if (boot_cpu_physical_apicid == -1U)
+ boot_cpu_physical_apicid = read_apic_id();
+
+ pr_info("Boot CPU = %d\n", boot_cpu_physical_apicid);
+}
+
+/* All CPUs enumerated by SFI must be present and enabled */
+void __cpuinit mp_sfi_register_lapic(u8 id)
+{
+ if (MAX_APICS - id <= 0) {
+ pr_warning("Processor #%d invalid (max %d)\n",
+ id, MAX_APICS);
+ return;
+ }
+
+ pr_info("registering lapic[%d]\n", id);
+
+ generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR)));
+}
+
+static int __init sfi_parse_cpus(struct sfi_table_header *table)
+{
+ struct sfi_table_simple *sb;
+ struct sfi_cpu_table_entry *pentry;
+ int i;
+ int cpu_num;
+
+ sb = (struct sfi_table_simple *)table;
+ cpu_num = SFI_GET_NUM_ENTRIES(sb, struct sfi_cpu_table_entry);
+ pentry = (struct sfi_cpu_table_entry *)sb->pentry;
+
+ for (i = 0; i < cpu_num; i++) {
+ mp_sfi_register_lapic(pentry->apic_id);
+ pentry++;
+ }
+
+ smp_found_config = 1;
+ return 0;
+}
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#ifdef CONFIG_X86_IO_APIC
+static u32 gsi_base;
+
+static int __init sfi_parse_ioapic(struct sfi_table_header *table)
+{
+ struct sfi_table_simple *sb;
+ struct sfi_apic_table_entry *pentry;
+ int i, num;
+
+ sb = (struct sfi_table_simple *)table;
+ num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry);
+ pentry = (struct sfi_apic_table_entry *)sb->pentry;
+
+ for (i = 0; i < num; i++) {
+ mp_register_ioapic(i, pentry->phys_addr, gsi_base);
+ gsi_base += io_apic_get_redir_entries(i);
+ pentry++;
+ }
+
+ WARN(pic_mode, KERN_WARNING
+ "SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n");
+ pic_mode = 0;
+ return 0;
+}
+#endif /* CONFIG_X86_IO_APIC */
+
+/*
+ * sfi_platform_init(): register lapics & io-apics
+ */
+int __init sfi_platform_init(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+ mp_sfi_register_lapic_address(sfi_lapic_addr);
+ sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus);
+#endif
+#ifdef CONFIG_X86_IO_APIC
+ sfi_table_parse(SFI_SIG_APIC, NULL, NULL, sfi_parse_ioapic);
+#endif
+ return 0;
+}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 81e58238c4c..fbf3b07c856 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -799,15 +799,6 @@ static void do_signal(struct pt_regs *regs)
signr = get_signal_to_deliver(&info, &ka, regs, NULL);
if (signr > 0) {
- /*
- * Re-enable any watchpoints before delivering the
- * signal to user space. The processor register will
- * have been cleared if the watchpoint triggered
- * inside the kernel.
- */
- if (current->thread.debugreg7)
- set_debugreg(current->thread.debugreg7, 7);
-
/* Whee! Actually deliver the signal. */
if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
/*
@@ -856,7 +847,7 @@ static void do_signal(struct pt_regs *regs)
void
do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
{
-#ifdef CONFIG_X86_NEW_MCE
+#ifdef CONFIG_X86_MCE
/* notify userspace of pending MCEs */
if (thread_info_flags & _TIF_MCE_NOTIFY)
mce_notify_process();
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c36cc1452cd..324f2a44c22 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -47,6 +47,7 @@
#include <linux/bootmem.h>
#include <linux/err.h>
#include <linux/nmi.h>
+#include <linux/tboot.h>
#include <asm/acpi.h>
#include <asm/desc.h>
@@ -323,7 +324,7 @@ notrace static void __cpuinit start_secondary(void *unused)
/* enable local interrupts */
local_irq_enable();
- setup_secondary_clock();
+ x86_cpuinit.setup_percpu_clockev();
wmb();
cpu_idle();
@@ -1058,12 +1059,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
#endif
current_thread_info()->cpu = 0; /* needed? */
for_each_possible_cpu(i) {
- alloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
- alloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
- alloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL);
- cpumask_clear(per_cpu(cpu_core_map, i));
- cpumask_clear(per_cpu(cpu_sibling_map, i));
- cpumask_clear(cpu_data(i).llc_shared_map);
+ zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
+ zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
+ zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL);
}
set_cpu_sibling_map(0);
@@ -1113,13 +1111,26 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
printk(KERN_INFO "CPU%d: ", 0);
print_cpu_info(&cpu_data(0));
- setup_boot_clock();
+ x86_init.timers.setup_percpu_clockev();
if (is_uv_system())
uv_system_init();
+
+ set_mtrr_aps_delayed_init();
out:
preempt_enable();
}
+
+void arch_enable_nonboot_cpus_begin(void)
+{
+ set_mtrr_aps_delayed_init();
+}
+
+void arch_enable_nonboot_cpus_end(void)
+{
+ mtrr_aps_init();
+}
+
/*
* Early setup to make printk work.
*/
@@ -1141,6 +1152,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
setup_ioapic_dest();
#endif
check_nmi_watchdog();
+ mtrr_aps_init();
}
static int __initdata setup_possible_cpus = -1;
@@ -1238,16 +1250,7 @@ static void __ref remove_cpu_from_maps(int cpu)
void cpu_disable_common(void)
{
int cpu = smp_processor_id();
- /*
- * HACK:
- * Allow any queued timer interrupts to get serviced
- * This is only a temporary solution until we cleanup
- * fixup_irqs as we do for IA64.
- */
- local_irq_enable();
- mdelay(1);
- local_irq_disable();
remove_siblinginfo(cpu);
/* It's now safe to remove this processor from the online map */
@@ -1318,6 +1321,7 @@ void play_dead_common(void)
void native_play_dead(void)
{
play_dead_common();
+ tboot_shutdown(TB_SHUTDOWN_WFS);
wbinvd_halt();
}
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index d51321ddafd..0157cd26d7c 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -335,4 +335,4 @@ ENTRY(sys_call_table)
.long sys_preadv
.long sys_pwritev
.long sys_rt_tgsigqueueinfo /* 335 */
- .long sys_perf_counter_open
+ .long sys_perf_event_open
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
new file mode 100644
index 00000000000..86c9f91b48a
--- /dev/null
+++ b/arch/x86/kernel/tboot.c
@@ -0,0 +1,447 @@
+/*
+ * tboot.c: main implementation of helper functions used by kernel for
+ * runtime support of Intel(R) Trusted Execution Technology
+ *
+ * Copyright (c) 2006-2009, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#include <linux/dma_remapping.h>
+#include <linux/init_task.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/dmar.h>
+#include <linux/cpu.h>
+#include <linux/pfn.h>
+#include <linux/mm.h>
+#include <linux/tboot.h>
+
+#include <asm/trampoline.h>
+#include <asm/processor.h>
+#include <asm/bootparam.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/fixmap.h>
+#include <asm/proto.h>
+#include <asm/setup.h>
+#include <asm/e820.h>
+#include <asm/io.h>
+
+#include "acpi/realmode/wakeup.h"
+
+/* Global pointer to shared data; NULL means no measured launch. */
+struct tboot *tboot __read_mostly;
+
+/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */
+#define AP_WAIT_TIMEOUT 1
+
+#undef pr_fmt
+#define pr_fmt(fmt) "tboot: " fmt
+
+static u8 tboot_uuid[16] __initdata = TBOOT_UUID;
+
+void __init tboot_probe(void)
+{
+ /* Look for valid page-aligned address for shared page. */
+ if (!boot_params.tboot_addr)
+ return;
+ /*
+ * also verify that it is mapped as we expect it before calling
+ * set_fixmap(), to reduce chance of garbage value causing crash
+ */
+ if (!e820_any_mapped(boot_params.tboot_addr,
+ boot_params.tboot_addr, E820_RESERVED)) {
+ pr_warning("non-0 tboot_addr but it is not of type E820_RESERVED\n");
+ return;
+ }
+
+ /* only a natively booted kernel should be using TXT */
+ if (paravirt_enabled()) {
+ pr_warning("non-0 tboot_addr but pv_ops is enabled\n");
+ return;
+ }
+
+ /* Map and check for tboot UUID. */
+ set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr);
+ tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE);
+ if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) {
+ pr_warning("tboot at 0x%llx is invalid\n",
+ boot_params.tboot_addr);
+ tboot = NULL;
+ return;
+ }
+ if (tboot->version < 5) {
+ pr_warning("tboot version is invalid: %u\n", tboot->version);
+ tboot = NULL;
+ return;
+ }
+
+ pr_info("found shared page at phys addr 0x%llx:\n",
+ boot_params.tboot_addr);
+ pr_debug("version: %d\n", tboot->version);
+ pr_debug("log_addr: 0x%08x\n", tboot->log_addr);
+ pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry);
+ pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base);
+ pr_debug("tboot_size: 0x%x\n", tboot->tboot_size);
+}
+
+static pgd_t *tboot_pg_dir;
+static struct mm_struct tboot_mm = {
+ .mm_rb = RB_ROOT,
+ .pgd = swapper_pg_dir,
+ .mm_users = ATOMIC_INIT(2),
+ .mm_count = ATOMIC_INIT(1),
+ .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
+ .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
+ .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
+ .cpu_vm_mask = CPU_MASK_ALL,
+};
+
+static inline void switch_to_tboot_pt(void)
+{
+ write_cr3(virt_to_phys(tboot_pg_dir));
+}
+
+static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
+ pgprot_t prot)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset(&tboot_mm, vaddr);
+ pud = pud_alloc(&tboot_mm, pgd, vaddr);
+ if (!pud)
+ return -1;
+ pmd = pmd_alloc(&tboot_mm, pud, vaddr);
+ if (!pmd)
+ return -1;
+ pte = pte_alloc_map(&tboot_mm, pmd, vaddr);
+ if (!pte)
+ return -1;
+ set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
+ pte_unmap(pte);
+ return 0;
+}
+
+static int map_tboot_pages(unsigned long vaddr, unsigned long start_pfn,
+ unsigned long nr)
+{
+ /* Reuse the original kernel mapping */
+ tboot_pg_dir = pgd_alloc(&tboot_mm);
+ if (!tboot_pg_dir)
+ return -1;
+
+ for (; nr > 0; nr--, vaddr += PAGE_SIZE, start_pfn++) {
+ if (map_tboot_page(vaddr, start_pfn, PAGE_KERNEL_EXEC))
+ return -1;
+ }
+
+ return 0;
+}
+
+static void tboot_create_trampoline(void)
+{
+ u32 map_base, map_size;
+
+ /* Create identity map for tboot shutdown code. */
+ map_base = PFN_DOWN(tboot->tboot_base);
+ map_size = PFN_UP(tboot->tboot_size);
+ if (map_tboot_pages(map_base << PAGE_SHIFT, map_base, map_size))
+ panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n",
+ map_base, map_size);
+}
+
+#ifdef CONFIG_ACPI_SLEEP
+
+static void add_mac_region(phys_addr_t start, unsigned long size)
+{
+ struct tboot_mac_region *mr;
+ phys_addr_t end = start + size;
+
+ if (start && size) {
+ mr = &tboot->mac_regions[tboot->num_mac_regions++];
+ mr->start = round_down(start, PAGE_SIZE);
+ mr->size = round_up(end, PAGE_SIZE) - mr->start;
+ }
+}
+
+static int tboot_setup_sleep(void)
+{
+ tboot->num_mac_regions = 0;
+
+ /* S3 resume code */
+ add_mac_region(acpi_wakeup_address, WAKEUP_SIZE);
+
+#ifdef CONFIG_X86_TRAMPOLINE
+ /* AP trampoline code */
+ add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE);
+#endif
+
+ /* kernel code + data + bss */
+ add_mac_region(virt_to_phys(_text), _end - _text);
+
+ tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address;
+
+ return 0;
+}
+
+#else /* no CONFIG_ACPI_SLEEP */
+
+static int tboot_setup_sleep(void)
+{
+ /* S3 shutdown requested, but S3 not supported by the kernel... */
+ BUG();
+ return -1;
+}
+
+#endif
+
+void tboot_shutdown(u32 shutdown_type)
+{
+ void (*shutdown)(void);
+
+ if (!tboot_enabled())
+ return;
+
+ /*
+ * if we're being called before the 1:1 mapping is set up then just
+ * return and let the normal shutdown happen; this should only be
+ * due to very early panic()
+ */
+ if (!tboot_pg_dir)
+ return;
+
+ /* if this is S3 then set regions to MAC */
+ if (shutdown_type == TB_SHUTDOWN_S3)
+ if (tboot_setup_sleep())
+ return;
+
+ tboot->shutdown_type = shutdown_type;
+
+ switch_to_tboot_pt();
+
+ shutdown = (void(*)(void))(unsigned long)tboot->shutdown_entry;
+ shutdown();
+
+ /* should not reach here */
+ while (1)
+ halt();
+}
+
+static void tboot_copy_fadt(const struct acpi_table_fadt *fadt)
+{
+#define TB_COPY_GAS(tbg, g) \
+ tbg.space_id = g.space_id; \
+ tbg.bit_width = g.bit_width; \
+ tbg.bit_offset = g.bit_offset; \
+ tbg.access_width = g.access_width; \
+ tbg.address = g.address;
+
+ TB_COPY_GAS(tboot->acpi_sinfo.pm1a_cnt_blk, fadt->xpm1a_control_block);
+ TB_COPY_GAS(tboot->acpi_sinfo.pm1b_cnt_blk, fadt->xpm1b_control_block);
+ TB_COPY_GAS(tboot->acpi_sinfo.pm1a_evt_blk, fadt->xpm1a_event_block);
+ TB_COPY_GAS(tboot->acpi_sinfo.pm1b_evt_blk, fadt->xpm1b_event_block);
+
+ /*
+ * We need phys addr of waking vector, but can't use virt_to_phys() on
+ * &acpi_gbl_FACS because it is ioremap'ed, so calc from FACS phys
+ * addr.
+ */
+ tboot->acpi_sinfo.wakeup_vector = fadt->facs +
+ offsetof(struct acpi_table_facs, firmware_waking_vector);
+}
+
+void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
+{
+ static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = {
+ /* S0,1,2: */ -1, -1, -1,
+ /* S3: */ TB_SHUTDOWN_S3,
+ /* S4: */ TB_SHUTDOWN_S4,
+ /* S5: */ TB_SHUTDOWN_S5 };
+
+ if (!tboot_enabled())
+ return;
+
+ tboot_copy_fadt(&acpi_gbl_FADT);
+ tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control;
+ tboot->acpi_sinfo.pm1b_cnt_val = pm1b_control;
+ /* we always use the 32b wakeup vector */
+ tboot->acpi_sinfo.vector_width = 32;
+
+ if (sleep_state >= ACPI_S_STATE_COUNT ||
+ acpi_shutdown_map[sleep_state] == -1) {
+ pr_warning("unsupported sleep state 0x%x\n", sleep_state);
+ return;
+ }
+
+ tboot_shutdown(acpi_shutdown_map[sleep_state]);
+}
+
+static atomic_t ap_wfs_count;
+
+static int tboot_wait_for_aps(int num_aps)
+{
+ unsigned long timeout;
+
+ timeout = AP_WAIT_TIMEOUT*HZ;
+ while (atomic_read((atomic_t *)&tboot->num_in_wfs) != num_aps &&
+ timeout) {
+ mdelay(1);
+ timeout--;
+ }
+
+ if (timeout)
+ pr_warning("tboot wait for APs timeout\n");
+
+ return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps);
+}
+
+static int __cpuinit tboot_cpu_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ switch (action) {
+ case CPU_DYING:
+ atomic_inc(&ap_wfs_count);
+ if (num_online_cpus() == 1)
+ if (tboot_wait_for_aps(atomic_read(&ap_wfs_count)))
+ return NOTIFY_BAD;
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block tboot_cpu_notifier __cpuinitdata =
+{
+ .notifier_call = tboot_cpu_callback,
+};
+
+static __init int tboot_late_init(void)
+{
+ if (!tboot_enabled())
+ return 0;
+
+ tboot_create_trampoline();
+
+ atomic_set(&ap_wfs_count, 0);
+ register_hotcpu_notifier(&tboot_cpu_notifier);
+ return 0;
+}
+
+late_initcall(tboot_late_init);
+
+/*
+ * TXT configuration registers (offsets from TXT_{PUB, PRIV}_CONFIG_REGS_BASE)
+ */
+
+#define TXT_PUB_CONFIG_REGS_BASE 0xfed30000
+#define TXT_PRIV_CONFIG_REGS_BASE 0xfed20000
+
+/* # pages for each config regs space - used by fixmap */
+#define NR_TXT_CONFIG_PAGES ((TXT_PUB_CONFIG_REGS_BASE - \
+ TXT_PRIV_CONFIG_REGS_BASE) >> PAGE_SHIFT)
+
+/* offsets from pub/priv config space */
+#define TXTCR_HEAP_BASE 0x0300
+#define TXTCR_HEAP_SIZE 0x0308
+
+#define SHA1_SIZE 20
+
+struct sha1_hash {
+ u8 hash[SHA1_SIZE];
+};
+
+struct sinit_mle_data {
+ u32 version; /* currently 6 */
+ struct sha1_hash bios_acm_id;
+ u32 edx_senter_flags;
+ u64 mseg_valid;
+ struct sha1_hash sinit_hash;
+ struct sha1_hash mle_hash;
+ struct sha1_hash stm_hash;
+ struct sha1_hash lcp_policy_hash;
+ u32 lcp_policy_control;
+ u32 rlp_wakeup_addr;
+ u32 reserved;
+ u32 num_mdrs;
+ u32 mdrs_off;
+ u32 num_vtd_dmars;
+ u32 vtd_dmars_off;
+} __packed;
+
+struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tbl)
+{
+ void *heap_base, *heap_ptr, *config;
+
+ if (!tboot_enabled())
+ return dmar_tbl;
+
+ /*
+ * ACPI tables may not be DMA protected by tboot, so use DMAR copy
+ * SINIT saved in SinitMleData in TXT heap (which is DMA protected)
+ */
+
+ /* map config space in order to get heap addr */
+ config = ioremap(TXT_PUB_CONFIG_REGS_BASE, NR_TXT_CONFIG_PAGES *
+ PAGE_SIZE);
+ if (!config)
+ return NULL;
+
+ /* now map TXT heap */
+ heap_base = ioremap(*(u64 *)(config + TXTCR_HEAP_BASE),
+ *(u64 *)(config + TXTCR_HEAP_SIZE));
+ iounmap(config);
+ if (!heap_base)
+ return NULL;
+
+ /* walk heap to SinitMleData */
+ /* skip BiosData */
+ heap_ptr = heap_base + *(u64 *)heap_base;
+ /* skip OsMleData */
+ heap_ptr += *(u64 *)heap_ptr;
+ /* skip OsSinitData */
+ heap_ptr += *(u64 *)heap_ptr;
+ /* now points to SinitMleDataSize; set to SinitMleData */
+ heap_ptr += sizeof(u64);
+ /* get addr of DMAR table */
+ dmar_tbl = (struct acpi_table_header *)(heap_ptr +
+ ((struct sinit_mle_data *)heap_ptr)->vtd_dmars_off -
+ sizeof(u64));
+
+ /* don't unmap heap because dmar.c needs access to this */
+
+ return dmar_tbl;
+}
+
+int tboot_force_iommu(void)
+{
+ if (!tboot_enabled())
+ return 0;
+
+ if (no_iommu || swiotlb || dmar_disabled)
+ pr_warning("Forcing Intel-IOMMU to enabled\n");
+
+ dmar_disabled = 0;
+#ifdef CONFIG_SWIOTLB
+ swiotlb = 0;
+#endif
+ no_iommu = 0;
+
+ return 1;
+}
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
new file mode 100644
index 00000000000..be2573448ed
--- /dev/null
+++ b/arch/x86/kernel/time.c
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 1991,1992,1995 Linus Torvalds
+ * Copyright (c) 1994 Alan Modra
+ * Copyright (c) 1995 Markus Kuhn
+ * Copyright (c) 1996 Ingo Molnar
+ * Copyright (c) 1998 Andrea Arcangeli
+ * Copyright (c) 2002,2006 Vojtech Pavlik
+ * Copyright (c) 2003 Andi Kleen
+ *
+ */
+
+#include <linux/clockchips.h>
+#include <linux/interrupt.h>
+#include <linux/time.h>
+#include <linux/mca.h>
+
+#include <asm/vsyscall.h>
+#include <asm/x86_init.h>
+#include <asm/i8259.h>
+#include <asm/i8253.h>
+#include <asm/timer.h>
+#include <asm/hpet.h>
+#include <asm/time.h>
+
+#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
+int timer_ack;
+#endif
+
+#ifdef CONFIG_X86_64
+volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
+#endif
+
+unsigned long profile_pc(struct pt_regs *regs)
+{
+ unsigned long pc = instruction_pointer(regs);
+
+ if (!user_mode_vm(regs) && in_lock_functions(pc)) {
+#ifdef CONFIG_FRAME_POINTER
+ return *(unsigned long *)(regs->bp + sizeof(long));
+#else
+ unsigned long *sp =
+ (unsigned long *)kernel_stack_pointer(regs);
+ /*
+ * Return address is either directly at stack pointer
+ * or above a saved flags. Eflags has bits 22-31 zero,
+ * kernel addresses don't.
+ */
+ if (sp[0] >> 22)
+ return sp[0];
+ if (sp[1] >> 22)
+ return sp[1];
+#endif
+ }
+ return pc;
+}
+EXPORT_SYMBOL(profile_pc);
+
+/*
+ * Default timer interrupt handler for PIT/HPET
+ */
+static irqreturn_t timer_interrupt(int irq, void *dev_id)
+{
+ /* Keep nmi watchdog up to date */
+ inc_irq_stat(irq0_irqs);
+
+ /* Optimized out for !IO_APIC and x86_64 */
+ if (timer_ack) {
+ /*
+ * Subtle, when I/O APICs are used we have to ack timer IRQ
+ * manually to deassert NMI lines for the watchdog if run
+ * on an 82489DX-based system.
+ */
+ spin_lock(&i8259A_lock);
+ outb(0x0c, PIC_MASTER_OCW3);
+ /* Ack the IRQ; AEOI will end it automatically. */
+ inb(PIC_MASTER_POLL);
+ spin_unlock(&i8259A_lock);
+ }
+
+ global_clock_event->event_handler(global_clock_event);
+
+ /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
+ if (MCA_bus)
+ outb_p(inb_p(0x61)| 0x80, 0x61);
+
+ return IRQ_HANDLED;
+}
+
+static struct irqaction irq0 = {
+ .handler = timer_interrupt,
+ .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
+ .name = "timer"
+};
+
+void __init setup_default_timer_irq(void)
+{
+ setup_irq(0, &irq0);
+}
+
+/* Default timer init function */
+void __init hpet_time_init(void)
+{
+ if (!hpet_enable())
+ setup_pit_timer();
+ setup_default_timer_irq();
+}
+
+static __init void x86_late_time_init(void)
+{
+ x86_init.timers.timer_init();
+ tsc_init();
+}
+
+/*
+ * Initialize TSC and delay the periodic timer init to
+ * late x86_late_time_init() so ioremap works.
+ */
+void __init time_init(void)
+{
+ late_time_init = x86_late_time_init;
+}
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
deleted file mode 100644
index 5c5d87f0b2e..00000000000
--- a/arch/x86/kernel/time_32.c
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright (C) 1991, 1992, 1995 Linus Torvalds
- *
- * This file contains the PC-specific time handling details:
- * reading the RTC at bootup, etc..
- * 1994-07-02 Alan Modra
- * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
- * 1995-03-26 Markus Kuhn
- * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
- * precision CMOS clock update
- * 1996-05-03 Ingo Molnar
- * fixed time warps in do_[slow|fast]_gettimeoffset()
- * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
- * "A Kernel Model for Precision Timekeeping" by Dave Mills
- * 1998-09-05 (Various)
- * More robust do_fast_gettimeoffset() algorithm implemented
- * (works with APM, Cyrix 6x86MX and Centaur C6),
- * monotonic gettimeofday() with fast_get_timeoffset(),
- * drift-proof precision TSC calibration on boot
- * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
- * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
- * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
- * 1998-12-16 Andrea Arcangeli
- * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
- * because was not accounting lost_ticks.
- * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli
- * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
- * serialize accesses to xtime/lost_ticks).
- */
-
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/time.h>
-#include <linux/mca.h>
-
-#include <asm/setup.h>
-#include <asm/hpet.h>
-#include <asm/time.h>
-#include <asm/timer.h>
-
-#include <asm/do_timer.h>
-
-int timer_ack;
-
-unsigned long profile_pc(struct pt_regs *regs)
-{
- unsigned long pc = instruction_pointer(regs);
-
-#ifdef CONFIG_SMP
- if (!user_mode_vm(regs) && in_lock_functions(pc)) {
-#ifdef CONFIG_FRAME_POINTER
- return *(unsigned long *)(regs->bp + sizeof(long));
-#else
- unsigned long *sp = (unsigned long *)&regs->sp;
-
- /* Return address is either directly at stack pointer
- or above a saved flags. Eflags has bits 22-31 zero,
- kernel addresses don't. */
- if (sp[0] >> 22)
- return sp[0];
- if (sp[1] >> 22)
- return sp[1];
-#endif
- }
-#endif
- return pc;
-}
-EXPORT_SYMBOL(profile_pc);
-
-/*
- * This is the same as the above, except we _also_ save the current
- * Time Stamp Counter value at the time of the timer interrupt, so that
- * we later on can estimate the time of day more exactly.
- */
-irqreturn_t timer_interrupt(int irq, void *dev_id)
-{
- /* Keep nmi watchdog up to date */
- inc_irq_stat(irq0_irqs);
-
-#ifdef CONFIG_X86_IO_APIC
- if (timer_ack) {
- /*
- * Subtle, when I/O APICs are used we have to ack timer IRQ
- * manually to deassert NMI lines for the watchdog if run
- * on an 82489DX-based system.
- */
- spin_lock(&i8259A_lock);
- outb(0x0c, PIC_MASTER_OCW3);
- /* Ack the IRQ; AEOI will end it automatically. */
- inb(PIC_MASTER_POLL);
- spin_unlock(&i8259A_lock);
- }
-#endif
-
- do_timer_interrupt_hook();
-
-#ifdef CONFIG_MCA
- if (MCA_bus) {
- /* The PS/2 uses level-triggered interrupts. You can't
- turn them off, nor would you want to (any attempt to
- enable edge-triggered interrupts usually gets intercepted by a
- special hardware circuit). Hence we have to acknowledge
- the timer interrupt. Through some incredibly stupid
- design idea, the reset for IRQ 0 is done by setting the
- high bit of the PPI port B (0x61). Note that some PS/2s,
- notably the 55SX, work fine if this is removed. */
-
- u8 irq_v = inb_p(0x61); /* read the current state */
- outb_p(irq_v | 0x80, 0x61); /* reset the IRQ */
- }
-#endif
-
- return IRQ_HANDLED;
-}
-
-/* Duplicate of time_init() below, with hpet_enable part added */
-void __init hpet_time_init(void)
-{
- if (!hpet_enable())
- setup_pit_timer();
- x86_quirk_time_init();
-}
-
-/*
- * This is called directly from init code; we must delay timer setup in the
- * HPET case as we can't make the decision to turn on HPET this early in the
- * boot process.
- *
- * The chosen time_init function will usually be hpet_time_init, above, but
- * in the case of virtual hardware, an alternative function may be substituted.
- */
-void __init time_init(void)
-{
- x86_quirk_pre_time_init();
- tsc_init();
- late_time_init = choose_time_init();
-}
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
deleted file mode 100644
index 5ba343e6184..00000000000
--- a/arch/x86/kernel/time_64.c
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * "High Precision Event Timer" based timekeeping.
- *
- * Copyright (c) 1991,1992,1995 Linus Torvalds
- * Copyright (c) 1994 Alan Modra
- * Copyright (c) 1995 Markus Kuhn
- * Copyright (c) 1996 Ingo Molnar
- * Copyright (c) 1998 Andrea Arcangeli
- * Copyright (c) 2002,2006 Vojtech Pavlik
- * Copyright (c) 2003 Andi Kleen
- * RTC support code taken from arch/i386/kernel/timers/time_hpet.c
- */
-
-#include <linux/clockchips.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/time.h>
-#include <linux/mca.h>
-#include <linux/nmi.h>
-
-#include <asm/i8253.h>
-#include <asm/hpet.h>
-#include <asm/vgtod.h>
-#include <asm/time.h>
-#include <asm/timer.h>
-
-volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
-
-unsigned long profile_pc(struct pt_regs *regs)
-{
- unsigned long pc = instruction_pointer(regs);
-
- /* Assume the lock function has either no stack frame or a copy
- of flags from PUSHF
- Eflags always has bits 22 and up cleared unlike kernel addresses. */
- if (!user_mode_vm(regs) && in_lock_functions(pc)) {
-#ifdef CONFIG_FRAME_POINTER
- return *(unsigned long *)(regs->bp + sizeof(long));
-#else
- unsigned long *sp = (unsigned long *)regs->sp;
- if (sp[0] >> 22)
- return sp[0];
- if (sp[1] >> 22)
- return sp[1];
-#endif
- }
- return pc;
-}
-EXPORT_SYMBOL(profile_pc);
-
-static irqreturn_t timer_interrupt(int irq, void *dev_id)
-{
- inc_irq_stat(irq0_irqs);
-
- global_clock_event->event_handler(global_clock_event);
-
-#ifdef CONFIG_MCA
- if (MCA_bus) {
- u8 irq_v = inb_p(0x61); /* read the current state */
- outb_p(irq_v|0x80, 0x61); /* reset the IRQ */
- }
-#endif
-
- return IRQ_HANDLED;
-}
-
-/* calibrate_cpu is used on systems with fixed rate TSCs to determine
- * processor frequency */
-#define TICK_COUNT 100000000
-unsigned long __init calibrate_cpu(void)
-{
- int tsc_start, tsc_now;
- int i, no_ctr_free;
- unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
- unsigned long flags;
-
- for (i = 0; i < 4; i++)
- if (avail_to_resrv_perfctr_nmi_bit(i))
- break;
- no_ctr_free = (i == 4);
- if (no_ctr_free) {
- WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
- "cpu_khz value may be incorrect.\n");
- i = 3;
- rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
- wrmsrl(MSR_K7_EVNTSEL3, 0);
- rdmsrl(MSR_K7_PERFCTR3, pmc3);
- } else {
- reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
- reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
- }
- local_irq_save(flags);
- /* start measuring cycles, incrementing from 0 */
- wrmsrl(MSR_K7_PERFCTR0 + i, 0);
- wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
- rdtscl(tsc_start);
- do {
- rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
- tsc_now = get_cycles();
- } while ((tsc_now - tsc_start) < TICK_COUNT);
-
- local_irq_restore(flags);
- if (no_ctr_free) {
- wrmsrl(MSR_K7_EVNTSEL3, 0);
- wrmsrl(MSR_K7_PERFCTR3, pmc3);
- wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
- } else {
- release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
- release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
- }
-
- return pmc_now * tsc_khz / (tsc_now - tsc_start);
-}
-
-static struct irqaction irq0 = {
- .handler = timer_interrupt,
- .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING | IRQF_TIMER,
- .name = "timer"
-};
-
-void __init hpet_time_init(void)
-{
- if (!hpet_enable())
- setup_pit_timer();
-
- setup_irq(0, &irq0);
-}
-
-void __init time_init(void)
-{
- tsc_init();
-
- late_time_init = choose_time_init();
-}
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 503c1f2e883..1740c85e24b 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -23,8 +23,6 @@
static struct bau_control **uv_bau_table_bases __read_mostly;
static int uv_bau_retry_limit __read_mostly;
-/* position of pnode (which is nasid>>1): */
-static int uv_nshift __read_mostly;
/* base pnode in this partition */
static int uv_partition_base_pnode __read_mostly;
@@ -723,7 +721,7 @@ uv_activation_descriptor_init(int node, int pnode)
BUG_ON(!adp);
pa = uv_gpa(adp); /* need the real nasid*/
- n = pa >> uv_nshift;
+ n = uv_gpa_to_pnode(pa);
m = pa & uv_mmask;
uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
@@ -778,7 +776,7 @@ uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
* need the pnode of where the memory was really allocated
*/
pa = uv_gpa(pqp);
- pn = pa >> uv_nshift;
+ pn = uv_gpa_to_pnode(pa);
uv_write_global_mmr64(pnode,
UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
@@ -843,8 +841,7 @@ static int __init uv_bau_init(void)
GFP_KERNEL, cpu_to_node(cur_cpu));
uv_bau_retry_limit = 1;
- uv_nshift = uv_hub_info->n_val;
- uv_mmask = (1UL << uv_hub_info->n_val) - 1;
+ uv_mmask = (1UL << uv_hub_info->m_val) - 1;
nblades = uv_num_possible_blades();
uv_bau_table_bases = (struct bau_control **)
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index 808031a5ba1..cd022121cab 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -3,8 +3,16 @@
#include <asm/trampoline.h>
#include <asm/e820.h>
+#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP)
+#define __trampinit
+#define __trampinitdata
+#else
+#define __trampinit __cpuinit
+#define __trampinitdata __cpuinitdata
+#endif
+
/* ready for x86_64 and x86 */
-unsigned char *trampoline_base = __va(TRAMPOLINE_BASE);
+unsigned char *__trampinitdata trampoline_base = __va(TRAMPOLINE_BASE);
void __init reserve_trampoline_memory(void)
{
@@ -26,7 +34,7 @@ void __init reserve_trampoline_memory(void)
* bootstrap into the page concerned. The caller
* has made sure it's suitably aligned.
*/
-unsigned long setup_trampoline(void)
+unsigned long __trampinit setup_trampoline(void)
{
memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
return virt_to_phys(trampoline_base);
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
index 66d874e5404..8508237e8e4 100644
--- a/arch/x86/kernel/trampoline_32.S
+++ b/arch/x86/kernel/trampoline_32.S
@@ -28,16 +28,12 @@
*/
#include <linux/linkage.h>
+#include <linux/init.h>
#include <asm/segment.h>
#include <asm/page_types.h>
/* We can free up trampoline after bootup if cpu hotplug is not supported. */
-#ifndef CONFIG_HOTPLUG_CPU
-.section ".cpuinit.data","aw",@progbits
-#else
-.section .rodata,"a",@progbits
-#endif
-
+__CPUINITRODATA
.code16
ENTRY(trampoline_data)
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index cddfb8d386b..3af2dff58b2 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -25,14 +25,19 @@
*/
#include <linux/linkage.h>
+#include <linux/init.h>
#include <asm/pgtable_types.h>
#include <asm/page_types.h>
#include <asm/msr.h>
#include <asm/segment.h>
#include <asm/processor-flags.h>
+#ifdef CONFIG_ACPI_SLEEP
.section .rodata, "a", @progbits
-
+#else
+/* We can free up the trampoline after bootup if cpu hotplug is not supported. */
+__CPUINITRODATA
+#endif
.code16
ENTRY(trampoline_data)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 83264922a87..33399176512 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -14,7 +14,6 @@
#include <linux/spinlock.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
-#include <linux/utsname.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/module.h>
@@ -59,12 +58,12 @@
#include <asm/mach_traps.h>
#ifdef CONFIG_X86_64
+#include <asm/x86_init.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
#else
#include <asm/processor-flags.h>
#include <asm/setup.h>
-#include <asm/traps.h>
asmlinkage int system_call(void);
@@ -73,11 +72,9 @@ char ignore_fpu_irq;
/*
* The IDT has to be page-aligned to simplify the Pentium
- * F0 0F bug workaround.. We have a special link segment
- * for this.
+ * F0 0F bug workaround.
*/
-gate_desc idt_table[NR_VECTORS]
- __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
+gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, };
#endif
DECLARE_BITMAP(used_vectors, NR_VECTORS);
@@ -532,77 +529,56 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
{
struct task_struct *tsk = current;
- unsigned long condition;
+ unsigned long dr6;
int si_code;
- get_debugreg(condition, 6);
+ get_debugreg(dr6, 6);
/* Catch kmemcheck conditions first of all! */
- if (condition & DR_STEP && kmemcheck_trap(regs))
+ if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
return;
+ /* DR6 may or may not be cleared by the CPU */
+ set_debugreg(0, 6);
/*
* The processor cleared BTF, so don't mark that we need it set.
*/
clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
tsk->thread.debugctlmsr = 0;
- if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
- SIGTRAP) == NOTIFY_STOP)
+ /* Store the virtualized DR6 value */
+ tsk->thread.debugreg6 = dr6;
+
+ if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,
+ SIGTRAP) == NOTIFY_STOP)
return;
/* It's safe to allow irq's after DR6 has been saved */
preempt_conditional_sti(regs);
- /* Mask out spurious debug traps due to lazy DR7 setting */
- if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
- if (!tsk->thread.debugreg7)
- goto clear_dr7;
+ if (regs->flags & X86_VM_MASK) {
+ handle_vm86_trap((struct kernel_vm86_regs *) regs,
+ error_code, 1);
+ return;
}
-#ifdef CONFIG_X86_32
- if (regs->flags & X86_VM_MASK)
- goto debug_vm86;
-#endif
-
- /* Save debug status register where ptrace can see it */
- tsk->thread.debugreg6 = condition;
-
/*
- * Single-stepping through TF: make sure we ignore any events in
- * kernel space (but re-enable TF when returning to user mode).
+ * Single-stepping through system calls: ignore any exceptions in
+ * kernel space, but re-enable TF when returning to user mode.
+ *
+ * We already checked v86 mode above, so we can check for kernel mode
+ * by just checking the CPL of CS.
*/
- if (condition & DR_STEP) {
- if (!user_mode(regs))
- goto clear_TF_reenable;
+ if ((dr6 & DR_STEP) && !user_mode(regs)) {
+ tsk->thread.debugreg6 &= ~DR_STEP;
+ set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
+ regs->flags &= ~X86_EFLAGS_TF;
}
-
- si_code = get_si_code(condition);
- /* Ok, finally something we can handle */
- send_sigtrap(tsk, regs, error_code, si_code);
-
- /*
- * Disable additional traps. They'll be re-enabled when
- * the signal is delivered.
- */
-clear_dr7:
- set_debugreg(0, 7);
+ si_code = get_si_code(tsk->thread.debugreg6);
+ if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS))
+ send_sigtrap(tsk, regs, error_code, si_code);
preempt_conditional_cli(regs);
- return;
-#ifdef CONFIG_X86_32
-debug_vm86:
- /* reenable preemption: handle_vm86_trap() might sleep */
- dec_preempt_count();
- handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
- conditional_cli(regs);
- return;
-#endif
-
-clear_TF_reenable:
- set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
- regs->flags &= ~X86_EFLAGS_TF;
- preempt_conditional_cli(regs);
return;
}
@@ -972,7 +948,5 @@ void __init trap_init(void)
*/
cpu_init();
-#ifdef CONFIG_X86_32
- x86_quirk_trap_init();
-#endif
+ x86_init.irqs.trap_init();
}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 71f4368b357..cd982f48e23 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -17,6 +17,8 @@
#include <asm/time.h>
#include <asm/delay.h>
#include <asm/hypervisor.h>
+#include <asm/nmi.h>
+#include <asm/x86_init.h>
unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
EXPORT_SYMBOL(cpu_khz);
@@ -400,15 +402,9 @@ unsigned long native_calibrate_tsc(void)
{
u64 tsc1, tsc2, delta, ref1, ref2;
unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
- unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz;
+ unsigned long flags, latch, ms, fast_calibrate;
int hpet = is_hpet_enabled(), i, loopmin;
- hv_tsc_khz = get_hypervisor_tsc_freq();
- if (hv_tsc_khz) {
- printk(KERN_INFO "TSC: Frequency read from the hypervisor\n");
- return hv_tsc_khz;
- }
-
local_irq_save(flags);
fast_calibrate = quick_pit_calibrate();
local_irq_restore(flags);
@@ -566,7 +562,7 @@ int recalibrate_cpu_khz(void)
unsigned long cpu_khz_old = cpu_khz;
if (cpu_has_tsc) {
- tsc_khz = calibrate_tsc();
+ tsc_khz = x86_platform.calibrate_tsc();
cpu_khz = tsc_khz;
cpu_data(0).loops_per_jiffy =
cpufreq_scale(cpu_data(0).loops_per_jiffy,
@@ -670,7 +666,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
(val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
(val == CPUFREQ_RESUMECHANGE)) {
- *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
+ *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
if (!(freq->flags & CPUFREQ_CONST_LOOPS))
@@ -744,10 +740,16 @@ static cycle_t __vsyscall_fn vread_tsc(void)
}
#endif
+static void resume_tsc(void)
+{
+ clocksource_tsc.cycle_last = 0;
+}
+
static struct clocksource clocksource_tsc = {
.name = "tsc",
.rating = 300,
.read = read_tsc,
+ .resume = resume_tsc,
.mask = CLOCKSOURCE_MASK(64),
.shift = 22,
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
@@ -761,12 +763,14 @@ void mark_tsc_unstable(char *reason)
{
if (!tsc_unstable) {
tsc_unstable = 1;
- printk("Marking TSC unstable due to %s\n", reason);
+ printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
/* Change only the rating, when not registered */
if (clocksource_tsc.mult)
- clocksource_change_rating(&clocksource_tsc, 0);
- else
+ clocksource_mark_unstable(&clocksource_tsc);
+ else {
+ clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE;
clocksource_tsc.rating = 0;
+ }
}
}
@@ -852,15 +856,71 @@ static void __init init_tsc_clocksource(void)
clocksource_register(&clocksource_tsc);
}
+#ifdef CONFIG_X86_64
+/*
+ * calibrate_cpu is used on systems with fixed rate TSCs to determine
+ * processor frequency
+ */
+#define TICK_COUNT 100000000
+static unsigned long __init calibrate_cpu(void)
+{
+ int tsc_start, tsc_now;
+ int i, no_ctr_free;
+ unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
+ unsigned long flags;
+
+ for (i = 0; i < 4; i++)
+ if (avail_to_resrv_perfctr_nmi_bit(i))
+ break;
+ no_ctr_free = (i == 4);
+ if (no_ctr_free) {
+ WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
+ "cpu_khz value may be incorrect.\n");
+ i = 3;
+ rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
+ wrmsrl(MSR_K7_EVNTSEL3, 0);
+ rdmsrl(MSR_K7_PERFCTR3, pmc3);
+ } else {
+ reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+ reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
+ }
+ local_irq_save(flags);
+ /* start measuring cycles, incrementing from 0 */
+ wrmsrl(MSR_K7_PERFCTR0 + i, 0);
+ wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
+ rdtscl(tsc_start);
+ do {
+ rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
+ tsc_now = get_cycles();
+ } while ((tsc_now - tsc_start) < TICK_COUNT);
+
+ local_irq_restore(flags);
+ if (no_ctr_free) {
+ wrmsrl(MSR_K7_EVNTSEL3, 0);
+ wrmsrl(MSR_K7_PERFCTR3, pmc3);
+ wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
+ } else {
+ release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+ release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
+ }
+
+ return pmc_now * tsc_khz / (tsc_now - tsc_start);
+}
+#else
+static inline unsigned long calibrate_cpu(void) { return cpu_khz; }
+#endif
+
void __init tsc_init(void)
{
u64 lpj;
int cpu;
+ x86_init.timers.tsc_pre_init();
+
if (!cpu_has_tsc)
return;
- tsc_khz = calibrate_tsc();
+ tsc_khz = x86_platform.calibrate_tsc();
cpu_khz = tsc_khz;
if (!tsc_khz) {
@@ -868,11 +928,9 @@ void __init tsc_init(void)
return;
}
-#ifdef CONFIG_X86_64
if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
(boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
cpu_khz = calibrate_cpu();
-#endif
printk("Detected %lu.%03lu MHz processor.\n",
(unsigned long)cpu_khz / 1000,
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 027b5b49899..f37930954d1 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -114,7 +114,7 @@ void __cpuinit check_tsc_sync_source(int cpu)
return;
if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
- pr_info("Skipping synchronization checks as TSC is reliable.\n");
+ printk_once(KERN_INFO "Skipping synchronization checks as TSC is reliable.\n");
return;
}
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
index aeef529917e..61d805df4c9 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -9,10 +9,25 @@
*/
#include <linux/module.h>
+#include <linux/rbtree.h>
#include <linux/irq.h>
#include <asm/apic.h>
#include <asm/uv/uv_irq.h>
+#include <asm/uv/uv_hub.h>
+
+/* MMR offset and pnode of hub sourcing interrupts for a given irq */
+struct uv_irq_2_mmr_pnode{
+ struct rb_node list;
+ unsigned long offset;
+ int pnode;
+ int irq;
+};
+
+static spinlock_t uv_irq_lock;
+static struct rb_root uv_irq_root;
+
+static int uv_set_irq_affinity(unsigned int, const struct cpumask *);
static void uv_noop(unsigned int irq)
{
@@ -39,25 +54,214 @@ struct irq_chip uv_irq_chip = {
.unmask = uv_noop,
.eoi = uv_ack_apic,
.end = uv_noop,
+ .set_affinity = uv_set_irq_affinity,
};
/*
+ * Add offset and pnode information of the hub sourcing interrupts to the
+ * rb tree for a specific irq.
+ */
+static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade)
+{
+ struct rb_node **link = &uv_irq_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct uv_irq_2_mmr_pnode *n;
+ struct uv_irq_2_mmr_pnode *e;
+ unsigned long irqflags;
+
+ n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL,
+ uv_blade_to_memory_nid(blade));
+ if (!n)
+ return -ENOMEM;
+
+ n->irq = irq;
+ n->offset = offset;
+ n->pnode = uv_blade_to_pnode(blade);
+ spin_lock_irqsave(&uv_irq_lock, irqflags);
+ /* Find the right place in the rbtree: */
+ while (*link) {
+ parent = *link;
+ e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list);
+
+ if (unlikely(irq == e->irq)) {
+ /* irq entry exists */
+ e->pnode = uv_blade_to_pnode(blade);
+ e->offset = offset;
+ spin_unlock_irqrestore(&uv_irq_lock, irqflags);
+ kfree(n);
+ return 0;
+ }
+
+ if (irq < e->irq)
+ link = &(*link)->rb_left;
+ else
+ link = &(*link)->rb_right;
+ }
+
+ /* Insert the node into the rbtree. */
+ rb_link_node(&n->list, parent, link);
+ rb_insert_color(&n->list, &uv_irq_root);
+
+ spin_unlock_irqrestore(&uv_irq_lock, irqflags);
+ return 0;
+}
+
+/* Retrieve offset and pnode information from the rb tree for a specific irq */
+int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode)
+{
+ struct uv_irq_2_mmr_pnode *e;
+ struct rb_node *n;
+ unsigned long irqflags;
+
+ spin_lock_irqsave(&uv_irq_lock, irqflags);
+ n = uv_irq_root.rb_node;
+ while (n) {
+ e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
+
+ if (e->irq == irq) {
+ *offset = e->offset;
+ *pnode = e->pnode;
+ spin_unlock_irqrestore(&uv_irq_lock, irqflags);
+ return 0;
+ }
+
+ if (irq < e->irq)
+ n = n->rb_left;
+ else
+ n = n->rb_right;
+ }
+ spin_unlock_irqrestore(&uv_irq_lock, irqflags);
+ return -1;
+}
+
+/*
+ * Re-target the irq to the specified CPU and enable the specified MMR located
+ * on the specified blade to allow the sending of MSIs to the specified CPU.
+ */
+static int
+arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
+ unsigned long mmr_offset, int restrict)
+{
+ const struct cpumask *eligible_cpu = cpumask_of(cpu);
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_cfg *cfg;
+ int mmr_pnode;
+ unsigned long mmr_value;
+ struct uv_IO_APIC_route_entry *entry;
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
+ sizeof(unsigned long));
+
+ cfg = irq_cfg(irq);
+
+ err = assign_irq_vector(irq, cfg, eligible_cpu);
+ if (err != 0)
+ return err;
+
+ if (restrict == UV_AFFINITY_CPU)
+ desc->status |= IRQ_NO_BALANCING;
+ else
+ desc->status |= IRQ_MOVE_PCNTXT;
+
+ set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
+ irq_name);
+
+ mmr_value = 0;
+ entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
+ entry->vector = cfg->vector;
+ entry->delivery_mode = apic->irq_delivery_mode;
+ entry->dest_mode = apic->irq_dest_mode;
+ entry->polarity = 0;
+ entry->trigger = 0;
+ entry->mask = 0;
+ entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
+
+ mmr_pnode = uv_blade_to_pnode(mmr_blade);
+ uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
+
+ if (cfg->move_in_progress)
+ send_cleanup_vector(cfg);
+
+ return irq;
+}
+
+/*
+ * Disable the specified MMR located on the specified blade so that MSIs are
+ * longer allowed to be sent.
+ */
+static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset)
+{
+ unsigned long mmr_value;
+ struct uv_IO_APIC_route_entry *entry;
+
+ BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
+ sizeof(unsigned long));
+
+ mmr_value = 0;
+ entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
+ entry->mask = 1;
+
+ uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
+}
+
+static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_cfg *cfg = desc->chip_data;
+ unsigned int dest;
+ unsigned long mmr_value;
+ struct uv_IO_APIC_route_entry *entry;
+ unsigned long mmr_offset;
+ unsigned mmr_pnode;
+
+ dest = set_desc_affinity(desc, mask);
+ if (dest == BAD_APICID)
+ return -1;
+
+ mmr_value = 0;
+ entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
+
+ entry->vector = cfg->vector;
+ entry->delivery_mode = apic->irq_delivery_mode;
+ entry->dest_mode = apic->irq_dest_mode;
+ entry->polarity = 0;
+ entry->trigger = 0;
+ entry->mask = 0;
+ entry->dest = dest;
+
+ /* Get previously stored MMR and pnode of hub sourcing interrupts */
+ if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode))
+ return -1;
+
+ uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
+
+ if (cfg->move_in_progress)
+ send_cleanup_vector(cfg);
+
+ return 0;
+}
+
+/*
* Set up a mapping of an available irq and vector, and enable the specified
* MMR that defines the MSI that is to be sent to the specified CPU when an
* interrupt is raised.
*/
int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
- unsigned long mmr_offset)
+ unsigned long mmr_offset, int restrict)
{
- int irq;
- int ret;
+ int irq, ret;
+
+ irq = create_irq_nr(NR_IRQS_LEGACY, uv_blade_to_memory_nid(mmr_blade));
- irq = create_irq();
if (irq <= 0)
return -EBUSY;
- ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset);
- if (ret != irq)
+ ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset,
+ restrict);
+ if (ret == irq)
+ uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade);
+ else
destroy_irq(irq);
return ret;
@@ -71,9 +275,28 @@ EXPORT_SYMBOL_GPL(uv_setup_irq);
*
* Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq().
*/
-void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset)
+void uv_teardown_irq(unsigned int irq)
{
- arch_disable_uv_irq(mmr_blade, mmr_offset);
+ struct uv_irq_2_mmr_pnode *e;
+ struct rb_node *n;
+ unsigned long irqflags;
+
+ spin_lock_irqsave(&uv_irq_lock, irqflags);
+ n = uv_irq_root.rb_node;
+ while (n) {
+ e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
+ if (e->irq == irq) {
+ arch_disable_uv_irq(e->pnode, e->offset);
+ rb_erase(n, &uv_irq_root);
+ kfree(e);
+ break;
+ }
+ if (irq < e->irq)
+ n = n->rb_left;
+ else
+ n = n->rb_right;
+ }
+ spin_unlock_irqrestore(&uv_irq_lock, irqflags);
destroy_irq(irq);
}
EXPORT_SYMBOL_GPL(uv_teardown_irq);
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 31ffc24eec4..abda6f53e71 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -30,6 +30,7 @@
#include <asm/setup.h>
#include <asm/apic.h>
#include <asm/e820.h>
+#include <asm/time.h>
#include <asm/io.h>
#include <linux/kernel_stat.h>
@@ -53,7 +54,7 @@ int is_visws_box(void)
return visws_board_type >= 0;
}
-static int __init visws_time_init(void)
+static void __init visws_time_init(void)
{
printk(KERN_INFO "Starting Cobalt Timer system clock\n");
@@ -66,21 +67,13 @@ static int __init visws_time_init(void)
/* Enable (unmask) the timer interrupt */
co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK);
- /*
- * Zero return means the generic timer setup code will set up
- * the standard vector:
- */
- return 0;
+ setup_default_timer_irq();
}
-static int __init visws_pre_intr_init(void)
+/* Replaces the default init_ISA_irqs in the generic setup */
+static void __init visws_pre_intr_init(void)
{
init_VISWS_APIC_irqs();
-
- /*
- * We dont want ISA irqs to be set up by the generic code:
- */
- return 1;
}
/* Quirk for machine specific memory setup. */
@@ -156,12 +149,8 @@ static void visws_machine_power_off(void)
outl(PIIX_SPECIAL_STOP, 0xCFC);
}
-static int __init visws_get_smp_config(unsigned int early)
+static void __init visws_get_smp_config(unsigned int early)
{
- /*
- * Prevent MP-table parsing by the generic code:
- */
- return 1;
}
/*
@@ -194,7 +183,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
return;
}
- apic_cpus = apic->apicid_to_cpu_present(m->apicid);
+ apic->apicid_to_cpu_present(m->apicid, &apic_cpus);
physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
/*
* Validate version
@@ -208,7 +197,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
apic_version[m->apicid] = ver;
}
-static int __init visws_find_smp_config(unsigned int reserve)
+static void __init visws_find_smp_config(unsigned int reserve)
{
struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS);
unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
@@ -230,21 +219,9 @@ static int __init visws_find_smp_config(unsigned int reserve)
MP_processor_info(mp++);
mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
- return 1;
}
-static int visws_trap_init(void);
-
-static struct x86_quirks visws_x86_quirks __initdata = {
- .arch_time_init = visws_time_init,
- .arch_pre_intr_init = visws_pre_intr_init,
- .arch_memory_setup = visws_memory_setup,
- .arch_intr_init = NULL,
- .arch_trap_init = visws_trap_init,
- .mach_get_smp_config = visws_get_smp_config,
- .mach_find_smp_config = visws_find_smp_config,
-};
+static void visws_trap_init(void);
void __init visws_early_detect(void)
{
@@ -257,11 +234,14 @@ void __init visws_early_detect(void)
return;
/*
- * Install special quirks for timer, interrupt and memory setup:
- * Fall back to generic behavior for traps:
- * Override generic MP-table parsing:
+ * Override the default platform setup functions
*/
- x86_quirks = &visws_x86_quirks;
+ x86_init.resources.memory_setup = visws_memory_setup;
+ x86_init.mpparse.get_smp_config = visws_get_smp_config;
+ x86_init.mpparse.find_smp_config = visws_find_smp_config;
+ x86_init.irqs.pre_vector_init = visws_pre_intr_init;
+ x86_init.irqs.trap_init = visws_trap_init;
+ x86_init.timers.timer_init = visws_time_init;
/*
* Install reboot quirks:
@@ -400,12 +380,10 @@ static __init void cobalt_init(void)
co_apic_read(CO_APIC_ID));
}
-static int __init visws_trap_init(void)
+static void __init visws_trap_init(void)
{
lithium_init();
cobalt_init();
-
- return 1;
}
/*
@@ -508,7 +486,7 @@ static void end_cobalt_irq(unsigned int irq)
}
static struct irq_chip cobalt_irq_type = {
- .typename = "Cobalt-APIC",
+ .name = "Cobalt-APIC",
.startup = startup_cobalt_irq,
.shutdown = disable_cobalt_irq,
.enable = enable_cobalt_irq,
@@ -545,7 +523,7 @@ static void end_piix4_master_irq(unsigned int irq)
}
static struct irq_chip piix4_master_irq_type = {
- .typename = "PIIX4-master",
+ .name = "PIIX4-master",
.startup = startup_piix4_master_irq,
.ack = ack_cobalt_irq,
.end = end_piix4_master_irq,
@@ -553,7 +531,7 @@ static struct irq_chip piix4_master_irq_type = {
static struct irq_chip piix4_virtual_irq_type = {
- .typename = "PIIX4-virtual",
+ .name = "PIIX4-virtual",
.shutdown = disable_8259A_irq,
.enable = enable_8259A_irq,
.disable = disable_8259A_irq,
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 95a7289e4b0..d430e4c3019 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -648,7 +648,7 @@ static inline int __init activate_vmi(void)
pv_info.paravirt_enabled = 1;
pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
- pv_info.name = "vmi";
+ pv_info.name = "vmi [deprecated]";
pv_init_ops.patch = vmi_patch;
@@ -817,15 +817,15 @@ static inline int __init activate_vmi(void)
vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
vmi_timer_ops.cancel_alarm =
vmi_get_function(VMI_CALL_CancelAlarm);
- pv_time_ops.time_init = vmi_time_init;
- pv_time_ops.get_wallclock = vmi_get_wallclock;
- pv_time_ops.set_wallclock = vmi_set_wallclock;
+ x86_init.timers.timer_init = vmi_time_init;
#ifdef CONFIG_X86_LOCAL_APIC
- pv_apic_ops.setup_boot_clock = vmi_time_bsp_init;
- pv_apic_ops.setup_secondary_clock = vmi_time_ap_init;
+ x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init;
+ x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init;
#endif
pv_time_ops.sched_clock = vmi_sched_clock;
- pv_time_ops.get_tsc_khz = vmi_tsc_khz;
+ x86_platform.calibrate_tsc = vmi_tsc_khz;
+ x86_platform.get_wallclock = vmi_get_wallclock;
+ x86_platform.set_wallclock = vmi_set_wallclock;
/* We have true wallclock functions; disable CMOS clock sync */
no_sync_cmos_clock = 1;
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 2b3eb82efee..611b9e2360d 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -68,7 +68,7 @@ unsigned long long vmi_sched_clock(void)
return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
}
-/* paravirt_ops.get_tsc_khz = vmi_tsc_khz */
+/* x86_platform.calibrate_tsc = vmi_tsc_khz */
unsigned long vmi_tsc_khz(void)
{
unsigned long long khz;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 9fc178255c0..3c68fe2d46c 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -45,9 +45,9 @@ PHDRS {
text PT_LOAD FLAGS(5); /* R_E */
data PT_LOAD FLAGS(7); /* RWE */
#ifdef CONFIG_X86_64
- user PT_LOAD FLAGS(7); /* RWE */
+ user PT_LOAD FLAGS(5); /* R_E */
#ifdef CONFIG_SMP
- percpu PT_LOAD FLAGS(7); /* RWE */
+ percpu PT_LOAD FLAGS(6); /* RW_ */
#endif
init PT_LOAD FLAGS(7); /* RWE */
#endif
@@ -65,17 +65,11 @@ SECTIONS
#endif
/* Text and read-only data */
-
- /* bootstrapping code */
- .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
- _text = .;
- *(.text.head)
- } :text = 0x9090
-
- /* The rest of the text */
.text : AT(ADDR(.text) - LOAD_OFFSET) {
+ _text = .;
+ /* bootstrapping code */
+ HEAD_TEXT
#ifdef CONFIG_X86_32
- /* not really needed, already page aligned */
. = ALIGN(PAGE_SIZE);
*(.text.page_aligned)
#endif
@@ -94,13 +88,7 @@ SECTIONS
NOTES :text :note
- /* Exception table */
- . = ALIGN(16);
- __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
- __start___ex_table = .;
- *(__ex_table)
- __stop___ex_table = .;
- } :text = 0x9090
+ EXCEPTION_TABLE(16) :text = 0x9090
RO_DATA(PAGE_SIZE)
@@ -118,7 +106,6 @@ SECTIONS
#endif
PAGE_ALIGNED_DATA(PAGE_SIZE)
- *(.data.idt)
CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES)
@@ -135,24 +122,21 @@ SECTIONS
#ifdef CONFIG_X86_64
#define VSYSCALL_ADDR (-10*1024*1024)
-#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data) + SIZEOF(.data) + \
- PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
-#define VSYSCALL_VIRT_ADDR ((ADDR(.data) + SIZEOF(.data) + \
- PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
-#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
+#define VLOAD_OFFSET (VSYSCALL_ADDR - __vsyscall_0 + LOAD_OFFSET)
#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
-#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
+#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0)
#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
+ . = ALIGN(4096);
+ __vsyscall_0 = .;
+
. = VSYSCALL_ADDR;
- .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) {
+ .vsyscall_0 : AT(VLOAD(.vsyscall_0)) {
*(.vsyscall_0)
} :user
- __vsyscall_0 = VSYSCALL_VIRT_ADDR;
-
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
.vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
*(.vsyscall_fn)
@@ -192,11 +176,9 @@ SECTIONS
*(.vsyscall_3)
}
- . = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
+ . = __vsyscall_0 + PAGE_SIZE;
#undef VSYSCALL_ADDR
-#undef VSYSCALL_PHYS_ADDR
-#undef VSYSCALL_VIRT_ADDR
#undef VLOAD_OFFSET
#undef VLOAD
#undef VVIRT_OFFSET
@@ -219,36 +201,12 @@ SECTIONS
PERCPU_VADDR(0, :percpu)
#endif
- .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
- _sinittext = .;
- INIT_TEXT
- _einittext = .;
- }
+ INIT_TEXT_SECTION(PAGE_SIZE)
#ifdef CONFIG_X86_64
:init
#endif
- .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
- INIT_DATA
- }
-
- . = ALIGN(16);
- .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
- __setup_start = .;
- *(.init.setup)
- __setup_end = .;
- }
- .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
- __initcall_start = .;
- INITCALLS
- __initcall_end = .;
- }
-
- .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
- __con_initcall_start = .;
- *(.con_initcall.init)
- __con_initcall_end = .;
- }
+ INIT_DATA_SECTION(16)
.x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
__x86_cpu_dev_start = .;
@@ -256,8 +214,6 @@ SECTIONS
__x86_cpu_dev_end = .;
}
- SECURITY_INIT
-
. = ALIGN(8);
.parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
__parainstructions = .;
@@ -288,15 +244,6 @@ SECTIONS
EXIT_DATA
}
-#ifdef CONFIG_BLK_DEV_INITRD
- . = ALIGN(PAGE_SIZE);
- .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
- __initramfs_start = .;
- *(.init.ramfs)
- __initramfs_end = .;
- }
-#endif
-
#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
PERCPU(PAGE_SIZE)
#endif
@@ -348,19 +295,19 @@ SECTIONS
_end = .;
}
- /* Sections to be discarded */
- /DISCARD/ : {
- *(.exitcall.exit)
- *(.eh_frame)
- *(.discard)
- }
-
STABS_DEBUG
DWARF_DEBUG
+
+ /* Sections to be discarded */
+ DISCARDS
+ /DISCARD/ : { *(.eh_frame) }
}
#ifdef CONFIG_X86_32
+/*
+ * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
+ */
. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
"kernel image bigger than KERNEL_IMAGE_SIZE");
#else
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 25ee06a80aa..8cb4974ff59 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -87,6 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
+ vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}
@@ -227,19 +228,11 @@ static long __vsyscall(3) venosys_1(void)
}
#ifdef CONFIG_SYSCTL
-
-static int
-vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
-}
-
static ctl_table kernel_table2[] = {
{ .procname = "vsyscall64",
.data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = vsyscall_sysctl_change },
+ .proc_handler = proc_dointvec },
{}
};
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 3909e3ba5ce..a1029769b6f 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -30,9 +30,8 @@ EXPORT_SYMBOL(__put_user_8);
EXPORT_SYMBOL(copy_user_generic);
EXPORT_SYMBOL(__copy_user_nocache);
-EXPORT_SYMBOL(copy_from_user);
-EXPORT_SYMBOL(copy_to_user);
-EXPORT_SYMBOL(__copy_from_user_inatomic);
+EXPORT_SYMBOL(_copy_from_user);
+EXPORT_SYMBOL(_copy_to_user);
EXPORT_SYMBOL(copy_page);
EXPORT_SYMBOL(clear_page);
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
new file mode 100644
index 00000000000..d11c5ff7c65
--- /dev/null
+++ b/arch/x86/kernel/x86_init.c
@@ -0,0 +1,83 @@
+/*
+ * Copyright (C) 2009 Thomas Gleixner <tglx@linutronix.de>
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+#include <linux/init.h>
+
+#include <asm/bios_ebda.h>
+#include <asm/paravirt.h>
+#include <asm/mpspec.h>
+#include <asm/setup.h>
+#include <asm/apic.h>
+#include <asm/e820.h>
+#include <asm/time.h>
+#include <asm/irq.h>
+#include <asm/tsc.h>
+#include <asm/iommu.h>
+
+void __cpuinit x86_init_noop(void) { }
+void __init x86_init_uint_noop(unsigned int unused) { }
+void __init x86_init_pgd_noop(pgd_t *unused) { }
+int __init iommu_init_noop(void) { return 0; }
+void iommu_shutdown_noop(void) { }
+
+/*
+ * The platform setup functions are preset with the default functions
+ * for standard PC hardware.
+ */
+struct x86_init_ops x86_init __initdata = {
+
+ .resources = {
+ .probe_roms = x86_init_noop,
+ .reserve_resources = reserve_standard_io_resources,
+ .memory_setup = default_machine_specific_memory_setup,
+ },
+
+ .mpparse = {
+ .mpc_record = x86_init_uint_noop,
+ .setup_ioapic_ids = x86_init_noop,
+ .mpc_apic_id = default_mpc_apic_id,
+ .smp_read_mpc_oem = default_smp_read_mpc_oem,
+ .mpc_oem_bus_info = default_mpc_oem_bus_info,
+ .find_smp_config = default_find_smp_config,
+ .get_smp_config = default_get_smp_config,
+ },
+
+ .irqs = {
+ .pre_vector_init = init_ISA_irqs,
+ .intr_init = native_init_IRQ,
+ .trap_init = x86_init_noop,
+ },
+
+ .oem = {
+ .arch_setup = x86_init_noop,
+ .banner = default_banner,
+ },
+
+ .paging = {
+ .pagetable_setup_start = native_pagetable_setup_start,
+ .pagetable_setup_done = native_pagetable_setup_done,
+ },
+
+ .timers = {
+ .setup_percpu_clockev = setup_boot_APIC_clock,
+ .tsc_pre_init = x86_init_noop,
+ .timer_init = hpet_time_init,
+ },
+
+ .iommu = {
+ .iommu_init = iommu_init_noop,
+ },
+};
+
+struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
+ .setup_percpu_clockev = setup_secondary_APIC_clock,
+};
+
+struct x86_platform_ops x86_platform = {
+ .calibrate_tsc = native_calibrate_tsc,
+ .get_wallclock = mach_get_cmos_time,
+ .set_wallclock = mach_set_rtc_mmss,
+ .iommu_shutdown = iommu_shutdown_noop,
+};
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 8600a09e0c6..b84e571f417 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -1,12 +1,8 @@
#
# KVM configuration
#
-config HAVE_KVM
- bool
-config HAVE_KVM_IRQCHIP
- bool
- default y
+source "virt/kvm/Kconfig"
menuconfig VIRTUALIZATION
bool "Virtualization"
@@ -29,6 +25,9 @@ config KVM
select PREEMPT_NOTIFIERS
select MMU_NOTIFIER
select ANON_INODES
+ select HAVE_KVM_IRQCHIP
+ select HAVE_KVM_EVENTFD
+ select KVM_APIC_ARCHITECTURE
---help---
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
@@ -63,18 +62,6 @@ config KVM_AMD
To compile this as a module, choose M here: the module
will be called kvm-amd.
-config KVM_TRACE
- bool "KVM trace support"
- depends on KVM && SYSFS
- select MARKERS
- select RELAY
- select DEBUG_FS
- default n
- ---help---
- This option allows reading a trace of kvm-related events through
- relayfs. Note the ABI is not considered stable and will be
- modified in future updates.
-
# OK, it's a little counter-intuitive to do this, but it puts it neatly under
# the virtualization menu.
source drivers/lguest/Kconfig
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index b43c4efafe8..0e7fe78d0f7 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,22 +1,19 @@
-#
-# Makefile for Kernel-based Virtual Machine module
-#
-
-common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
- coalesced_mmio.o irq_comm.o)
-ifeq ($(CONFIG_KVM_TRACE),y)
-common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
-endif
-ifeq ($(CONFIG_IOMMU_API),y)
-common-objs += $(addprefix ../../../virt/kvm/, iommu.o)
-endif
EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
-kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
- i8254.o timer.o
-obj-$(CONFIG_KVM) += kvm.o
-kvm-intel-objs = vmx.o
-obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
-kvm-amd-objs = svm.o
-obj-$(CONFIG_KVM_AMD) += kvm-amd.o
+CFLAGS_x86.o := -I.
+CFLAGS_svm.o := -I.
+CFLAGS_vmx.o := -I.
+
+kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
+ coalesced_mmio.o irq_comm.o eventfd.o)
+kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
+
+kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
+ i8254.o timer.o
+kvm-intel-y += vmx.o
+kvm-amd-y += svm.o
+
+obj-$(CONFIG_KVM) += kvm.o
+obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
+obj-$(CONFIG_KVM_AMD) += kvm-amd.o
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/emulate.c
index 616de4628d6..1be5cd640e9 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1,5 +1,5 @@
/******************************************************************************
- * x86_emulate.c
+ * emulate.c
*
* Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
*
@@ -30,7 +30,9 @@
#define DPRINTF(x...) do {} while (0)
#endif
#include <linux/module.h>
-#include <asm/kvm_x86_emulate.h>
+#include <asm/kvm_emulate.h>
+
+#include "mmu.h" /* for is_long_mode() */
/*
* Opcode effective-address decode tables.
@@ -60,6 +62,7 @@
#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
#define SrcOne (7<<4) /* Implied '1' */
#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
+#define SrcImmU (9<<4) /* Immediate operand, unsigned */
#define SrcMask (0xf<<4)
/* Generic ModRM decode. */
#define ModRM (1<<8)
@@ -97,11 +100,11 @@ static u32 opcode_table[256] = {
/* 0x10 - 0x17 */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
+ ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
/* 0x18 - 0x1F */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
+ ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
/* 0x20 - 0x27 */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -195,7 +198,7 @@ static u32 opcode_table[256] = {
ByteOp | SrcImmUByte, SrcImmUByte,
/* 0xE8 - 0xEF */
SrcImm | Stack, SrcImm | ImplicitOps,
- SrcImm | Src2Imm16, SrcImmByte | ImplicitOps,
+ SrcImmU | Src2Imm16, SrcImmByte | ImplicitOps,
SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
/* 0xF0 - 0xF7 */
@@ -208,7 +211,7 @@ static u32 opcode_table[256] = {
static u32 twobyte_table[256] = {
/* 0x00 - 0x0F */
- 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0,
+ 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0,
ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
/* 0x10 - 0x1F */
0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
@@ -216,7 +219,9 @@ static u32 twobyte_table[256] = {
ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
/* 0x30 - 0x3F */
- ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ ImplicitOps, 0, ImplicitOps, 0,
+ ImplicitOps, ImplicitOps, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
/* 0x40 - 0x47 */
DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
@@ -319,8 +324,11 @@ static u32 group2_table[] = {
};
/* EFLAGS bit definitions. */
+#define EFLG_VM (1<<17)
+#define EFLG_RF (1<<16)
#define EFLG_OF (1<<11)
#define EFLG_DF (1<<10)
+#define EFLG_IF (1<<9)
#define EFLG_SF (1<<7)
#define EFLG_ZF (1<<6)
#define EFLG_AF (1<<4)
@@ -1027,6 +1035,7 @@ done_prefixes:
c->src.type = OP_MEM;
break;
case SrcImm:
+ case SrcImmU:
c->src.type = OP_IMM;
c->src.ptr = (unsigned long *)c->eip;
c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
@@ -1044,6 +1053,19 @@ done_prefixes:
c->src.val = insn_fetch(s32, 4, c->eip);
break;
}
+ if ((c->d & SrcMask) == SrcImmU) {
+ switch (c->src.bytes) {
+ case 1:
+ c->src.val &= 0xff;
+ break;
+ case 2:
+ c->src.val &= 0xffff;
+ break;
+ case 4:
+ c->src.val &= 0xffffffff;
+ break;
+ }
+ }
break;
case SrcImmByte:
case SrcImmUByte:
@@ -1375,6 +1397,217 @@ static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
ctxt->interruptibility = mask;
}
+static inline void
+setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
+ struct kvm_segment *cs, struct kvm_segment *ss)
+{
+ memset(cs, 0, sizeof(struct kvm_segment));
+ kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS);
+ memset(ss, 0, sizeof(struct kvm_segment));
+
+ cs->l = 0; /* will be adjusted later */
+ cs->base = 0; /* flat segment */
+ cs->g = 1; /* 4kb granularity */
+ cs->limit = 0xffffffff; /* 4GB limit */
+ cs->type = 0x0b; /* Read, Execute, Accessed */
+ cs->s = 1;
+ cs->dpl = 0; /* will be adjusted later */
+ cs->present = 1;
+ cs->db = 1;
+
+ ss->unusable = 0;
+ ss->base = 0; /* flat segment */
+ ss->limit = 0xffffffff; /* 4GB limit */
+ ss->g = 1; /* 4kb granularity */
+ ss->s = 1;
+ ss->type = 0x03; /* Read/Write, Accessed */
+ ss->db = 1; /* 32bit stack segment */
+ ss->dpl = 0;
+ ss->present = 1;
+}
+
+static int
+emulate_syscall(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ struct kvm_segment cs, ss;
+ u64 msr_data;
+
+ /* syscall is not available in real mode */
+ if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL
+ || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE))
+ return -1;
+
+ setup_syscalls_segments(ctxt, &cs, &ss);
+
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+ msr_data >>= 32;
+ cs.selector = (u16)(msr_data & 0xfffc);
+ ss.selector = (u16)(msr_data + 8);
+
+ if (is_long_mode(ctxt->vcpu)) {
+ cs.db = 0;
+ cs.l = 1;
+ }
+ kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
+ kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+
+ c->regs[VCPU_REGS_RCX] = c->eip;
+ if (is_long_mode(ctxt->vcpu)) {
+#ifdef CONFIG_X86_64
+ c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
+
+ kvm_x86_ops->get_msr(ctxt->vcpu,
+ ctxt->mode == X86EMUL_MODE_PROT64 ?
+ MSR_LSTAR : MSR_CSTAR, &msr_data);
+ c->eip = msr_data;
+
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
+ ctxt->eflags &= ~(msr_data | EFLG_RF);
+#endif
+ } else {
+ /* legacy mode */
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+ c->eip = (u32)msr_data;
+
+ ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
+ }
+
+ return 0;
+}
+
+static int
+emulate_sysenter(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ struct kvm_segment cs, ss;
+ u64 msr_data;
+
+ /* inject #UD if LOCK prefix is used */
+ if (c->lock_prefix)
+ return -1;
+
+ /* inject #GP if in real mode or paging is disabled */
+ if (ctxt->mode == X86EMUL_MODE_REAL ||
+ !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return -1;
+ }
+
+ /* XXX sysenter/sysexit have not been tested in 64bit mode.
+ * Therefore, we inject an #UD.
+ */
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return -1;
+
+ setup_syscalls_segments(ctxt, &cs, &ss);
+
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+ switch (ctxt->mode) {
+ case X86EMUL_MODE_PROT32:
+ if ((msr_data & 0xfffc) == 0x0) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return -1;
+ }
+ break;
+ case X86EMUL_MODE_PROT64:
+ if (msr_data == 0x0) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return -1;
+ }
+ break;
+ }
+
+ ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
+ cs.selector = (u16)msr_data;
+ cs.selector &= ~SELECTOR_RPL_MASK;
+ ss.selector = cs.selector + 8;
+ ss.selector &= ~SELECTOR_RPL_MASK;
+ if (ctxt->mode == X86EMUL_MODE_PROT64
+ || is_long_mode(ctxt->vcpu)) {
+ cs.db = 0;
+ cs.l = 1;
+ }
+
+ kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
+ kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
+ c->eip = msr_data;
+
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
+ c->regs[VCPU_REGS_RSP] = msr_data;
+
+ return 0;
+}
+
+static int
+emulate_sysexit(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ struct kvm_segment cs, ss;
+ u64 msr_data;
+ int usermode;
+
+ /* inject #UD if LOCK prefix is used */
+ if (c->lock_prefix)
+ return -1;
+
+ /* inject #GP if in real mode or paging is disabled */
+ if (ctxt->mode == X86EMUL_MODE_REAL
+ || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return -1;
+ }
+
+ /* sysexit must be called from CPL 0 */
+ if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return -1;
+ }
+
+ setup_syscalls_segments(ctxt, &cs, &ss);
+
+ if ((c->rex_prefix & 0x8) != 0x0)
+ usermode = X86EMUL_MODE_PROT64;
+ else
+ usermode = X86EMUL_MODE_PROT32;
+
+ cs.dpl = 3;
+ ss.dpl = 3;
+ kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+ switch (usermode) {
+ case X86EMUL_MODE_PROT32:
+ cs.selector = (u16)(msr_data + 16);
+ if ((msr_data & 0xfffc) == 0x0) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return -1;
+ }
+ ss.selector = (u16)(msr_data + 24);
+ break;
+ case X86EMUL_MODE_PROT64:
+ cs.selector = (u16)(msr_data + 32);
+ if (msr_data == 0x0) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return -1;
+ }
+ ss.selector = cs.selector + 8;
+ cs.db = 0;
+ cs.l = 1;
+ break;
+ }
+ cs.selector |= SELECTOR_RPL_MASK;
+ ss.selector |= SELECTOR_RPL_MASK;
+
+ kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
+ kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+
+ c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
+ c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
+
+ return 0;
+}
+
int
x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
{
@@ -1970,6 +2203,12 @@ twobyte_insn:
goto cannot_emulate;
}
break;
+ case 0x05: /* syscall */
+ if (emulate_syscall(ctxt) == -1)
+ goto cannot_emulate;
+ else
+ goto writeback;
+ break;
case 0x06:
emulate_clts(ctxt->vcpu);
c->dst.type = OP_NONE;
@@ -2036,6 +2275,18 @@ twobyte_insn:
rc = X86EMUL_CONTINUE;
c->dst.type = OP_NONE;
break;
+ case 0x34: /* sysenter */
+ if (emulate_sysenter(ctxt) == -1)
+ goto cannot_emulate;
+ else
+ goto writeback;
+ break;
+ case 0x35: /* sysexit */
+ if (emulate_sysexit(ctxt) == -1)
+ goto cannot_emulate;
+ else
+ goto writeback;
+ break;
case 0x40 ... 0x4f: /* cmov */
c->dst.val = c->dst.orig_val = c->src.val;
if (!test_cc(c->b, ctxt->eflags))
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 21f68e00524..144e7f60b5e 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -116,7 +116,7 @@ static s64 __kpit_elapsed(struct kvm *kvm)
* itself with the initial count and continues counting
* from there.
*/
- remaining = hrtimer_expires_remaining(&ps->pit_timer.timer);
+ remaining = hrtimer_get_remaining(&ps->pit_timer.timer);
elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
elapsed = mod_64(elapsed, ps->pit_timer.period);
@@ -231,7 +231,7 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu)
{
struct kvm_pit *pit = vcpu->kvm->arch.vpit;
- if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack)
+ if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack)
return atomic_read(&pit->pit_state.pit_timer.pending);
return 0;
}
@@ -252,7 +252,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
struct kvm_pit *pit = vcpu->kvm->arch.vpit;
struct hrtimer *timer;
- if (vcpu->vcpu_id != 0 || !pit)
+ if (!kvm_vcpu_is_bsp(vcpu) || !pit)
return;
timer = &pit->pit_state.pit_timer.timer;
@@ -294,7 +294,7 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
pt->timer.function = kvm_timer_fn;
pt->t_ops = &kpit_ops;
pt->kvm = ps->pit->kvm;
- pt->vcpu_id = 0;
+ pt->vcpu = pt->kvm->bsp_vcpu;
atomic_set(&pt->pending, 0);
ps->irq_ack = 1;
@@ -332,33 +332,62 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
case 1:
/* FIXME: enhance mode 4 precision */
case 4:
- create_pit_timer(ps, val, 0);
+ if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) {
+ create_pit_timer(ps, val, 0);
+ }
break;
case 2:
case 3:
- create_pit_timer(ps, val, 1);
+ if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){
+ create_pit_timer(ps, val, 1);
+ }
break;
default:
destroy_pit_timer(&ps->pit_timer);
}
}
-void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val)
+void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start)
+{
+ u8 saved_mode;
+ if (hpet_legacy_start) {
+ /* save existing mode for later reenablement */
+ saved_mode = kvm->arch.vpit->pit_state.channels[0].mode;
+ kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */
+ pit_load_count(kvm, channel, val);
+ kvm->arch.vpit->pit_state.channels[0].mode = saved_mode;
+ } else {
+ pit_load_count(kvm, channel, val);
+ }
+}
+
+static inline struct kvm_pit *dev_to_pit(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_pit, dev);
+}
+
+static inline struct kvm_pit *speaker_to_pit(struct kvm_io_device *dev)
{
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
- pit_load_count(kvm, channel, val);
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ return container_of(dev, struct kvm_pit, speaker_dev);
}
-static void pit_ioport_write(struct kvm_io_device *this,
- gpa_t addr, int len, const void *data)
+static inline int pit_in_range(gpa_t addr)
{
- struct kvm_pit *pit = (struct kvm_pit *)this->private;
+ return ((addr >= KVM_PIT_BASE_ADDRESS) &&
+ (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
+}
+
+static int pit_ioport_write(struct kvm_io_device *this,
+ gpa_t addr, int len, const void *data)
+{
+ struct kvm_pit *pit = dev_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
struct kvm *kvm = pit->kvm;
int channel, access;
struct kvm_kpit_channel_state *s;
u32 val = *(u32 *) data;
+ if (!pit_in_range(addr))
+ return -EOPNOTSUPP;
val &= 0xff;
addr &= KVM_PIT_CHANNEL_MASK;
@@ -421,16 +450,19 @@ static void pit_ioport_write(struct kvm_io_device *this,
}
mutex_unlock(&pit_state->lock);
+ return 0;
}
-static void pit_ioport_read(struct kvm_io_device *this,
- gpa_t addr, int len, void *data)
+static int pit_ioport_read(struct kvm_io_device *this,
+ gpa_t addr, int len, void *data)
{
- struct kvm_pit *pit = (struct kvm_pit *)this->private;
+ struct kvm_pit *pit = dev_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
struct kvm *kvm = pit->kvm;
int ret, count;
struct kvm_kpit_channel_state *s;
+ if (!pit_in_range(addr))
+ return -EOPNOTSUPP;
addr &= KVM_PIT_CHANNEL_MASK;
s = &pit_state->channels[addr];
@@ -485,37 +517,36 @@ static void pit_ioport_read(struct kvm_io_device *this,
memcpy(data, (char *)&ret, len);
mutex_unlock(&pit_state->lock);
+ return 0;
}
-static int pit_in_range(struct kvm_io_device *this, gpa_t addr,
- int len, int is_write)
-{
- return ((addr >= KVM_PIT_BASE_ADDRESS) &&
- (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
-}
-
-static void speaker_ioport_write(struct kvm_io_device *this,
- gpa_t addr, int len, const void *data)
+static int speaker_ioport_write(struct kvm_io_device *this,
+ gpa_t addr, int len, const void *data)
{
- struct kvm_pit *pit = (struct kvm_pit *)this->private;
+ struct kvm_pit *pit = speaker_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
struct kvm *kvm = pit->kvm;
u32 val = *(u32 *) data;
+ if (addr != KVM_SPEAKER_BASE_ADDRESS)
+ return -EOPNOTSUPP;
mutex_lock(&pit_state->lock);
pit_state->speaker_data_on = (val >> 1) & 1;
pit_set_gate(kvm, 2, val & 1);
mutex_unlock(&pit_state->lock);
+ return 0;
}
-static void speaker_ioport_read(struct kvm_io_device *this,
- gpa_t addr, int len, void *data)
+static int speaker_ioport_read(struct kvm_io_device *this,
+ gpa_t addr, int len, void *data)
{
- struct kvm_pit *pit = (struct kvm_pit *)this->private;
+ struct kvm_pit *pit = speaker_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
struct kvm *kvm = pit->kvm;
unsigned int refresh_clock;
int ret;
+ if (addr != KVM_SPEAKER_BASE_ADDRESS)
+ return -EOPNOTSUPP;
/* Refresh clock toggles at about 15us. We approximate as 2^14ns. */
refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1;
@@ -527,12 +558,7 @@ static void speaker_ioport_read(struct kvm_io_device *this,
len = sizeof(ret);
memcpy(data, (char *)&ret, len);
mutex_unlock(&pit_state->lock);
-}
-
-static int speaker_in_range(struct kvm_io_device *this, gpa_t addr,
- int len, int is_write)
-{
- return (addr == KVM_SPEAKER_BASE_ADDRESS);
+ return 0;
}
void kvm_pit_reset(struct kvm_pit *pit)
@@ -541,6 +567,7 @@ void kvm_pit_reset(struct kvm_pit *pit)
struct kvm_kpit_channel_state *c;
mutex_lock(&pit->pit_state.lock);
+ pit->pit_state.flags = 0;
for (i = 0; i < 3; i++) {
c = &pit->pit_state.channels[i];
c->mode = 0xff;
@@ -563,10 +590,22 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
}
}
-struct kvm_pit *kvm_create_pit(struct kvm *kvm)
+static const struct kvm_io_device_ops pit_dev_ops = {
+ .read = pit_ioport_read,
+ .write = pit_ioport_write,
+};
+
+static const struct kvm_io_device_ops speaker_dev_ops = {
+ .read = speaker_ioport_read,
+ .write = speaker_ioport_write,
+};
+
+/* Caller must have writers lock on slots_lock */
+struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
{
struct kvm_pit *pit;
struct kvm_kpit_state *pit_state;
+ int ret;
pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
if (!pit)
@@ -582,19 +621,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
mutex_lock(&pit->pit_state.lock);
spin_lock_init(&pit->pit_state.inject_lock);
- /* Initialize PIO device */
- pit->dev.read = pit_ioport_read;
- pit->dev.write = pit_ioport_write;
- pit->dev.in_range = pit_in_range;
- pit->dev.private = pit;
- kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev);
-
- pit->speaker_dev.read = speaker_ioport_read;
- pit->speaker_dev.write = speaker_ioport_write;
- pit->speaker_dev.in_range = speaker_in_range;
- pit->speaker_dev.private = pit;
- kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev);
-
kvm->arch.vpit = pit;
pit->kvm = kvm;
@@ -613,7 +639,30 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
pit->mask_notifier.func = pit_mask_notifer;
kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+ kvm_iodevice_init(&pit->dev, &pit_dev_ops);
+ ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev);
+ if (ret < 0)
+ goto fail;
+
+ if (flags & KVM_PIT_SPEAKER_DUMMY) {
+ kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
+ ret = __kvm_io_bus_register_dev(&kvm->pio_bus,
+ &pit->speaker_dev);
+ if (ret < 0)
+ goto fail_unregister;
+ }
+
return pit;
+
+fail_unregister:
+ __kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev);
+
+fail:
+ if (pit->irq_source_id >= 0)
+ kvm_free_irq_source_id(kvm, pit->irq_source_id);
+
+ kfree(pit);
+ return NULL;
}
void kvm_free_pit(struct kvm *kvm)
@@ -623,6 +672,8 @@ void kvm_free_pit(struct kvm *kvm)
if (kvm->arch.vpit) {
kvm_unregister_irq_mask_notifier(kvm, 0,
&kvm->arch.vpit->mask_notifier);
+ kvm_unregister_irq_ack_notifier(kvm,
+ &kvm->arch.vpit->pit_state.irq_ack_notifier);
mutex_lock(&kvm->arch.vpit->pit_state.lock);
timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
hrtimer_cancel(timer);
@@ -637,10 +688,10 @@ static void __inject_pit_timer_intr(struct kvm *kvm)
struct kvm_vcpu *vcpu;
int i;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->irq_lock);
kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->irq_lock);
/*
* Provides NMI watchdog support via Virtual Wire mode.
@@ -652,11 +703,8 @@ static void __inject_pit_timer_intr(struct kvm *kvm)
* VCPU0, and only if its LVT0 is in EXTINT mode.
*/
if (kvm->arch.vapics_in_nmi_mode > 0)
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- vcpu = kvm->vcpus[i];
- if (vcpu)
- kvm_apic_nmi_wd_deliver(vcpu);
- }
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_apic_nmi_wd_deliver(vcpu);
}
void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
@@ -665,7 +713,7 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
struct kvm *kvm = vcpu->kvm;
struct kvm_kpit_state *ps;
- if (vcpu && pit) {
+ if (pit) {
int inject = 0;
ps = &pit->pit_state;
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index bbd863ff60b..d4c1c7ffdc0 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -21,6 +21,7 @@ struct kvm_kpit_channel_state {
struct kvm_kpit_state {
struct kvm_kpit_channel_state channels[3];
+ u32 flags;
struct kvm_timer pit_timer;
bool is_periodic;
u32 speaker_data_on;
@@ -49,8 +50,8 @@ struct kvm_pit {
#define KVM_PIT_CHANNEL_MASK 0x3
void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
-void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
-struct kvm_pit *kvm_create_pit(struct kvm *kvm);
+void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start);
+struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
void kvm_free_pit(struct kvm *kvm);
void kvm_pit_reset(struct kvm_pit *pit);
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 1ccb50c74f1..01f15168280 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -30,50 +30,24 @@
#include "irq.h"
#include <linux/kvm_host.h>
-
-static void pic_lock(struct kvm_pic *s)
- __acquires(&s->lock)
-{
- spin_lock(&s->lock);
-}
-
-static void pic_unlock(struct kvm_pic *s)
- __releases(&s->lock)
-{
- struct kvm *kvm = s->kvm;
- unsigned acks = s->pending_acks;
- bool wakeup = s->wakeup_needed;
- struct kvm_vcpu *vcpu;
-
- s->pending_acks = 0;
- s->wakeup_needed = false;
-
- spin_unlock(&s->lock);
-
- while (acks) {
- kvm_notify_acked_irq(kvm, SELECT_PIC(__ffs(acks)),
- __ffs(acks));
- acks &= acks - 1;
- }
-
- if (wakeup) {
- vcpu = s->kvm->vcpus[0];
- if (vcpu)
- kvm_vcpu_kick(vcpu);
- }
-}
+#include "trace.h"
static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
{
s->isr &= ~(1 << irq);
s->isr_ack |= (1 << irq);
+ if (s != &s->pics_state->pics[0])
+ irq += 8;
+ kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
}
void kvm_pic_clear_isr_ack(struct kvm *kvm)
{
struct kvm_pic *s = pic_irqchip(kvm);
+ spin_lock(&s->lock);
s->pics[0].isr_ack = 0xff;
s->pics[1].isr_ack = 0xff;
+ spin_unlock(&s->lock);
}
/*
@@ -174,9 +148,9 @@ static void pic_update_irq(struct kvm_pic *s)
void kvm_pic_update_irq(struct kvm_pic *s)
{
- pic_lock(s);
+ spin_lock(&s->lock);
pic_update_irq(s);
- pic_unlock(s);
+ spin_unlock(&s->lock);
}
int kvm_pic_set_irq(void *opaque, int irq, int level)
@@ -184,12 +158,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
struct kvm_pic *s = opaque;
int ret = -1;
- pic_lock(s);
+ spin_lock(&s->lock);
if (irq >= 0 && irq < PIC_NUM_PINS) {
ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
pic_update_irq(s);
+ trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
+ s->pics[irq >> 3].imr, ret == 0);
}
- pic_unlock(s);
+ spin_unlock(&s->lock);
return ret;
}
@@ -217,7 +193,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
int irq, irq2, intno;
struct kvm_pic *s = pic_irqchip(kvm);
- pic_lock(s);
+ spin_lock(&s->lock);
irq = pic_get_irq(&s->pics[0]);
if (irq >= 0) {
pic_intack(&s->pics[0], irq);
@@ -242,8 +218,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
intno = s->pics[0].irq_base + irq;
}
pic_update_irq(s);
- pic_unlock(s);
- kvm_notify_acked_irq(kvm, SELECT_PIC(irq), irq);
+ spin_unlock(&s->lock);
return intno;
}
@@ -252,7 +227,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
{
int irq, irqbase, n;
struct kvm *kvm = s->pics_state->irq_request_opaque;
- struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
+ struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
if (s == &s->pics_state->pics[0])
irqbase = 0;
@@ -263,7 +238,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
if (s->irr & (1 << irq) || s->isr & (1 << irq)) {
n = irq + irqbase;
- s->pics_state->pending_acks |= 1 << n;
+ kvm_notify_acked_irq(kvm, SELECT_PIC(n), n);
}
}
s->last_irr = 0;
@@ -428,8 +403,7 @@ static u32 elcr_ioport_read(void *opaque, u32 addr1)
return s->elcr;
}
-static int picdev_in_range(struct kvm_io_device *this, gpa_t addr,
- int len, int is_write)
+static int picdev_in_range(gpa_t addr)
{
switch (addr) {
case 0x20:
@@ -444,18 +418,25 @@ static int picdev_in_range(struct kvm_io_device *this, gpa_t addr,
}
}
-static void picdev_write(struct kvm_io_device *this,
+static inline struct kvm_pic *to_pic(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_pic, dev);
+}
+
+static int picdev_write(struct kvm_io_device *this,
gpa_t addr, int len, const void *val)
{
- struct kvm_pic *s = this->private;
+ struct kvm_pic *s = to_pic(this);
unsigned char data = *(unsigned char *)val;
+ if (!picdev_in_range(addr))
+ return -EOPNOTSUPP;
if (len != 1) {
if (printk_ratelimit())
printk(KERN_ERR "PIC: non byte write\n");
- return;
+ return 0;
}
- pic_lock(s);
+ spin_lock(&s->lock);
switch (addr) {
case 0x20:
case 0x21:
@@ -468,21 +449,24 @@ static void picdev_write(struct kvm_io_device *this,
elcr_ioport_write(&s->pics[addr & 1], addr, data);
break;
}
- pic_unlock(s);
+ spin_unlock(&s->lock);
+ return 0;
}
-static void picdev_read(struct kvm_io_device *this,
- gpa_t addr, int len, void *val)
+static int picdev_read(struct kvm_io_device *this,
+ gpa_t addr, int len, void *val)
{
- struct kvm_pic *s = this->private;
+ struct kvm_pic *s = to_pic(this);
unsigned char data = 0;
+ if (!picdev_in_range(addr))
+ return -EOPNOTSUPP;
if (len != 1) {
if (printk_ratelimit())
printk(KERN_ERR "PIC: non byte read\n");
- return;
+ return 0;
}
- pic_lock(s);
+ spin_lock(&s->lock);
switch (addr) {
case 0x20:
case 0x21:
@@ -496,7 +480,8 @@ static void picdev_read(struct kvm_io_device *this,
break;
}
*(unsigned char *)val = data;
- pic_unlock(s);
+ spin_unlock(&s->lock);
+ return 0;
}
/*
@@ -505,20 +490,27 @@ static void picdev_read(struct kvm_io_device *this,
static void pic_irq_request(void *opaque, int level)
{
struct kvm *kvm = opaque;
- struct kvm_vcpu *vcpu = kvm->vcpus[0];
+ struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
struct kvm_pic *s = pic_irqchip(kvm);
int irq = pic_get_irq(&s->pics[0]);
s->output = level;
if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
s->pics[0].isr_ack &= ~(1 << irq);
- s->wakeup_needed = true;
+ kvm_vcpu_kick(vcpu);
}
}
+static const struct kvm_io_device_ops picdev_ops = {
+ .read = picdev_read,
+ .write = picdev_write,
+};
+
struct kvm_pic *kvm_create_pic(struct kvm *kvm)
{
struct kvm_pic *s;
+ int ret;
+
s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
if (!s)
return NULL;
@@ -534,10 +526,12 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
/*
* Initialize PIO device
*/
- s->dev.read = picdev_read;
- s->dev.write = picdev_write;
- s->dev.in_range = picdev_in_range;
- s->dev.private = s;
- kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev);
+ kvm_iodevice_init(&s->dev, &picdev_ops);
+ ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev);
+ if (ret < 0) {
+ kfree(s);
+ return NULL;
+ }
+
return s;
}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 9f593188129..7d6058a2fd3 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -63,7 +63,6 @@ struct kvm_kpic_state {
struct kvm_pic {
spinlock_t lock;
- bool wakeup_needed;
unsigned pending_acks;
struct kvm *kvm;
struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 1ff819dce7d..7bcc5b6a440 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -29,4 +29,13 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
kvm_register_write(vcpu, VCPU_REGS_RIP, val);
}
+static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
+{
+ if (!test_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail))
+ kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
+
+ return vcpu->arch.pdptrs[index];
+}
+
#endif
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
deleted file mode 100644
index ed66e4c078d..00000000000
--- a/arch/x86/kvm/kvm_svm.h
+++ /dev/null
@@ -1,51 +0,0 @@
-#ifndef __KVM_SVM_H
-#define __KVM_SVM_H
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/list.h>
-#include <linux/kvm_host.h>
-#include <asm/msr.h>
-
-#include <asm/svm.h>
-
-static const u32 host_save_user_msrs[] = {
-#ifdef CONFIG_X86_64
- MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
- MSR_FS_BASE,
-#endif
- MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
-};
-
-#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
-
-struct kvm_vcpu;
-
-struct vcpu_svm {
- struct kvm_vcpu vcpu;
- struct vmcb *vmcb;
- unsigned long vmcb_pa;
- struct svm_cpu_data *svm_data;
- uint64_t asid_generation;
-
- u64 next_rip;
-
- u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
- u64 host_gs_base;
- unsigned long host_cr2;
-
- u32 *msrpm;
- struct vmcb *hsave;
- u64 hsave_msr;
-
- u64 nested_vmcb;
-
- /* These are the merged vectors */
- u32 *nested_msrpm;
-
- /* gpa pointers to the real vectors */
- u64 nested_vmcb_msrpm;
-};
-
-#endif
-
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
index 26bd6ba74e1..55c7524dda5 100644
--- a/arch/x86/kvm/kvm_timer.h
+++ b/arch/x86/kvm/kvm_timer.h
@@ -6,7 +6,7 @@ struct kvm_timer {
bool reinject;
struct kvm_timer_ops *t_ops;
struct kvm *kvm;
- int vcpu_id;
+ struct kvm_vcpu *vcpu;
};
struct kvm_timer_ops {
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ae99d83f81a..23c217692ea 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -32,8 +32,11 @@
#include <asm/current.h>
#include <asm/apicdef.h>
#include <asm/atomic.h>
+#include <asm/apicdef.h>
#include "kvm_cache_regs.h"
#include "irq.h"
+#include "trace.h"
+#include "x86.h"
#ifndef CONFIG_X86_64
#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
@@ -141,6 +144,26 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val)
return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
}
+void kvm_apic_set_version(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ struct kvm_cpuid_entry2 *feat;
+ u32 v = APIC_VERSION;
+
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return;
+
+ feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
+ if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31))))
+ v |= APIC_LVR_DIRECTED_EOI;
+ apic_set_reg(apic, APIC_LVR, v);
+}
+
+static inline int apic_x2apic_mode(struct kvm_lapic *apic)
+{
+ return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
+}
+
static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */
LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
@@ -165,36 +188,52 @@ static int find_highest_vector(void *bitmap)
static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
{
+ apic->irr_pending = true;
return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
}
-static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
+static inline int apic_search_irr(struct kvm_lapic *apic)
{
- apic_clear_vector(vec, apic->regs + APIC_IRR);
+ return find_highest_vector(apic->regs + APIC_IRR);
}
static inline int apic_find_highest_irr(struct kvm_lapic *apic)
{
int result;
- result = find_highest_vector(apic->regs + APIC_IRR);
+ if (!apic->irr_pending)
+ return -1;
+
+ result = apic_search_irr(apic);
ASSERT(result == -1 || result >= 16);
return result;
}
+static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
+{
+ apic->irr_pending = false;
+ apic_clear_vector(vec, apic->regs + APIC_IRR);
+ if (apic_search_irr(apic) != -1)
+ apic->irr_pending = true;
+}
+
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
int highest_irr;
+ /* This may race with setting of irr in __apic_accept_irq() and
+ * value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq
+ * will cause vmexit immediately and the value will be recalculated
+ * on the next vmentry.
+ */
if (!apic)
return 0;
highest_irr = apic_find_highest_irr(apic);
return highest_irr;
}
-EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
int vector, int level, int trig_mode);
@@ -251,7 +290,12 @@ int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
{
int result = 0;
- u8 logical_id;
+ u32 logical_id;
+
+ if (apic_x2apic_mode(apic)) {
+ logical_id = apic_get_reg(apic, APIC_LDR);
+ return logical_id & mda;
+ }
logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR));
@@ -331,6 +375,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
break;
result = !apic_test_and_set_irr(vector, apic);
+ trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
+ trig_mode, vector, !result);
if (!result) {
if (trig_mode)
apic_debug("level trig mode repeatedly for "
@@ -425,7 +471,11 @@ static void apic_set_eoi(struct kvm_lapic *apic)
trigger_mode = IOAPIC_LEVEL_TRIG;
else
trigger_mode = IOAPIC_EDGE_TRIG;
- kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
+ if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) {
+ mutex_lock(&apic->vcpu->kvm->irq_lock);
+ kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
+ mutex_unlock(&apic->vcpu->kvm->irq_lock);
+ }
}
static void apic_send_ipi(struct kvm_lapic *apic)
@@ -440,7 +490,12 @@ static void apic_send_ipi(struct kvm_lapic *apic)
irq.level = icr_low & APIC_INT_ASSERT;
irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
irq.shorthand = icr_low & APIC_SHORT_MASK;
- irq.dest_id = GET_APIC_DEST_FIELD(icr_high);
+ if (apic_x2apic_mode(apic))
+ irq.dest_id = icr_high;
+ else
+ irq.dest_id = GET_APIC_DEST_FIELD(icr_high);
+
+ trace_kvm_apic_ipi(icr_low, irq.dest_id);
apic_debug("icr_high 0x%x, icr_low 0x%x, "
"short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
@@ -449,7 +504,9 @@ static void apic_send_ipi(struct kvm_lapic *apic)
irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
irq.vector);
+ mutex_lock(&apic->vcpu->kvm->irq_lock);
kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
+ mutex_unlock(&apic->vcpu->kvm->irq_lock);
}
static u32 apic_get_tmcct(struct kvm_lapic *apic)
@@ -464,7 +521,7 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
if (apic_get_reg(apic, APIC_TMICT) == 0)
return 0;
- remaining = hrtimer_expires_remaining(&apic->lapic_timer.timer);
+ remaining = hrtimer_get_remaining(&apic->lapic_timer.timer);
if (ktime_to_ns(remaining) < 0)
remaining = ktime_set(0, 0);
@@ -495,12 +552,16 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
{
u32 val = 0;
- KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler);
-
if (offset >= LAPIC_MMIO_LENGTH)
return 0;
switch (offset) {
+ case APIC_ID:
+ if (apic_x2apic_mode(apic))
+ val = kvm_apic_id(apic);
+ else
+ val = kvm_apic_id(apic) << 24;
+ break;
case APIC_ARBPRI:
printk(KERN_WARNING "Access APIC ARBPRI register "
"which is for P6\n");
@@ -522,21 +583,35 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
return val;
}
-static void apic_mmio_read(struct kvm_io_device *this,
- gpa_t address, int len, void *data)
+static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_lapic, dev);
+}
+
+static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
+ void *data)
{
- struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
- unsigned int offset = address - apic->base_address;
unsigned char alignment = offset & 0xf;
u32 result;
+ /* this bitmask has a bit cleared for each reserver register */
+ static const u64 rmask = 0x43ff01ffffffe70cULL;
if ((alignment + len) > 4) {
- printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d",
- (unsigned long)address, len);
- return;
+ apic_debug("KVM_APIC_READ: alignment error %x %d\n",
+ offset, len);
+ return 1;
}
+
+ if (offset > 0x3f0 || !(rmask & (1ULL << (offset >> 4)))) {
+ apic_debug("KVM_APIC_READ: read reserved register %x\n",
+ offset);
+ return 1;
+ }
+
result = __apic_read(apic, offset & ~0xf);
+ trace_kvm_apic_read(offset, result);
+
switch (len) {
case 1:
case 2:
@@ -548,6 +623,28 @@ static void apic_mmio_read(struct kvm_io_device *this,
"should be 1,2, or 4 instead\n", len);
break;
}
+ return 0;
+}
+
+static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
+{
+ return apic_hw_enabled(apic) &&
+ addr >= apic->base_address &&
+ addr < apic->base_address + LAPIC_MMIO_LENGTH;
+}
+
+static int apic_mmio_read(struct kvm_io_device *this,
+ gpa_t address, int len, void *data)
+{
+ struct kvm_lapic *apic = to_lapic(this);
+ u32 offset = address - apic->base_address;
+
+ if (!apic_mmio_in_range(apic, address))
+ return -EOPNOTSUPP;
+
+ apic_reg_read(apic, offset, len, data);
+
+ return 0;
}
static void update_divide_count(struct kvm_lapic *apic)
@@ -567,12 +664,21 @@ static void start_apic_timer(struct kvm_lapic *apic)
{
ktime_t now = apic->lapic_timer.timer.base->get_time();
- apic->lapic_timer.period = apic_get_reg(apic, APIC_TMICT) *
+ apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) *
APIC_BUS_CYCLE_NS * apic->divide_count;
atomic_set(&apic->lapic_timer.pending, 0);
if (!apic->lapic_timer.period)
return;
+ /*
+ * Do not allow the guest to program periodic timers with small
+ * interval, since the hrtimers are not throttled by the host
+ * scheduler.
+ */
+ if (apic_lvtt_period(apic)) {
+ if (apic->lapic_timer.period < NSEC_PER_MSEC/2)
+ apic->lapic_timer.period = NSEC_PER_MSEC/2;
+ }
hrtimer_start(&apic->lapic_timer.timer,
ktime_add_ns(now, apic->lapic_timer.period),
@@ -603,40 +709,18 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
apic->vcpu->kvm->arch.vapics_in_nmi_mode--;
}
-static void apic_mmio_write(struct kvm_io_device *this,
- gpa_t address, int len, const void *data)
+static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
{
- struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
- unsigned int offset = address - apic->base_address;
- unsigned char alignment = offset & 0xf;
- u32 val;
-
- /*
- * APIC register must be aligned on 128-bits boundary.
- * 32/64/128 bits registers must be accessed thru 32 bits.
- * Refer SDM 8.4.1
- */
- if (len != 4 || alignment) {
- /* Don't shout loud, $infamous_os would cause only noise. */
- apic_debug("apic write: bad size=%d %lx\n",
- len, (long)address);
- return;
- }
-
- val = *(u32 *) data;
-
- /* too common printing */
- if (offset != APIC_EOI)
- apic_debug("%s: offset 0x%x with length 0x%x, and value is "
- "0x%x\n", __func__, offset, len, val);
-
- offset &= 0xff0;
+ int ret = 0;
- KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler);
+ trace_kvm_apic_write(reg, val);
- switch (offset) {
+ switch (reg) {
case APIC_ID: /* Local APIC ID */
- apic_set_reg(apic, APIC_ID, val);
+ if (!apic_x2apic_mode(apic))
+ apic_set_reg(apic, APIC_ID, val);
+ else
+ ret = 1;
break;
case APIC_TASKPRI:
@@ -649,15 +733,24 @@ static void apic_mmio_write(struct kvm_io_device *this,
break;
case APIC_LDR:
- apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
+ if (!apic_x2apic_mode(apic))
+ apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
+ else
+ ret = 1;
break;
case APIC_DFR:
- apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
+ if (!apic_x2apic_mode(apic))
+ apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
+ else
+ ret = 1;
break;
- case APIC_SPIV:
- apic_set_reg(apic, APIC_SPIV, val & 0x3ff);
+ case APIC_SPIV: {
+ u32 mask = 0x3ff;
+ if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI)
+ mask |= APIC_SPIV_DIRECTED_EOI;
+ apic_set_reg(apic, APIC_SPIV, val & mask);
if (!(val & APIC_SPIV_APIC_ENABLED)) {
int i;
u32 lvt_val;
@@ -672,7 +765,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
}
break;
-
+ }
case APIC_ICR:
/* No delay here, so we always clear the pending bit */
apic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
@@ -680,7 +773,9 @@ static void apic_mmio_write(struct kvm_io_device *this,
break;
case APIC_ICR2:
- apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
+ if (!apic_x2apic_mode(apic))
+ val &= 0xff000000;
+ apic_set_reg(apic, APIC_ICR2, val);
break;
case APIC_LVT0:
@@ -694,8 +789,8 @@ static void apic_mmio_write(struct kvm_io_device *this,
if (!apic_sw_enabled(apic))
val |= APIC_LVT_MASKED;
- val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4];
- apic_set_reg(apic, offset, val);
+ val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4];
+ apic_set_reg(apic, reg, val);
break;
@@ -703,7 +798,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
hrtimer_cancel(&apic->lapic_timer.timer);
apic_set_reg(apic, APIC_TMICT, val);
start_apic_timer(apic);
- return;
+ break;
case APIC_TDCR:
if (val & 4)
@@ -712,27 +807,59 @@ static void apic_mmio_write(struct kvm_io_device *this,
update_divide_count(apic);
break;
+ case APIC_ESR:
+ if (apic_x2apic_mode(apic) && val != 0) {
+ printk(KERN_ERR "KVM_WRITE:ESR not zero %x\n", val);
+ ret = 1;
+ }
+ break;
+
+ case APIC_SELF_IPI:
+ if (apic_x2apic_mode(apic)) {
+ apic_reg_write(apic, APIC_ICR, 0x40000 | (val & 0xff));
+ } else
+ ret = 1;
+ break;
default:
- apic_debug("Local APIC Write to read-only register %x\n",
- offset);
+ ret = 1;
break;
}
-
+ if (ret)
+ apic_debug("Local APIC Write to read-only register %x\n", reg);
+ return ret;
}
-static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr,
- int len, int size)
+static int apic_mmio_write(struct kvm_io_device *this,
+ gpa_t address, int len, const void *data)
{
- struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
- int ret = 0;
+ struct kvm_lapic *apic = to_lapic(this);
+ unsigned int offset = address - apic->base_address;
+ u32 val;
+ if (!apic_mmio_in_range(apic, address))
+ return -EOPNOTSUPP;
- if (apic_hw_enabled(apic) &&
- (addr >= apic->base_address) &&
- (addr < (apic->base_address + LAPIC_MMIO_LENGTH)))
- ret = 1;
+ /*
+ * APIC register must be aligned on 128-bits boundary.
+ * 32/64/128 bits registers must be accessed thru 32 bits.
+ * Refer SDM 8.4.1
+ */
+ if (len != 4 || (offset & 0xf)) {
+ /* Don't shout loud, $infamous_os would cause only noise. */
+ apic_debug("apic write: bad size=%d %lx\n", len, (long)address);
+ return 0;
+ }
- return ret;
+ val = *(u32*)data;
+
+ /* too common printing */
+ if (offset != APIC_EOI)
+ apic_debug("%s: offset 0x%x with length 0x%x, and value is "
+ "0x%x\n", __func__, offset, len, val);
+
+ apic_reg_write(apic, offset & 0xff0, val);
+
+ return 0;
}
void kvm_free_lapic(struct kvm_vcpu *vcpu)
@@ -763,7 +890,6 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
| (apic_get_reg(apic, APIC_TASKPRI) & 4));
}
-EXPORT_SYMBOL_GPL(kvm_lapic_set_tpr);
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
{
@@ -776,7 +902,6 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
return (tpr & 0xf0) >> 4;
}
-EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
{
@@ -787,10 +912,16 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
vcpu->arch.apic_base = value;
return;
}
- if (apic->vcpu->vcpu_id)
+
+ if (!kvm_vcpu_is_bsp(apic->vcpu))
value &= ~MSR_IA32_APICBASE_BSP;
vcpu->arch.apic_base = value;
+ if (apic_x2apic_mode(apic)) {
+ u32 id = kvm_apic_id(apic);
+ u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf));
+ apic_set_reg(apic, APIC_LDR, ldr);
+ }
apic->base_address = apic->vcpu->arch.apic_base &
MSR_IA32_APICBASE_BASE;
@@ -800,12 +931,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
}
-u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.apic_base;
-}
-EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
-
void kvm_lapic_reset(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic;
@@ -821,7 +946,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
hrtimer_cancel(&apic->lapic_timer.timer);
apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
- apic_set_reg(apic, APIC_LVR, APIC_VERSION);
+ kvm_apic_set_version(apic->vcpu);
for (i = 0; i < APIC_LVT_NUM; i++)
apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
@@ -842,9 +967,10 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
}
+ apic->irr_pending = false;
update_divide_count(apic);
atomic_set(&apic->lapic_timer.pending, 0);
- if (vcpu->vcpu_id == 0)
+ if (kvm_vcpu_is_bsp(vcpu))
vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
apic_update_ppr(apic);
@@ -855,7 +981,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
vcpu, kvm_apic_id(apic),
vcpu->arch.apic_base, apic->base_address);
}
-EXPORT_SYMBOL_GPL(kvm_lapic_reset);
bool kvm_apic_present(struct kvm_vcpu *vcpu)
{
@@ -866,7 +991,6 @@ int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
{
return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic);
}
-EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
/*
*----------------------------------------------------------------------
@@ -917,6 +1041,11 @@ static struct kvm_timer_ops lapic_timer_ops = {
.is_periodic = lapic_is_periodic,
};
+static const struct kvm_io_device_ops apic_mmio_ops = {
+ .read = apic_mmio_read,
+ .write = apic_mmio_write,
+};
+
int kvm_create_lapic(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic;
@@ -945,16 +1074,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
apic->lapic_timer.timer.function = kvm_timer_fn;
apic->lapic_timer.t_ops = &lapic_timer_ops;
apic->lapic_timer.kvm = vcpu->kvm;
- apic->lapic_timer.vcpu_id = vcpu->vcpu_id;
+ apic->lapic_timer.vcpu = vcpu;
apic->base_address = APIC_DEFAULT_PHYS_BASE;
vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
kvm_lapic_reset(vcpu);
- apic->dev.read = apic_mmio_read;
- apic->dev.write = apic_mmio_write;
- apic->dev.in_range = apic_mmio_range;
- apic->dev.private = apic;
+ kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
return 0;
nomem_free_apic:
@@ -962,7 +1088,6 @@ nomem_free_apic:
nomem:
return -ENOMEM;
}
-EXPORT_SYMBOL_GPL(kvm_create_lapic);
int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
@@ -985,7 +1110,7 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
int r = 0;
- if (vcpu->vcpu_id == 0) {
+ if (kvm_vcpu_is_bsp(vcpu)) {
if (!apic_hw_enabled(vcpu->arch.apic))
r = 1;
if ((lvt0 & APIC_LVT_MASKED) == 0 &&
@@ -1025,7 +1150,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
apic->base_address = vcpu->arch.apic_base &
MSR_IA32_APICBASE_BASE;
- apic_set_reg(apic, APIC_LVR, APIC_VERSION);
+ kvm_apic_set_version(vcpu);
+
apic_update_ppr(apic);
hrtimer_cancel(&apic->lapic_timer.timer);
update_divide_count(apic);
@@ -1092,3 +1218,35 @@ void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
vcpu->arch.apic->vapic_addr = vapic_addr;
}
+
+int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg = (msr - APIC_BASE_MSR) << 4;
+
+ if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
+ return 1;
+
+ /* if this is ICR write vector before command */
+ if (msr == 0x830)
+ apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
+ return apic_reg_write(apic, reg, (u32)data);
+}
+
+int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
+
+ if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
+ return 1;
+
+ if (apic_reg_read(apic, reg, 4, &low))
+ return 1;
+ if (msr == 0x830)
+ apic_reg_read(apic, APIC_ICR2, 4, &high);
+
+ *data = (((u64)high) << 32) | low;
+
+ return 0;
+}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index a587f8349c4..40010b09c4a 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -12,6 +12,7 @@ struct kvm_lapic {
struct kvm_timer lapic_timer;
u32 divide_count;
struct kvm_vcpu *vcpu;
+ bool irr_pending;
struct page *regs_page;
void *regs;
gpa_t vapic_addr;
@@ -28,6 +29,7 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
+void kvm_apic_set_version(struct kvm_vcpu *vcpu);
int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
@@ -44,4 +46,6 @@ void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
+int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0ef5bb2b404..818b92ad82c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -18,6 +18,7 @@
*/
#include "mmu.h"
+#include "kvm_cache_regs.h"
#include <linux/kvm_host.h>
#include <linux/types.h>
@@ -107,6 +108,9 @@ module_param(oos_shadow, bool, 0644);
#define PT32_LEVEL_MASK(level) \
(((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
+#define PT32_LVL_OFFSET_MASK(level) \
+ (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
+ * PT32_LEVEL_BITS))) - 1))
#define PT32_INDEX(address, level)\
(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
@@ -115,10 +119,19 @@ module_param(oos_shadow, bool, 0644);
#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
#define PT64_DIR_BASE_ADDR_MASK \
(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
+#define PT64_LVL_ADDR_MASK(level) \
+ (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
+ * PT64_LEVEL_BITS))) - 1))
+#define PT64_LVL_OFFSET_MASK(level) \
+ (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
+ * PT64_LEVEL_BITS))) - 1))
#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
+#define PT32_LVL_ADDR_MASK(level) \
+ (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
+ * PT32_LEVEL_BITS))) - 1))
#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
| PT64_NX_MASK)
@@ -129,6 +142,7 @@ module_param(oos_shadow, bool, 0644);
#define PFERR_RSVD_MASK (1U << 3)
#define PFERR_FETCH_MASK (1U << 4)
+#define PT_PDPE_LEVEL 3
#define PT_DIRECTORY_LEVEL 2
#define PT_PAGE_TABLE_LEVEL 1
@@ -139,10 +153,15 @@ module_param(oos_shadow, bool, 0644);
#define ACC_USER_MASK PT_USER_MASK
#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+#define CREATE_TRACE_POINTS
+#include "mmutrace.h"
+
+#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
struct kvm_rmap_desc {
- u64 *shadow_ptes[RMAP_EXT];
+ u64 *sptes[RMAP_EXT];
struct kvm_rmap_desc *more;
};
@@ -239,16 +258,25 @@ static int is_writeble_pte(unsigned long pte)
return pte & PT_WRITABLE_MASK;
}
-static int is_dirty_pte(unsigned long pte)
+static int is_dirty_gpte(unsigned long pte)
{
- return pte & shadow_dirty_mask;
+ return pte & PT_DIRTY_MASK;
}
-static int is_rmap_pte(u64 pte)
+static int is_rmap_spte(u64 pte)
{
return is_shadow_present_pte(pte);
}
+static int is_last_spte(u64 pte, int level)
+{
+ if (level == PT_PAGE_TABLE_LEVEL)
+ return 1;
+ if (is_large_pte(pte))
+ return 1;
+ return 0;
+}
+
static pfn_t spte_to_pfn(u64 pte)
{
return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -261,7 +289,7 @@ static gfn_t pse36_gfn_delta(u32 gpte)
return (gpte & PT32_DIR_PSE36_MASK) << shift;
}
-static void set_shadow_pte(u64 *sptep, u64 spte)
+static void __set_spte(u64 *sptep, u64 spte)
{
#ifdef CONFIG_X86_64
set_64bit((unsigned long *)sptep, spte);
@@ -380,37 +408,52 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
* Return the pointer to the largepage write count for a given
* gfn, handling slots that are not large page aligned.
*/
-static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
+static int *slot_largepage_idx(gfn_t gfn,
+ struct kvm_memory_slot *slot,
+ int level)
{
unsigned long idx;
- idx = (gfn / KVM_PAGES_PER_HPAGE) -
- (slot->base_gfn / KVM_PAGES_PER_HPAGE);
- return &slot->lpage_info[idx].write_count;
+ idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
+ (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
+ return &slot->lpage_info[level - 2][idx].write_count;
}
static void account_shadowed(struct kvm *kvm, gfn_t gfn)
{
+ struct kvm_memory_slot *slot;
int *write_count;
+ int i;
gfn = unalias_gfn(kvm, gfn);
- write_count = slot_largepage_idx(gfn,
- gfn_to_memslot_unaliased(kvm, gfn));
- *write_count += 1;
+
+ slot = gfn_to_memslot_unaliased(kvm, gfn);
+ for (i = PT_DIRECTORY_LEVEL;
+ i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+ write_count = slot_largepage_idx(gfn, slot, i);
+ *write_count += 1;
+ }
}
static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
{
+ struct kvm_memory_slot *slot;
int *write_count;
+ int i;
gfn = unalias_gfn(kvm, gfn);
- write_count = slot_largepage_idx(gfn,
- gfn_to_memslot_unaliased(kvm, gfn));
- *write_count -= 1;
- WARN_ON(*write_count < 0);
+ for (i = PT_DIRECTORY_LEVEL;
+ i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+ slot = gfn_to_memslot_unaliased(kvm, gfn);
+ write_count = slot_largepage_idx(gfn, slot, i);
+ *write_count -= 1;
+ WARN_ON(*write_count < 0);
+ }
}
-static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
+static int has_wrprotected_page(struct kvm *kvm,
+ gfn_t gfn,
+ int level)
{
struct kvm_memory_slot *slot;
int *largepage_idx;
@@ -418,47 +461,67 @@ static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
gfn = unalias_gfn(kvm, gfn);
slot = gfn_to_memslot_unaliased(kvm, gfn);
if (slot) {
- largepage_idx = slot_largepage_idx(gfn, slot);
+ largepage_idx = slot_largepage_idx(gfn, slot, level);
return *largepage_idx;
}
return 1;
}
-static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
+static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
{
+ unsigned long page_size = PAGE_SIZE;
struct vm_area_struct *vma;
unsigned long addr;
- int ret = 0;
+ int i, ret = 0;
addr = gfn_to_hva(kvm, gfn);
if (kvm_is_error_hva(addr))
- return ret;
+ return page_size;
down_read(&current->mm->mmap_sem);
vma = find_vma(current->mm, addr);
- if (vma && is_vm_hugetlb_page(vma))
- ret = 1;
+ if (!vma)
+ goto out;
+
+ page_size = vma_kernel_pagesize(vma);
+
+out:
up_read(&current->mm->mmap_sem);
+ for (i = PT_PAGE_TABLE_LEVEL;
+ i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
+ if (page_size >= KVM_HPAGE_SIZE(i))
+ ret = i;
+ else
+ break;
+ }
+
return ret;
}
-static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
{
struct kvm_memory_slot *slot;
-
- if (has_wrprotected_page(vcpu->kvm, large_gfn))
- return 0;
-
- if (!host_largepage_backed(vcpu->kvm, large_gfn))
- return 0;
+ int host_level;
+ int level = PT_PAGE_TABLE_LEVEL;
slot = gfn_to_memslot(vcpu->kvm, large_gfn);
if (slot && slot->dirty_bitmap)
- return 0;
+ return PT_PAGE_TABLE_LEVEL;
- return 1;
+ host_level = host_mapping_level(vcpu->kvm, large_gfn);
+
+ if (host_level == PT_PAGE_TABLE_LEVEL)
+ return host_level;
+
+ for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) {
+
+ if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
+ break;
+ }
+
+ return level - 1;
}
/*
@@ -466,19 +529,19 @@ static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
* Note: gfn must be unaliased before this function get called
*/
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
{
struct kvm_memory_slot *slot;
unsigned long idx;
slot = gfn_to_memslot(kvm, gfn);
- if (!lpage)
+ if (likely(level == PT_PAGE_TABLE_LEVEL))
return &slot->rmap[gfn - slot->base_gfn];
- idx = (gfn / KVM_PAGES_PER_HPAGE) -
- (slot->base_gfn / KVM_PAGES_PER_HPAGE);
+ idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
+ (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
- return &slot->lpage_info[idx].rmap_pde;
+ return &slot->lpage_info[level - 2][idx].rmap_pde;
}
/*
@@ -494,42 +557,42 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
* the spte was not added.
*
*/
-static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
+static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
struct kvm_mmu_page *sp;
struct kvm_rmap_desc *desc;
unsigned long *rmapp;
int i, count = 0;
- if (!is_rmap_pte(*spte))
+ if (!is_rmap_spte(*spte))
return count;
gfn = unalias_gfn(vcpu->kvm, gfn);
sp = page_header(__pa(spte));
sp->gfns[spte - sp->spt] = gfn;
- rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
+ rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
if (!*rmapp) {
rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
*rmapp = (unsigned long)spte;
} else if (!(*rmapp & 1)) {
rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
desc = mmu_alloc_rmap_desc(vcpu);
- desc->shadow_ptes[0] = (u64 *)*rmapp;
- desc->shadow_ptes[1] = spte;
+ desc->sptes[0] = (u64 *)*rmapp;
+ desc->sptes[1] = spte;
*rmapp = (unsigned long)desc | 1;
} else {
rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
- while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) {
+ while (desc->sptes[RMAP_EXT-1] && desc->more) {
desc = desc->more;
count += RMAP_EXT;
}
- if (desc->shadow_ptes[RMAP_EXT-1]) {
+ if (desc->sptes[RMAP_EXT-1]) {
desc->more = mmu_alloc_rmap_desc(vcpu);
desc = desc->more;
}
- for (i = 0; desc->shadow_ptes[i]; ++i)
+ for (i = 0; desc->sptes[i]; ++i)
;
- desc->shadow_ptes[i] = spte;
+ desc->sptes[i] = spte;
}
return count;
}
@@ -541,14 +604,14 @@ static void rmap_desc_remove_entry(unsigned long *rmapp,
{
int j;
- for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
+ for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j)
;
- desc->shadow_ptes[i] = desc->shadow_ptes[j];
- desc->shadow_ptes[j] = NULL;
+ desc->sptes[i] = desc->sptes[j];
+ desc->sptes[j] = NULL;
if (j != 0)
return;
if (!prev_desc && !desc->more)
- *rmapp = (unsigned long)desc->shadow_ptes[0];
+ *rmapp = (unsigned long)desc->sptes[0];
else
if (prev_desc)
prev_desc->more = desc->more;
@@ -566,17 +629,15 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
unsigned long *rmapp;
int i;
- if (!is_rmap_pte(*spte))
+ if (!is_rmap_spte(*spte))
return;
sp = page_header(__pa(spte));
pfn = spte_to_pfn(*spte);
if (*spte & shadow_accessed_mask)
kvm_set_pfn_accessed(pfn);
if (is_writeble_pte(*spte))
- kvm_release_pfn_dirty(pfn);
- else
- kvm_release_pfn_clean(pfn);
- rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte));
+ kvm_set_pfn_dirty(pfn);
+ rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
if (!*rmapp) {
printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
BUG();
@@ -593,8 +654,8 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
prev_desc = NULL;
while (desc) {
- for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
- if (desc->shadow_ptes[i] == spte) {
+ for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i)
+ if (desc->sptes[i] == spte) {
rmap_desc_remove_entry(rmapp,
desc, i,
prev_desc);
@@ -625,10 +686,10 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
prev_desc = NULL;
prev_spte = NULL;
while (desc) {
- for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
+ for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
if (prev_spte == spte)
- return desc->shadow_ptes[i];
- prev_spte = desc->shadow_ptes[i];
+ return desc->sptes[i];
+ prev_spte = desc->sptes[i];
}
desc = desc->more;
}
@@ -639,10 +700,10 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
{
unsigned long *rmapp;
u64 *spte;
- int write_protected = 0;
+ int i, write_protected = 0;
gfn = unalias_gfn(kvm, gfn);
- rmapp = gfn_to_rmap(kvm, gfn, 0);
+ rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
spte = rmap_next(kvm, rmapp, NULL);
while (spte) {
@@ -650,7 +711,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
if (is_writeble_pte(*spte)) {
- set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
+ __set_spte(spte, *spte & ~PT_WRITABLE_MASK);
write_protected = 1;
}
spte = rmap_next(kvm, rmapp, spte);
@@ -664,27 +725,31 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
}
/* check for huge page mappings */
- rmapp = gfn_to_rmap(kvm, gfn, 1);
- spte = rmap_next(kvm, rmapp, NULL);
- while (spte) {
- BUG_ON(!spte);
- BUG_ON(!(*spte & PT_PRESENT_MASK));
- BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
- pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
- if (is_writeble_pte(*spte)) {
- rmap_remove(kvm, spte);
- --kvm->stat.lpages;
- set_shadow_pte(spte, shadow_trap_nonpresent_pte);
- spte = NULL;
- write_protected = 1;
+ for (i = PT_DIRECTORY_LEVEL;
+ i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+ rmapp = gfn_to_rmap(kvm, gfn, i);
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ BUG_ON(!spte);
+ BUG_ON(!(*spte & PT_PRESENT_MASK));
+ BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
+ pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
+ if (is_writeble_pte(*spte)) {
+ rmap_remove(kvm, spte);
+ --kvm->stat.lpages;
+ __set_spte(spte, shadow_trap_nonpresent_pte);
+ spte = NULL;
+ write_protected = 1;
+ }
+ spte = rmap_next(kvm, rmapp, spte);
}
- spte = rmap_next(kvm, rmapp, spte);
}
return write_protected;
}
-static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long data)
{
u64 *spte;
int need_tlb_flush = 0;
@@ -693,16 +758,55 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
rmap_remove(kvm, spte);
- set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+ __set_spte(spte, shadow_trap_nonpresent_pte);
need_tlb_flush = 1;
}
return need_tlb_flush;
}
+static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long data)
+{
+ int need_flush = 0;
+ u64 *spte, new_spte;
+ pte_t *ptep = (pte_t *)data;
+ pfn_t new_pfn;
+
+ WARN_ON(pte_huge(*ptep));
+ new_pfn = pte_pfn(*ptep);
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ BUG_ON(!is_shadow_present_pte(*spte));
+ rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
+ need_flush = 1;
+ if (pte_write(*ptep)) {
+ rmap_remove(kvm, spte);
+ __set_spte(spte, shadow_trap_nonpresent_pte);
+ spte = rmap_next(kvm, rmapp, NULL);
+ } else {
+ new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
+ new_spte |= (u64)new_pfn << PAGE_SHIFT;
+
+ new_spte &= ~PT_WRITABLE_MASK;
+ new_spte &= ~SPTE_HOST_WRITEABLE;
+ if (is_writeble_pte(*spte))
+ kvm_set_pfn_dirty(spte_to_pfn(*spte));
+ __set_spte(spte, new_spte);
+ spte = rmap_next(kvm, rmapp, spte);
+ }
+ }
+ if (need_flush)
+ kvm_flush_remote_tlbs(kvm);
+
+ return 0;
+}
+
static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- int (*handler)(struct kvm *kvm, unsigned long *rmapp))
+ unsigned long data,
+ int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long data))
{
- int i;
+ int i, j;
int retval = 0;
/*
@@ -721,11 +825,17 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
end = start + (memslot->npages << PAGE_SHIFT);
if (hva >= start && hva < end) {
gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
- retval |= handler(kvm, &memslot->rmap[gfn_offset]);
- retval |= handler(kvm,
- &memslot->lpage_info[
- gfn_offset /
- KVM_PAGES_PER_HPAGE].rmap_pde);
+
+ retval |= handler(kvm, &memslot->rmap[gfn_offset],
+ data);
+
+ for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
+ int idx = gfn_offset;
+ idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
+ retval |= handler(kvm,
+ &memslot->lpage_info[j][idx].rmap_pde,
+ data);
+ }
}
}
@@ -734,10 +844,16 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
{
- return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+ return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
}
-static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+ kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
+}
+
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long data)
{
u64 *spte;
int young = 0;
@@ -763,20 +879,23 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
#define RMAP_RECYCLE_THRESHOLD 1000
-static void rmap_recycle(struct kvm_vcpu *vcpu, gfn_t gfn, int lpage)
+static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
unsigned long *rmapp;
+ struct kvm_mmu_page *sp;
+
+ sp = page_header(__pa(spte));
gfn = unalias_gfn(vcpu->kvm, gfn);
- rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
+ rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
- kvm_unmap_rmapp(vcpu->kvm, rmapp);
+ kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
kvm_flush_remote_tlbs(vcpu->kvm);
}
int kvm_age_hva(struct kvm *kvm, unsigned long hva)
{
- return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
+ return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
}
#ifdef MMU_DEBUG
@@ -1109,6 +1228,7 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
return 1;
}
+ trace_kvm_mmu_sync_page(sp);
if (rmap_write_protect(vcpu->kvm, sp->gfn))
kvm_flush_remote_tlbs(vcpu->kvm);
kvm_unlink_unsync_page(vcpu->kvm, sp);
@@ -1231,8 +1351,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
}
- pgprintk("%s: looking gfn %lx role %x\n", __func__,
- gfn, role.word);
index = kvm_page_table_hashfn(gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
@@ -1249,14 +1367,13 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
kvm_mmu_mark_parents_unsync(vcpu, sp);
}
- pgprintk("%s: found\n", __func__);
+ trace_kvm_mmu_get_page(sp, false);
return sp;
}
++vcpu->kvm->stat.mmu_cache_miss;
sp = kvm_mmu_alloc_page(vcpu, parent_pte);
if (!sp)
return sp;
- pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
sp->gfn = gfn;
sp->role = role;
hlist_add_head(&sp->hash_link, bucket);
@@ -1269,6 +1386,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
vcpu->arch.mmu.prefetch_page(vcpu, sp);
else
nonpaging_prefetch_page(vcpu, sp);
+ trace_kvm_mmu_get_page(sp, true);
return sp;
}
@@ -1292,6 +1410,11 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
{
if (iterator->level < PT_PAGE_TABLE_LEVEL)
return false;
+
+ if (iterator->level == PT_PAGE_TABLE_LEVEL)
+ if (is_large_pte(*iterator->sptep))
+ return false;
+
iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
return true;
@@ -1312,25 +1435,17 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
pt = sp->spt;
- if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (is_shadow_present_pte(pt[i]))
- rmap_remove(kvm, &pt[i]);
- pt[i] = shadow_trap_nonpresent_pte;
- }
- return;
- }
-
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
ent = pt[i];
if (is_shadow_present_pte(ent)) {
- if (!is_large_pte(ent)) {
+ if (!is_last_spte(ent, sp->role.level)) {
ent &= PT64_BASE_ADDR_MASK;
mmu_page_remove_parent_pte(page_header(ent),
&pt[i]);
} else {
- --kvm->stat.lpages;
+ if (is_large_pte(ent))
+ --kvm->stat.lpages;
rmap_remove(kvm, &pt[i]);
}
}
@@ -1346,10 +1461,10 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
{
int i;
+ struct kvm_vcpu *vcpu;
- for (i = 0; i < KVM_MAX_VCPUS; ++i)
- if (kvm->vcpus[i])
- kvm->vcpus[i]->arch.last_pte_updated = NULL;
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ vcpu->arch.last_pte_updated = NULL;
}
static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -1368,7 +1483,7 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
}
BUG_ON(!parent_pte);
kvm_mmu_put_page(sp, parent_pte);
- set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
+ __set_spte(parent_pte, shadow_trap_nonpresent_pte);
}
}
@@ -1400,6 +1515,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
int ret;
+
+ trace_kvm_mmu_zap_page(sp);
++kvm->stat.mmu_shadow_zapped;
ret = mmu_zap_unsync_children(kvm, sp);
kvm_mmu_page_unlink_children(kvm, sp);
@@ -1516,7 +1633,7 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp)
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
if (pt[i] == shadow_notrap_nonpresent_pte)
- set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte);
+ __set_spte(&pt[i], shadow_trap_nonpresent_pte);
}
}
@@ -1646,6 +1763,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
struct kvm_mmu_page *s;
struct hlist_node *node, *n;
+ trace_kvm_mmu_unsync_page(sp);
index = kvm_page_table_hashfn(sp->gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
/* don't unsync if pagetable is shadowed with multiple roles */
@@ -1682,11 +1800,11 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
return 0;
}
-static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int user_fault,
- int write_fault, int dirty, int largepage,
+ int write_fault, int dirty, int level,
gfn_t gfn, pfn_t pfn, bool speculative,
- bool can_unsync)
+ bool can_unsync, bool reset_host_protection)
{
u64 spte;
int ret = 0;
@@ -1707,18 +1825,22 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
spte |= shadow_nx_mask;
if (pte_access & ACC_USER_MASK)
spte |= shadow_user_mask;
- if (largepage)
+ if (level > PT_PAGE_TABLE_LEVEL)
spte |= PT_PAGE_SIZE_MASK;
if (tdp_enabled)
spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
kvm_is_mmio_pfn(pfn));
+ if (reset_host_protection)
+ spte |= SPTE_HOST_WRITEABLE;
+
spte |= (u64)pfn << PAGE_SHIFT;
if ((pte_access & ACC_WRITE_MASK)
|| (write_fault && !is_write_protection(vcpu) && !user_fault)) {
- if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
+ if (level > PT_PAGE_TABLE_LEVEL &&
+ has_wrprotected_page(vcpu->kvm, gfn, level)) {
ret = 1;
spte = shadow_trap_nonpresent_pte;
goto set_pte;
@@ -1732,7 +1854,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
* is responsibility of mmu_get_page / kvm_sync_page.
* Same reasoning can be applied to dirty page accounting.
*/
- if (!can_unsync && is_writeble_pte(*shadow_pte))
+ if (!can_unsync && is_writeble_pte(*sptep))
goto set_pte;
if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
@@ -1749,65 +1871,68 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
mark_page_dirty(vcpu->kvm, gfn);
set_pte:
- set_shadow_pte(shadow_pte, spte);
+ __set_spte(sptep, spte);
return ret;
}
-static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pt_access, unsigned pte_access,
int user_fault, int write_fault, int dirty,
- int *ptwrite, int largepage, gfn_t gfn,
- pfn_t pfn, bool speculative)
+ int *ptwrite, int level, gfn_t gfn,
+ pfn_t pfn, bool speculative,
+ bool reset_host_protection)
{
int was_rmapped = 0;
- int was_writeble = is_writeble_pte(*shadow_pte);
+ int was_writeble = is_writeble_pte(*sptep);
int rmap_count;
pgprintk("%s: spte %llx access %x write_fault %d"
" user_fault %d gfn %lx\n",
- __func__, *shadow_pte, pt_access,
+ __func__, *sptep, pt_access,
write_fault, user_fault, gfn);
- if (is_rmap_pte(*shadow_pte)) {
+ if (is_rmap_spte(*sptep)) {
/*
* If we overwrite a PTE page pointer with a 2MB PMD, unlink
* the parent of the now unreachable PTE.
*/
- if (largepage && !is_large_pte(*shadow_pte)) {
+ if (level > PT_PAGE_TABLE_LEVEL &&
+ !is_large_pte(*sptep)) {
struct kvm_mmu_page *child;
- u64 pte = *shadow_pte;
+ u64 pte = *sptep;
child = page_header(pte & PT64_BASE_ADDR_MASK);
- mmu_page_remove_parent_pte(child, shadow_pte);
- } else if (pfn != spte_to_pfn(*shadow_pte)) {
+ mmu_page_remove_parent_pte(child, sptep);
+ } else if (pfn != spte_to_pfn(*sptep)) {
pgprintk("hfn old %lx new %lx\n",
- spte_to_pfn(*shadow_pte), pfn);
- rmap_remove(vcpu->kvm, shadow_pte);
+ spte_to_pfn(*sptep), pfn);
+ rmap_remove(vcpu->kvm, sptep);
} else
was_rmapped = 1;
}
- if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
- dirty, largepage, gfn, pfn, speculative, true)) {
+
+ if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
+ dirty, level, gfn, pfn, speculative, true,
+ reset_host_protection)) {
if (write_fault)
*ptwrite = 1;
kvm_x86_ops->tlb_flush(vcpu);
}
- pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte);
+ pgprintk("%s: setting spte %llx\n", __func__, *sptep);
pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
- is_large_pte(*shadow_pte)? "2MB" : "4kB",
- is_present_pte(*shadow_pte)?"RW":"R", gfn,
- *shadow_pte, shadow_pte);
- if (!was_rmapped && is_large_pte(*shadow_pte))
+ is_large_pte(*sptep)? "2MB" : "4kB",
+ *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
+ *sptep, sptep);
+ if (!was_rmapped && is_large_pte(*sptep))
++vcpu->kvm->stat.lpages;
- page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
+ page_header_update_slot(vcpu->kvm, sptep, gfn);
if (!was_rmapped) {
- rmap_count = rmap_add(vcpu, shadow_pte, gfn, largepage);
- if (!is_rmap_pte(*shadow_pte))
- kvm_release_pfn_clean(pfn);
+ rmap_count = rmap_add(vcpu, sptep, gfn);
+ kvm_release_pfn_clean(pfn);
if (rmap_count > RMAP_RECYCLE_THRESHOLD)
- rmap_recycle(vcpu, gfn, largepage);
+ rmap_recycle(vcpu, sptep, gfn);
} else {
if (was_writeble)
kvm_release_pfn_dirty(pfn);
@@ -1815,7 +1940,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
kvm_release_pfn_clean(pfn);
}
if (speculative) {
- vcpu->arch.last_pte_updated = shadow_pte;
+ vcpu->arch.last_pte_updated = sptep;
vcpu->arch.last_pte_gfn = gfn;
}
}
@@ -1825,7 +1950,7 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
}
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
- int largepage, gfn_t gfn, pfn_t pfn)
+ int level, gfn_t gfn, pfn_t pfn)
{
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
@@ -1833,11 +1958,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
gfn_t pseudo_gfn;
for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
- if (iterator.level == PT_PAGE_TABLE_LEVEL
- || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) {
+ if (iterator.level == level) {
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
0, write, 1, &pt_write,
- largepage, gfn, pfn, false);
+ level, gfn, pfn, false, true);
++vcpu->stat.pf_fixed;
break;
}
@@ -1853,10 +1977,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
return -ENOMEM;
}
- set_shadow_pte(iterator.sptep,
- __pa(sp->spt)
- | PT_PRESENT_MASK | PT_WRITABLE_MASK
- | shadow_user_mask | shadow_x_mask);
+ __set_spte(iterator.sptep,
+ __pa(sp->spt)
+ | PT_PRESENT_MASK | PT_WRITABLE_MASK
+ | shadow_user_mask | shadow_x_mask);
}
}
return pt_write;
@@ -1865,14 +1989,20 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
{
int r;
- int largepage = 0;
+ int level;
pfn_t pfn;
unsigned long mmu_seq;
- if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
- gfn &= ~(KVM_PAGES_PER_HPAGE-1);
- largepage = 1;
- }
+ level = mapping_level(vcpu, gfn);
+
+ /*
+ * This path builds a PAE pagetable - so we can map 2mb pages at
+ * maximum. Therefore check if the level is larger than that.
+ */
+ if (level > PT_DIRECTORY_LEVEL)
+ level = PT_DIRECTORY_LEVEL;
+
+ gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
@@ -1888,7 +2018,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
- r = __direct_map(vcpu, v, write, largepage, gfn, pfn);
+ r = __direct_map(vcpu, v, write, level, gfn, pfn);
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -1954,6 +2084,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
gfn_t root_gfn;
struct kvm_mmu_page *sp;
int direct = 0;
+ u64 pdptr;
root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
@@ -1981,11 +2112,12 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
ASSERT(!VALID_PAGE(root));
if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
- if (!is_present_pte(vcpu->arch.pdptrs[i])) {
+ pdptr = kvm_pdptr_read(vcpu, i);
+ if (!is_present_gpte(pdptr)) {
vcpu->arch.mmu.pae_root[i] = 0;
continue;
}
- root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
+ root_gfn = pdptr >> PAGE_SHIFT;
} else if (vcpu->arch.mmu.root_level == 0)
root_gfn = 0;
if (mmu_check_root(vcpu, root_gfn))
@@ -2062,7 +2194,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
{
pfn_t pfn;
int r;
- int largepage = 0;
+ int level;
gfn_t gfn = gpa >> PAGE_SHIFT;
unsigned long mmu_seq;
@@ -2073,10 +2205,10 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
if (r)
return r;
- if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
- gfn &= ~(KVM_PAGES_PER_HPAGE-1);
- largepage = 1;
- }
+ level = mapping_level(vcpu, gfn);
+
+ gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, gfn);
@@ -2089,7 +2221,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
- largepage, gfn, pfn);
+ level, gfn, pfn);
spin_unlock(&vcpu->kvm->mmu_lock);
return r;
@@ -2206,7 +2338,9 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 51);
context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
- context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2];
+ context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51) |
+ rsvd_bits(13, 29);
context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 51) |
rsvd_bits(13, 20); /* large page */
@@ -2357,8 +2491,8 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
spin_unlock(&vcpu->kvm->mmu_lock);
if (r)
goto out;
+ /* set_cr3() should ensure TLB has been flushed */
kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
- kvm_mmu_flush_tlb(vcpu);
out:
return r;
}
@@ -2378,15 +2512,14 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
pte = *spte;
if (is_shadow_present_pte(pte)) {
- if (sp->role.level == PT_PAGE_TABLE_LEVEL ||
- is_large_pte(pte))
+ if (is_last_spte(pte, sp->role.level))
rmap_remove(vcpu->kvm, spte);
else {
child = page_header(pte & PT64_BASE_ADDR_MASK);
mmu_page_remove_parent_pte(child, spte);
}
}
- set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+ __set_spte(spte, shadow_trap_nonpresent_pte);
if (is_large_pte(pte))
--vcpu->kvm->stat.lpages;
}
@@ -2397,11 +2530,8 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
const void *new)
{
if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
- if (!vcpu->arch.update_pte.largepage ||
- sp->role.glevels == PT32_ROOT_LEVEL) {
- ++vcpu->kvm->stat.mmu_pde_zapped;
- return;
- }
+ ++vcpu->kvm->stat.mmu_pde_zapped;
+ return;
}
++vcpu->kvm->stat.mmu_pte_updated;
@@ -2447,8 +2577,6 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
u64 gpte = 0;
pfn_t pfn;
- vcpu->arch.update_pte.largepage = 0;
-
if (bytes != 4 && bytes != 8)
return;
@@ -2472,14 +2600,10 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
if ((bytes == 4) && (gpa % 4 == 0))
memcpy((void *)&gpte, new, 4);
}
- if (!is_present_pte(gpte))
+ if (!is_present_gpte(gpte))
return;
gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
- if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
- gfn &= ~(KVM_PAGES_PER_HPAGE-1);
- vcpu->arch.update_pte.largepage = 1;
- }
vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, gfn);
@@ -2622,6 +2746,9 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
gpa_t gpa;
int r;
+ if (tdp_enabled)
+ return 0;
+
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
spin_lock(&vcpu->kvm->mmu_lock);
@@ -2633,7 +2760,8 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
{
- while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
+ while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES &&
+ !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
struct kvm_mmu_page *sp;
sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
@@ -2670,8 +2798,9 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
++vcpu->stat.mmio_exits;
return 0;
case EMULATE_FAIL:
- kvm_report_emulation_failure(vcpu, "pagetable");
- return 1;
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ return 0;
default:
BUG();
}
@@ -2712,12 +2841,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
ASSERT(vcpu);
- if (vcpu->kvm->arch.n_requested_mmu_pages)
- vcpu->kvm->arch.n_free_mmu_pages =
- vcpu->kvm->arch.n_requested_mmu_pages;
- else
- vcpu->kvm->arch.n_free_mmu_pages =
- vcpu->kvm->arch.n_alloc_mmu_pages;
/*
* When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
* Therefore we need to allocate shadow page tables in the first
@@ -3029,6 +3152,24 @@ out:
return r;
}
+int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
+{
+ struct kvm_shadow_walk_iterator iterator;
+ int nr_sptes = 0;
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ for_each_shadow_entry(vcpu, addr, iterator) {
+ sptes[iterator.level-1] = *iterator.sptep;
+ nr_sptes++;
+ if (!is_shadow_present_pte(*iterator.sptep))
+ break;
+ }
+ spin_unlock(&vcpu->kvm->mmu_lock);
+
+ return nr_sptes;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
+
#ifdef AUDIT
static const char *audit_msg;
@@ -3041,6 +3182,54 @@ static gva_t canonicalize(gva_t gva)
return gva;
}
+
+typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp,
+ u64 *sptep);
+
+static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
+ inspect_spte_fn fn)
+{
+ int i;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ u64 ent = sp->spt[i];
+
+ if (is_shadow_present_pte(ent)) {
+ if (!is_last_spte(ent, sp->role.level)) {
+ struct kvm_mmu_page *child;
+ child = page_header(ent & PT64_BASE_ADDR_MASK);
+ __mmu_spte_walk(kvm, child, fn);
+ } else
+ fn(kvm, sp, &sp->spt[i]);
+ }
+ }
+}
+
+static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+ sp = page_header(root);
+ __mmu_spte_walk(vcpu->kvm, sp, fn);
+ return;
+ }
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ if (root && VALID_PAGE(root)) {
+ root &= PT64_BASE_ADDR_MASK;
+ sp = page_header(root);
+ __mmu_spte_walk(vcpu->kvm, sp, fn);
+ }
+ }
+ return;
+}
+
static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
gva_t va, int level)
{
@@ -3055,20 +3244,19 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
continue;
va = canonicalize(va);
- if (level > 1) {
- if (ent == shadow_notrap_nonpresent_pte)
- printk(KERN_ERR "audit: (%s) nontrapping pte"
- " in nonleaf level: levels %d gva %lx"
- " level %d pte %llx\n", audit_msg,
- vcpu->arch.mmu.root_level, va, level, ent);
- else
- audit_mappings_page(vcpu, ent, va, level - 1);
- } else {
+ if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
+ audit_mappings_page(vcpu, ent, va, level - 1);
+ else {
gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
gfn_t gfn = gpa >> PAGE_SHIFT;
pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
+ if (is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
+ continue;
+ }
+
if (is_shadow_present_pte(ent)
&& (ent & PT64_BASE_ADDR_MASK) != hpa)
printk(KERN_ERR "xx audit error: (%s) levels %d"
@@ -3122,7 +3310,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
while (d) {
for (k = 0; k < RMAP_EXT; ++k)
- if (d->shadow_ptes[k])
+ if (d->sptes[k])
++nmaps;
else
break;
@@ -3133,9 +3321,48 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
return nmaps;
}
-static int count_writable_mappings(struct kvm_vcpu *vcpu)
+void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep)
+{
+ unsigned long *rmapp;
+ struct kvm_mmu_page *rev_sp;
+ gfn_t gfn;
+
+ if (*sptep & PT_WRITABLE_MASK) {
+ rev_sp = page_header(__pa(sptep));
+ gfn = rev_sp->gfns[sptep - rev_sp->spt];
+
+ if (!gfn_to_memslot(kvm, gfn)) {
+ if (!printk_ratelimit())
+ return;
+ printk(KERN_ERR "%s: no memslot for gfn %ld\n",
+ audit_msg, gfn);
+ printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
+ audit_msg, sptep - rev_sp->spt,
+ rev_sp->gfn);
+ dump_stack();
+ return;
+ }
+
+ rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
+ is_large_pte(*sptep));
+ if (!*rmapp) {
+ if (!printk_ratelimit())
+ return;
+ printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
+ audit_msg, *sptep);
+ dump_stack();
+ }
+ }
+
+}
+
+void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
+{
+ mmu_spte_walk(vcpu, inspect_spte_has_rmap);
+}
+
+static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
{
- int nmaps = 0;
struct kvm_mmu_page *sp;
int i;
@@ -3152,20 +3379,16 @@ static int count_writable_mappings(struct kvm_vcpu *vcpu)
continue;
if (!(ent & PT_WRITABLE_MASK))
continue;
- ++nmaps;
+ inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]);
}
}
- return nmaps;
+ return;
}
static void audit_rmap(struct kvm_vcpu *vcpu)
{
- int n_rmap = count_rmaps(vcpu);
- int n_actual = count_writable_mappings(vcpu);
-
- if (n_rmap != n_actual)
- printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
- __func__, audit_msg, n_rmap, n_actual);
+ check_writable_mappings_rmap(vcpu);
+ count_rmaps(vcpu);
}
static void audit_write_protection(struct kvm_vcpu *vcpu)
@@ -3173,20 +3396,28 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
struct kvm_mmu_page *sp;
struct kvm_memory_slot *slot;
unsigned long *rmapp;
+ u64 *spte;
gfn_t gfn;
list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
if (sp->role.direct)
continue;
+ if (sp->unsync)
+ continue;
gfn = unalias_gfn(vcpu->kvm, sp->gfn);
slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
rmapp = &slot->rmap[gfn - slot->base_gfn];
- if (*rmapp)
- printk(KERN_ERR "%s: (%s) shadow page has writable"
- " mappings: gfn %lx role %x\n",
+
+ spte = rmap_next(vcpu->kvm, rmapp, NULL);
+ while (spte) {
+ if (*spte & PT_WRITABLE_MASK)
+ printk(KERN_ERR "%s: (%s) shadow page has "
+ "writable mappings: gfn %lx role %x\n",
__func__, audit_msg, sp->gfn,
sp->role.word);
+ spte = rmap_next(vcpu->kvm, rmapp, spte);
+ }
}
}
@@ -3198,7 +3429,9 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
audit_msg = msg;
audit_rmap(vcpu);
audit_write_protection(vcpu);
- audit_mappings(vcpu);
+ if (strcmp("pre pte write", audit_msg) != 0)
+ audit_mappings(vcpu);
+ audit_writable_sptes_have_rmaps(vcpu);
dbg = olddbg;
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 3494a2fb136..61a1b3884b4 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -37,6 +37,8 @@
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3
+int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
+
static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
{
if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
@@ -75,7 +77,7 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
return vcpu->arch.cr0 & X86_CR0_PG;
}
-static inline int is_present_pte(unsigned long pte)
+static inline int is_present_gpte(unsigned long pte)
{
return pte & PT_PRESENT_MASK;
}
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
new file mode 100644
index 00000000000..3e4a5c6ca2a
--- /dev/null
+++ b/arch/x86/kvm/mmutrace.h
@@ -0,0 +1,220 @@
+#if !defined(_TRACE_KVMMMU_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVMMMU_H
+
+#include <linux/tracepoint.h>
+#include <linux/ftrace_event.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvmmmu
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE mmutrace
+
+#define KVM_MMU_PAGE_FIELDS \
+ __field(__u64, gfn) \
+ __field(__u32, role) \
+ __field(__u32, root_count) \
+ __field(__u32, unsync)
+
+#define KVM_MMU_PAGE_ASSIGN(sp) \
+ __entry->gfn = sp->gfn; \
+ __entry->role = sp->role.word; \
+ __entry->root_count = sp->root_count; \
+ __entry->unsync = sp->unsync;
+
+#define KVM_MMU_PAGE_PRINTK() ({ \
+ const char *ret = p->buffer + p->len; \
+ static const char *access_str[] = { \
+ "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \
+ }; \
+ union kvm_mmu_page_role role; \
+ \
+ role.word = __entry->role; \
+ \
+ trace_seq_printf(p, "sp gfn %llx %u/%u q%u%s %s%s %spge" \
+ " %snxe root %u %s%c", \
+ __entry->gfn, role.level, role.glevels, \
+ role.quadrant, \
+ role.direct ? " direct" : "", \
+ access_str[role.access], \
+ role.invalid ? " invalid" : "", \
+ role.cr4_pge ? "" : "!", \
+ role.nxe ? "" : "!", \
+ __entry->root_count, \
+ __entry->unsync ? "unsync" : "sync", 0); \
+ ret; \
+ })
+
+#define kvm_mmu_trace_pferr_flags \
+ { PFERR_PRESENT_MASK, "P" }, \
+ { PFERR_WRITE_MASK, "W" }, \
+ { PFERR_USER_MASK, "U" }, \
+ { PFERR_RSVD_MASK, "RSVD" }, \
+ { PFERR_FETCH_MASK, "F" }
+
+/*
+ * A pagetable walk has started
+ */
+TRACE_EVENT(
+ kvm_mmu_pagetable_walk,
+ TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault),
+ TP_ARGS(addr, write_fault, user_fault, fetch_fault),
+
+ TP_STRUCT__entry(
+ __field(__u64, addr)
+ __field(__u32, pferr)
+ ),
+
+ TP_fast_assign(
+ __entry->addr = addr;
+ __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2)
+ | (!!fetch_fault << 4);
+ ),
+
+ TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr,
+ __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags))
+);
+
+
+/* We just walked a paging element */
+TRACE_EVENT(
+ kvm_mmu_paging_element,
+ TP_PROTO(u64 pte, int level),
+ TP_ARGS(pte, level),
+
+ TP_STRUCT__entry(
+ __field(__u64, pte)
+ __field(__u32, level)
+ ),
+
+ TP_fast_assign(
+ __entry->pte = pte;
+ __entry->level = level;
+ ),
+
+ TP_printk("pte %llx level %u", __entry->pte, __entry->level)
+);
+
+/* We set a pte accessed bit */
+TRACE_EVENT(
+ kvm_mmu_set_accessed_bit,
+ TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+ TP_ARGS(table_gfn, index, size),
+
+ TP_STRUCT__entry(
+ __field(__u64, gpa)
+ ),
+
+ TP_fast_assign(
+ __entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
+ + index * size;
+ ),
+
+ TP_printk("gpa %llx", __entry->gpa)
+);
+
+/* We set a pte dirty bit */
+TRACE_EVENT(
+ kvm_mmu_set_dirty_bit,
+ TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+ TP_ARGS(table_gfn, index, size),
+
+ TP_STRUCT__entry(
+ __field(__u64, gpa)
+ ),
+
+ TP_fast_assign(
+ __entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
+ + index * size;
+ ),
+
+ TP_printk("gpa %llx", __entry->gpa)
+);
+
+TRACE_EVENT(
+ kvm_mmu_walker_error,
+ TP_PROTO(u32 pferr),
+ TP_ARGS(pferr),
+
+ TP_STRUCT__entry(
+ __field(__u32, pferr)
+ ),
+
+ TP_fast_assign(
+ __entry->pferr = pferr;
+ ),
+
+ TP_printk("pferr %x %s", __entry->pferr,
+ __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags))
+);
+
+TRACE_EVENT(
+ kvm_mmu_get_page,
+ TP_PROTO(struct kvm_mmu_page *sp, bool created),
+ TP_ARGS(sp, created),
+
+ TP_STRUCT__entry(
+ KVM_MMU_PAGE_FIELDS
+ __field(bool, created)
+ ),
+
+ TP_fast_assign(
+ KVM_MMU_PAGE_ASSIGN(sp)
+ __entry->created = created;
+ ),
+
+ TP_printk("%s %s", KVM_MMU_PAGE_PRINTK(),
+ __entry->created ? "new" : "existing")
+);
+
+TRACE_EVENT(
+ kvm_mmu_sync_page,
+ TP_PROTO(struct kvm_mmu_page *sp),
+ TP_ARGS(sp),
+
+ TP_STRUCT__entry(
+ KVM_MMU_PAGE_FIELDS
+ ),
+
+ TP_fast_assign(
+ KVM_MMU_PAGE_ASSIGN(sp)
+ ),
+
+ TP_printk("%s", KVM_MMU_PAGE_PRINTK())
+);
+
+TRACE_EVENT(
+ kvm_mmu_unsync_page,
+ TP_PROTO(struct kvm_mmu_page *sp),
+ TP_ARGS(sp),
+
+ TP_STRUCT__entry(
+ KVM_MMU_PAGE_FIELDS
+ ),
+
+ TP_fast_assign(
+ KVM_MMU_PAGE_ASSIGN(sp)
+ ),
+
+ TP_printk("%s", KVM_MMU_PAGE_PRINTK())
+);
+
+TRACE_EVENT(
+ kvm_mmu_zap_page,
+ TP_PROTO(struct kvm_mmu_page *sp),
+ TP_ARGS(sp),
+
+ TP_STRUCT__entry(
+ KVM_MMU_PAGE_FIELDS
+ ),
+
+ TP_fast_assign(
+ KVM_MMU_PAGE_ASSIGN(sp)
+ ),
+
+ TP_printk("%s", KVM_MMU_PAGE_PRINTK())
+);
+
+#endif /* _TRACE_KVMMMU_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 67785f63539..72558f8ff3f 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -27,7 +27,8 @@
#define guest_walker guest_walker64
#define FNAME(name) paging##64_##name
#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
- #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
+ #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+ #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
#define PT_LEVEL_BITS PT64_LEVEL_BITS
@@ -43,7 +44,8 @@
#define guest_walker guest_walker32
#define FNAME(name) paging##32_##name
#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
- #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
+ #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
+ #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
#define PT_LEVEL_BITS PT32_LEVEL_BITS
@@ -53,8 +55,8 @@
#error Invalid PTTYPE value
#endif
-#define gpte_to_gfn FNAME(gpte_to_gfn)
-#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde)
+#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
+#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL)
/*
* The guest_walker structure emulates the behavior of the hardware page
@@ -71,14 +73,9 @@ struct guest_walker {
u32 error_code;
};
-static gfn_t gpte_to_gfn(pt_element_t gpte)
+static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
{
- return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
-}
-
-static gfn_t gpte_to_gfn_pde(pt_element_t gpte)
-{
- return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
+ return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
}
static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
@@ -125,14 +122,16 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
gpa_t pte_gpa;
int rsvd_fault = 0;
- pgprintk("%s: addr %lx\n", __func__, addr);
+ trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
+ fetch_fault);
walk:
walker->level = vcpu->arch.mmu.root_level;
pte = vcpu->arch.cr3;
#if PTTYPE == 64
if (!is_long_mode(vcpu)) {
- pte = vcpu->arch.pdptrs[(addr >> 30) & 3];
- if (!is_present_pte(pte))
+ pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
+ trace_kvm_mmu_paging_element(pte, walker->level);
+ if (!is_present_gpte(pte))
goto not_present;
--walker->level;
}
@@ -150,12 +149,11 @@ walk:
pte_gpa += index * sizeof(pt_element_t);
walker->table_gfn[walker->level - 1] = table_gfn;
walker->pte_gpa[walker->level - 1] = pte_gpa;
- pgprintk("%s: table_gfn[%d] %lx\n", __func__,
- walker->level - 1, table_gfn);
kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
+ trace_kvm_mmu_paging_element(pte, walker->level);
- if (!is_present_pte(pte))
+ if (!is_present_gpte(pte))
goto not_present;
rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
@@ -175,6 +173,8 @@ walk:
#endif
if (!(pte & PT_ACCESSED_MASK)) {
+ trace_kvm_mmu_set_accessed_bit(table_gfn, index,
+ sizeof(pte));
mark_page_dirty(vcpu->kvm, table_gfn);
if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
index, pte, pte|PT_ACCESSED_MASK))
@@ -186,18 +186,24 @@ walk:
walker->ptes[walker->level - 1] = pte;
- if (walker->level == PT_PAGE_TABLE_LEVEL) {
- walker->gfn = gpte_to_gfn(pte);
- break;
- }
-
- if (walker->level == PT_DIRECTORY_LEVEL
- && (pte & PT_PAGE_SIZE_MASK)
- && (PTTYPE == 64 || is_pse(vcpu))) {
- walker->gfn = gpte_to_gfn_pde(pte);
- walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
- if (PTTYPE == 32 && is_cpuid_PSE36())
+ if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
+ ((walker->level == PT_DIRECTORY_LEVEL) &&
+ (pte & PT_PAGE_SIZE_MASK) &&
+ (PTTYPE == 64 || is_pse(vcpu))) ||
+ ((walker->level == PT_PDPE_LEVEL) &&
+ (pte & PT_PAGE_SIZE_MASK) &&
+ is_long_mode(vcpu))) {
+ int lvl = walker->level;
+
+ walker->gfn = gpte_to_gfn_lvl(pte, lvl);
+ walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl))
+ >> PAGE_SHIFT;
+
+ if (PTTYPE == 32 &&
+ walker->level == PT_DIRECTORY_LEVEL &&
+ is_cpuid_PSE36())
walker->gfn += pse36_gfn_delta(pte);
+
break;
}
@@ -205,9 +211,10 @@ walk:
--walker->level;
}
- if (write_fault && !is_dirty_pte(pte)) {
+ if (write_fault && !is_dirty_gpte(pte)) {
bool ret;
+ trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
mark_page_dirty(vcpu->kvm, table_gfn);
ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
pte|PT_DIRTY_MASK);
@@ -239,6 +246,7 @@ err:
walker->error_code |= PFERR_FETCH_MASK;
if (rsvd_fault)
walker->error_code |= PFERR_RSVD_MASK;
+ trace_kvm_mmu_walker_error(walker->error_code);
return 0;
}
@@ -248,12 +256,11 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
pt_element_t gpte;
unsigned pte_access;
pfn_t pfn;
- int largepage = vcpu->arch.update_pte.largepage;
gpte = *(const pt_element_t *)pte;
if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
- if (!is_present_pte(gpte))
- set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
+ if (!is_present_gpte(gpte))
+ __set_spte(spte, shadow_notrap_nonpresent_pte);
return;
}
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
@@ -266,9 +273,13 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
return;
kvm_get_pfn(pfn);
+ /*
+ * we call mmu_set_spte() with reset_host_protection = true beacuse that
+ * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
+ */
mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
- gpte & PT_DIRTY_MASK, NULL, largepage,
- gpte_to_gfn(gpte), pfn, true);
+ gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL,
+ gpte_to_gfn(gpte), pfn, true, true);
}
/*
@@ -276,7 +287,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
*/
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *gw,
- int user_fault, int write_fault, int largepage,
+ int user_fault, int write_fault, int hlevel,
int *ptwrite, pfn_t pfn)
{
unsigned access = gw->pt_access;
@@ -289,20 +300,19 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
pt_element_t curr_pte;
struct kvm_shadow_walk_iterator iterator;
- if (!is_present_pte(gw->ptes[gw->level - 1]))
+ if (!is_present_gpte(gw->ptes[gw->level - 1]))
return NULL;
for_each_shadow_entry(vcpu, addr, iterator) {
level = iterator.level;
sptep = iterator.sptep;
- if (level == PT_PAGE_TABLE_LEVEL
- || (largepage && level == PT_DIRECTORY_LEVEL)) {
+ if (iterator.level == hlevel) {
mmu_set_spte(vcpu, sptep, access,
gw->pte_access & access,
user_fault, write_fault,
gw->ptes[gw->level-1] & PT_DIRTY_MASK,
- ptwrite, largepage,
- gw->gfn, pfn, false);
+ ptwrite, level,
+ gw->gfn, pfn, false, true);
break;
}
@@ -311,16 +321,19 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
if (is_large_pte(*sptep)) {
rmap_remove(vcpu->kvm, sptep);
- set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
+ __set_spte(sptep, shadow_trap_nonpresent_pte);
kvm_flush_remote_tlbs(vcpu->kvm);
}
- if (level == PT_DIRECTORY_LEVEL
- && gw->level == PT_DIRECTORY_LEVEL) {
+ if (level <= gw->level) {
+ int delta = level - gw->level + 1;
direct = 1;
- if (!is_dirty_pte(gw->ptes[level - 1]))
+ if (!is_dirty_gpte(gw->ptes[level - delta]))
access &= ~ACC_WRITE_MASK;
- table_gfn = gpte_to_gfn(gw->ptes[level - 1]);
+ table_gfn = gpte_to_gfn(gw->ptes[level - delta]);
+ /* advance table_gfn when emulating 1gb pages with 4k */
+ if (delta == 0)
+ table_gfn += PT_INDEX(addr, level);
} else {
direct = 0;
table_gfn = gw->table_gfn[level - 2];
@@ -369,11 +382,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
int user_fault = error_code & PFERR_USER_MASK;
int fetch_fault = error_code & PFERR_FETCH_MASK;
struct guest_walker walker;
- u64 *shadow_pte;
+ u64 *sptep;
int write_pt = 0;
int r;
pfn_t pfn;
- int largepage = 0;
+ int level = PT_PAGE_TABLE_LEVEL;
unsigned long mmu_seq;
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
@@ -399,14 +412,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
return 0;
}
- if (walker.level == PT_DIRECTORY_LEVEL) {
- gfn_t large_gfn;
- large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
- if (is_largepage_backed(vcpu, large_gfn)) {
- walker.gfn = large_gfn;
- largepage = 1;
- }
+ if (walker.level >= PT_DIRECTORY_LEVEL) {
+ level = min(walker.level, mapping_level(vcpu, walker.gfn));
+ walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
}
+
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
@@ -422,11 +432,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
- shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
- largepage, &write_pt, pfn);
-
+ sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
+ level, &write_pt, pfn);
pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
- shadow_pte, *shadow_pte, write_pt);
+ sptep, *sptep, write_pt);
if (!write_pt)
vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
@@ -459,8 +468,9 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
sptep = iterator.sptep;
/* FIXME: properly handle invlpg on large guest pages */
- if (level == PT_PAGE_TABLE_LEVEL ||
- ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) {
+ if (level == PT_PAGE_TABLE_LEVEL ||
+ ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
+ ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
struct kvm_mmu_page *sp = page_header(__pa(sptep));
pte_gpa = (sp->gfn << PAGE_SHIFT);
@@ -472,7 +482,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
--vcpu->kvm->stat.lpages;
need_flush = 1;
}
- set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
+ __set_spte(sptep, shadow_trap_nonpresent_pte);
break;
}
@@ -489,7 +499,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
sizeof(pt_element_t)))
return;
- if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) {
+ if (is_present_gpte(gpte) && (gpte & PT_ACCESSED_MASK)) {
if (mmu_topup_memory_caches(vcpu))
return;
kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte,
@@ -536,7 +546,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
for (j = 0; j < ARRAY_SIZE(pt); ++j)
- if (r || is_present_pte(pt[j]))
+ if (r || is_present_gpte(pt[j]))
sp->spt[i+j] = shadow_trap_nonpresent_pte;
else
sp->spt[i+j] = shadow_notrap_nonpresent_pte;
@@ -552,6 +562,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
int i, offset, nr_present;
+ bool reset_host_protection;
offset = nr_present = 0;
@@ -574,24 +585,31 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
sizeof(pt_element_t)))
return -EINVAL;
- if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) ||
+ if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) ||
!(gpte & PT_ACCESSED_MASK)) {
u64 nonpresent;
rmap_remove(vcpu->kvm, &sp->spt[i]);
- if (is_present_pte(gpte))
+ if (is_present_gpte(gpte))
nonpresent = shadow_trap_nonpresent_pte;
else
nonpresent = shadow_notrap_nonpresent_pte;
- set_shadow_pte(&sp->spt[i], nonpresent);
+ __set_spte(&sp->spt[i], nonpresent);
continue;
}
nr_present++;
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) {
+ pte_access &= ~ACC_WRITE_MASK;
+ reset_host_protection = 0;
+ } else {
+ reset_host_protection = 1;
+ }
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
- is_dirty_pte(gpte), 0, gfn,
- spte_to_pfn(sp->spt[i]), true, false);
+ is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
+ spte_to_pfn(sp->spt[i]), true, false,
+ reset_host_protection);
}
return !nr_present;
@@ -603,9 +621,10 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
#undef PT_BASE_ADDR_MASK
#undef PT_INDEX
#undef PT_LEVEL_MASK
-#undef PT_DIR_BASE_ADDR_MASK
+#undef PT_LVL_ADDR_MASK
+#undef PT_LVL_OFFSET_MASK
#undef PT_LEVEL_BITS
#undef PT_MAX_FULL_LEVELS
#undef gpte_to_gfn
-#undef gpte_to_gfn_pde
+#undef gpte_to_gfn_lvl
#undef CMPXCHG
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b1f658ad2f0..c17404add91 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -15,7 +15,6 @@
*/
#include <linux/kvm_host.h>
-#include "kvm_svm.h"
#include "irq.h"
#include "mmu.h"
#include "kvm_cache_regs.h"
@@ -26,10 +25,12 @@
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/sched.h>
+#include <linux/ftrace_event.h>
#include <asm/desc.h>
#include <asm/virtext.h>
+#include "trace.h"
#define __ex(x) __kvm_handle_fault_on_reboot(x)
@@ -46,6 +47,10 @@ MODULE_LICENSE("GPL");
#define SVM_FEATURE_LBRV (1 << 1)
#define SVM_FEATURE_SVML (1 << 2)
+#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
+#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
+#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */
+
#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
/* Turn on to get debugging output*/
@@ -57,6 +62,58 @@ MODULE_LICENSE("GPL");
#define nsvm_printk(fmt, args...) do {} while(0)
#endif
+static const u32 host_save_user_msrs[] = {
+#ifdef CONFIG_X86_64
+ MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
+ MSR_FS_BASE,
+#endif
+ MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+};
+
+#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
+
+struct kvm_vcpu;
+
+struct nested_state {
+ struct vmcb *hsave;
+ u64 hsave_msr;
+ u64 vmcb;
+
+ /* These are the merged vectors */
+ u32 *msrpm;
+
+ /* gpa pointers to the real vectors */
+ u64 vmcb_msrpm;
+
+ /* cache for intercepts of the guest */
+ u16 intercept_cr_read;
+ u16 intercept_cr_write;
+ u16 intercept_dr_read;
+ u16 intercept_dr_write;
+ u32 intercept_exceptions;
+ u64 intercept;
+
+};
+
+struct vcpu_svm {
+ struct kvm_vcpu vcpu;
+ struct vmcb *vmcb;
+ unsigned long vmcb_pa;
+ struct svm_cpu_data *svm_data;
+ uint64_t asid_generation;
+ uint64_t sysenter_esp;
+ uint64_t sysenter_eip;
+
+ u64 next_rip;
+
+ u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
+ u64 host_gs_base;
+
+ u32 *msrpm;
+
+ struct nested_state nested;
+};
+
/* enable NPT for AMD64 and X86 with PAE */
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
static bool npt_enabled = true;
@@ -67,15 +124,14 @@ static int npt = 1;
module_param(npt, int, S_IRUGO);
-static int nested = 0;
+static int nested = 1;
module_param(nested, int, S_IRUGO);
static void svm_flush_tlb(struct kvm_vcpu *vcpu);
+static void svm_complete_interrupts(struct vcpu_svm *svm);
-static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override);
+static int nested_svm_exit_handled(struct vcpu_svm *svm);
static int nested_svm_vmexit(struct vcpu_svm *svm);
-static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb,
- void *arg2, void *opaque);
static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code);
@@ -86,7 +142,22 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
static inline bool is_nested(struct vcpu_svm *svm)
{
- return svm->nested_vmcb;
+ return svm->nested.vmcb;
+}
+
+static inline void enable_gif(struct vcpu_svm *svm)
+{
+ svm->vcpu.arch.hflags |= HF_GIF_MASK;
+}
+
+static inline void disable_gif(struct vcpu_svm *svm)
+{
+ svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
+}
+
+static inline bool gif_set(struct vcpu_svm *svm)
+{
+ return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
}
static unsigned long iopm_base;
@@ -147,19 +218,6 @@ static inline void invlpga(unsigned long addr, u32 asid)
asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid));
}
-static inline unsigned long kvm_read_cr2(void)
-{
- unsigned long cr2;
-
- asm volatile ("mov %%cr2, %0" : "=r" (cr2));
- return cr2;
-}
-
-static inline void kvm_write_cr2(unsigned long val)
-{
- asm volatile ("mov %0, %%cr2" :: "r" (val));
-}
-
static inline void force_new_asid(struct kvm_vcpu *vcpu)
{
to_svm(vcpu)->asid_generation--;
@@ -263,7 +321,7 @@ static void svm_hardware_enable(void *garbage)
struct svm_cpu_data *svm_data;
uint64_t efer;
- struct desc_ptr gdt_descr;
+ struct descriptor_table gdt_descr;
struct desc_struct *gdt;
int me = raw_smp_processor_id();
@@ -283,8 +341,8 @@ static void svm_hardware_enable(void *garbage)
svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
svm_data->next_asid = svm_data->max_asid + 1;
- asm volatile ("sgdt %0" : "=m"(gdt_descr));
- gdt = (struct desc_struct *)gdt_descr.address;
+ kvm_get_gdt(&gdt_descr);
+ gdt = (struct desc_struct *)gdt_descr.base;
svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
rdmsrl(MSR_EFER, efer);
@@ -367,8 +425,6 @@ static void svm_vcpu_init_msrpm(u32 *msrpm)
#endif
set_msr_interception(msrpm, MSR_K6_STAR, 1, 1);
set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1);
- set_msr_interception(msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
- set_msr_interception(msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
}
static void svm_enable_lbrv(struct vcpu_svm *svm)
@@ -595,8 +651,10 @@ static void init_vmcb(struct vcpu_svm *svm)
}
force_new_asid(&svm->vcpu);
- svm->nested_vmcb = 0;
- svm->vcpu.arch.hflags = HF_GIF_MASK;
+ svm->nested.vmcb = 0;
+ svm->vcpu.arch.hflags = 0;
+
+ enable_gif(svm);
}
static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
@@ -605,7 +663,7 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
init_vmcb(svm);
- if (vcpu->vcpu_id != 0) {
+ if (!kvm_vcpu_is_bsp(vcpu)) {
kvm_rip_write(vcpu, 0);
svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
@@ -656,9 +714,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
hsave_page = alloc_page(GFP_KERNEL);
if (!hsave_page)
goto uninit;
- svm->hsave = page_address(hsave_page);
+ svm->nested.hsave = page_address(hsave_page);
- svm->nested_msrpm = page_address(nested_msrpm_pages);
+ svm->nested.msrpm = page_address(nested_msrpm_pages);
svm->vmcb = page_address(page);
clear_page(svm->vmcb);
@@ -669,7 +727,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
fx_init(&svm->vcpu);
svm->vcpu.fpu_active = 1;
svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
- if (svm->vcpu.vcpu_id == 0)
+ if (kvm_vcpu_is_bsp(&svm->vcpu))
svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
return &svm->vcpu;
@@ -688,8 +746,8 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
__free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
- __free_page(virt_to_page(svm->hsave));
- __free_pages(virt_to_page(svm->nested_msrpm), MSRPM_ALLOC_ORDER);
+ __free_page(virt_to_page(svm->nested.hsave));
+ __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
kvm_vcpu_uninit(vcpu);
kmem_cache_free(kvm_vcpu_cache, svm);
}
@@ -709,6 +767,8 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
rdtscll(tsc_this);
delta = vcpu->arch.host_tsc - tsc_this;
svm->vmcb->control.tsc_offset += delta;
+ if (is_nested(svm))
+ svm->nested.hsave->control.tsc_offset += delta;
vcpu->cpu = cpu;
kvm_migrate_timers(vcpu);
svm->asid_generation = 0;
@@ -740,6 +800,18 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
to_svm(vcpu)->vmcb->save.rflags = rflags;
}
+static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+{
+ switch (reg) {
+ case VCPU_EXREG_PDPTR:
+ BUG_ON(!npt_enabled);
+ load_pdptrs(vcpu, vcpu->arch.cr3);
+ break;
+ default:
+ BUG();
+ }
+}
+
static void svm_set_vintr(struct vcpu_svm *svm)
{
svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR;
@@ -1061,7 +1133,6 @@ static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
val = 0;
}
- KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
return val;
}
@@ -1070,8 +1141,6 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
{
struct vcpu_svm *svm = to_svm(vcpu);
- KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)value, handler);
-
*exception = 0;
switch (dr) {
@@ -1119,25 +1188,9 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
fault_address = svm->vmcb->control.exit_info_2;
error_code = svm->vmcb->control.exit_info_1;
- if (!npt_enabled)
- KVMTRACE_3D(PAGE_FAULT, &svm->vcpu, error_code,
- (u32)fault_address, (u32)(fault_address >> 32),
- handler);
- else
- KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code,
- (u32)fault_address, (u32)(fault_address >> 32),
- handler);
- /*
- * FIXME: Tis shouldn't be necessary here, but there is a flush
- * missing in the MMU code. Until we find this bug, flush the
- * complete TLB here on an NPF
- */
- if (npt_enabled)
- svm_flush_tlb(&svm->vcpu);
- else {
- if (kvm_event_needs_reinjection(&svm->vcpu))
- kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
- }
+ trace_kvm_page_fault(fault_address, error_code);
+ if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
+ kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
}
@@ -1253,14 +1306,12 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
- KVMTRACE_0D(NMI, &svm->vcpu, handler);
return 1;
}
static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
++svm->vcpu.stat.irq_exits;
- KVMTRACE_0D(INTR, &svm->vcpu, handler);
return 1;
}
@@ -1303,44 +1354,39 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm)
static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code)
{
- if (is_nested(svm)) {
- svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
- svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = error_code;
- svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
- if (nested_svm_exit_handled(svm, false)) {
- nsvm_printk("VMexit -> EXCP 0x%x\n", nr);
-
- nested_svm_vmexit(svm);
- return 1;
- }
- }
+ if (!is_nested(svm))
+ return 0;
- return 0;
+ svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = error_code;
+ svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
+
+ return nested_svm_exit_handled(svm);
}
static inline int nested_svm_intr(struct vcpu_svm *svm)
{
- if (is_nested(svm)) {
- if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
- return 0;
+ if (!is_nested(svm))
+ return 0;
- if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
- return 0;
+ if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
+ return 0;
- svm->vmcb->control.exit_code = SVM_EXIT_INTR;
+ if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
+ return 0;
- if (nested_svm_exit_handled(svm, false)) {
- nsvm_printk("VMexit -> INTR\n");
- nested_svm_vmexit(svm);
- return 1;
- }
+ svm->vmcb->control.exit_code = SVM_EXIT_INTR;
+
+ if (nested_svm_exit_handled(svm)) {
+ nsvm_printk("VMexit -> INTR\n");
+ return 1;
}
return 0;
}
-static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa)
+static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx)
{
struct page *page;
@@ -1348,236 +1394,246 @@ static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa)
page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
up_read(&current->mm->mmap_sem);
- if (is_error_page(page)) {
- printk(KERN_INFO "%s: could not find page at 0x%llx\n",
- __func__, gpa);
- kvm_release_page_clean(page);
- kvm_inject_gp(&svm->vcpu, 0);
- return NULL;
- }
- return page;
+ if (is_error_page(page))
+ goto error;
+
+ return kmap_atomic(page, idx);
+
+error:
+ kvm_release_page_clean(page);
+ kvm_inject_gp(&svm->vcpu, 0);
+
+ return NULL;
}
-static int nested_svm_do(struct vcpu_svm *svm,
- u64 arg1_gpa, u64 arg2_gpa, void *opaque,
- int (*handler)(struct vcpu_svm *svm,
- void *arg1,
- void *arg2,
- void *opaque))
+static void nested_svm_unmap(void *addr, enum km_type idx)
{
- struct page *arg1_page;
- struct page *arg2_page = NULL;
- void *arg1;
- void *arg2 = NULL;
- int retval;
+ struct page *page;
- arg1_page = nested_svm_get_page(svm, arg1_gpa);
- if(arg1_page == NULL)
- return 1;
+ if (!addr)
+ return;
- if (arg2_gpa) {
- arg2_page = nested_svm_get_page(svm, arg2_gpa);
- if(arg2_page == NULL) {
- kvm_release_page_clean(arg1_page);
- return 1;
- }
- }
+ page = kmap_atomic_to_page(addr);
- arg1 = kmap_atomic(arg1_page, KM_USER0);
- if (arg2_gpa)
- arg2 = kmap_atomic(arg2_page, KM_USER1);
+ kunmap_atomic(addr, idx);
+ kvm_release_page_dirty(page);
+}
- retval = handler(svm, arg1, arg2, opaque);
+static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
+{
+ u32 param = svm->vmcb->control.exit_info_1 & 1;
+ u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+ bool ret = false;
+ u32 t0, t1;
+ u8 *msrpm;
+
+ if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+ return false;
- kunmap_atomic(arg1, KM_USER0);
- if (arg2_gpa)
- kunmap_atomic(arg2, KM_USER1);
+ msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0);
- kvm_release_page_dirty(arg1_page);
- if (arg2_gpa)
- kvm_release_page_dirty(arg2_page);
+ if (!msrpm)
+ goto out;
- return retval;
+ switch (msr) {
+ case 0 ... 0x1fff:
+ t0 = (msr * 2) % 8;
+ t1 = msr / 8;
+ break;
+ case 0xc0000000 ... 0xc0001fff:
+ t0 = (8192 + msr - 0xc0000000) * 2;
+ t1 = (t0 / 8);
+ t0 %= 8;
+ break;
+ case 0xc0010000 ... 0xc0011fff:
+ t0 = (16384 + msr - 0xc0010000) * 2;
+ t1 = (t0 / 8);
+ t0 %= 8;
+ break;
+ default:
+ ret = true;
+ goto out;
+ }
+
+ ret = msrpm[t1] & ((1 << param) << t0);
+
+out:
+ nested_svm_unmap(msrpm, KM_USER0);
+
+ return ret;
}
-static int nested_svm_exit_handled_real(struct vcpu_svm *svm,
- void *arg1,
- void *arg2,
- void *opaque)
+static int nested_svm_exit_special(struct vcpu_svm *svm)
{
- struct vmcb *nested_vmcb = (struct vmcb *)arg1;
- bool kvm_overrides = *(bool *)opaque;
u32 exit_code = svm->vmcb->control.exit_code;
- if (kvm_overrides) {
- switch (exit_code) {
- case SVM_EXIT_INTR:
- case SVM_EXIT_NMI:
- return 0;
+ switch (exit_code) {
+ case SVM_EXIT_INTR:
+ case SVM_EXIT_NMI:
+ return NESTED_EXIT_HOST;
/* For now we are always handling NPFs when using them */
- case SVM_EXIT_NPF:
- if (npt_enabled)
- return 0;
- break;
- /* When we're shadowing, trap PFs */
- case SVM_EXIT_EXCP_BASE + PF_VECTOR:
- if (!npt_enabled)
- return 0;
- break;
- default:
- break;
- }
+ case SVM_EXIT_NPF:
+ if (npt_enabled)
+ return NESTED_EXIT_HOST;
+ break;
+ /* When we're shadowing, trap PFs */
+ case SVM_EXIT_EXCP_BASE + PF_VECTOR:
+ if (!npt_enabled)
+ return NESTED_EXIT_HOST;
+ break;
+ default:
+ break;
}
+ return NESTED_EXIT_CONTINUE;
+}
+
+/*
+ * If this function returns true, this #vmexit was already handled
+ */
+static int nested_svm_exit_handled(struct vcpu_svm *svm)
+{
+ u32 exit_code = svm->vmcb->control.exit_code;
+ int vmexit = NESTED_EXIT_HOST;
+
switch (exit_code) {
+ case SVM_EXIT_MSR:
+ vmexit = nested_svm_exit_handled_msr(svm);
+ break;
case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
- if (nested_vmcb->control.intercept_cr_read & cr_bits)
- return 1;
+ if (svm->nested.intercept_cr_read & cr_bits)
+ vmexit = NESTED_EXIT_DONE;
break;
}
case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: {
u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0);
- if (nested_vmcb->control.intercept_cr_write & cr_bits)
- return 1;
+ if (svm->nested.intercept_cr_write & cr_bits)
+ vmexit = NESTED_EXIT_DONE;
break;
}
case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0);
- if (nested_vmcb->control.intercept_dr_read & dr_bits)
- return 1;
+ if (svm->nested.intercept_dr_read & dr_bits)
+ vmexit = NESTED_EXIT_DONE;
break;
}
case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: {
u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0);
- if (nested_vmcb->control.intercept_dr_write & dr_bits)
- return 1;
+ if (svm->nested.intercept_dr_write & dr_bits)
+ vmexit = NESTED_EXIT_DONE;
break;
}
case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
- if (nested_vmcb->control.intercept_exceptions & excp_bits)
- return 1;
+ if (svm->nested.intercept_exceptions & excp_bits)
+ vmexit = NESTED_EXIT_DONE;
break;
}
default: {
u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
nsvm_printk("exit code: 0x%x\n", exit_code);
- if (nested_vmcb->control.intercept & exit_bits)
- return 1;
+ if (svm->nested.intercept & exit_bits)
+ vmexit = NESTED_EXIT_DONE;
}
}
- return 0;
-}
-
-static int nested_svm_exit_handled_msr(struct vcpu_svm *svm,
- void *arg1, void *arg2,
- void *opaque)
-{
- struct vmcb *nested_vmcb = (struct vmcb *)arg1;
- u8 *msrpm = (u8 *)arg2;
- u32 t0, t1;
- u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
- u32 param = svm->vmcb->control.exit_info_1 & 1;
-
- if (!(nested_vmcb->control.intercept & (1ULL << INTERCEPT_MSR_PROT)))
- return 0;
-
- switch(msr) {
- case 0 ... 0x1fff:
- t0 = (msr * 2) % 8;
- t1 = msr / 8;
- break;
- case 0xc0000000 ... 0xc0001fff:
- t0 = (8192 + msr - 0xc0000000) * 2;
- t1 = (t0 / 8);
- t0 %= 8;
- break;
- case 0xc0010000 ... 0xc0011fff:
- t0 = (16384 + msr - 0xc0010000) * 2;
- t1 = (t0 / 8);
- t0 %= 8;
- break;
- default:
- return 1;
- break;
+ if (vmexit == NESTED_EXIT_DONE) {
+ nsvm_printk("#VMEXIT reason=%04x\n", exit_code);
+ nested_svm_vmexit(svm);
}
- if (msrpm[t1] & ((1 << param) << t0))
- return 1;
- return 0;
+ return vmexit;
+}
+
+static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
+{
+ struct vmcb_control_area *dst = &dst_vmcb->control;
+ struct vmcb_control_area *from = &from_vmcb->control;
+
+ dst->intercept_cr_read = from->intercept_cr_read;
+ dst->intercept_cr_write = from->intercept_cr_write;
+ dst->intercept_dr_read = from->intercept_dr_read;
+ dst->intercept_dr_write = from->intercept_dr_write;
+ dst->intercept_exceptions = from->intercept_exceptions;
+ dst->intercept = from->intercept;
+ dst->iopm_base_pa = from->iopm_base_pa;
+ dst->msrpm_base_pa = from->msrpm_base_pa;
+ dst->tsc_offset = from->tsc_offset;
+ dst->asid = from->asid;
+ dst->tlb_ctl = from->tlb_ctl;
+ dst->int_ctl = from->int_ctl;
+ dst->int_vector = from->int_vector;
+ dst->int_state = from->int_state;
+ dst->exit_code = from->exit_code;
+ dst->exit_code_hi = from->exit_code_hi;
+ dst->exit_info_1 = from->exit_info_1;
+ dst->exit_info_2 = from->exit_info_2;
+ dst->exit_int_info = from->exit_int_info;
+ dst->exit_int_info_err = from->exit_int_info_err;
+ dst->nested_ctl = from->nested_ctl;
+ dst->event_inj = from->event_inj;
+ dst->event_inj_err = from->event_inj_err;
+ dst->nested_cr3 = from->nested_cr3;
+ dst->lbr_ctl = from->lbr_ctl;
}
-static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override)
+static int nested_svm_vmexit(struct vcpu_svm *svm)
{
- bool k = kvm_override;
-
- switch (svm->vmcb->control.exit_code) {
- case SVM_EXIT_MSR:
- return nested_svm_do(svm, svm->nested_vmcb,
- svm->nested_vmcb_msrpm, NULL,
- nested_svm_exit_handled_msr);
- default: break;
- }
+ struct vmcb *nested_vmcb;
+ struct vmcb *hsave = svm->nested.hsave;
+ struct vmcb *vmcb = svm->vmcb;
- return nested_svm_do(svm, svm->nested_vmcb, 0, &k,
- nested_svm_exit_handled_real);
-}
-
-static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1,
- void *arg2, void *opaque)
-{
- struct vmcb *nested_vmcb = (struct vmcb *)arg1;
- struct vmcb *hsave = svm->hsave;
- u64 nested_save[] = { nested_vmcb->save.cr0,
- nested_vmcb->save.cr3,
- nested_vmcb->save.cr4,
- nested_vmcb->save.efer,
- nested_vmcb->control.intercept_cr_read,
- nested_vmcb->control.intercept_cr_write,
- nested_vmcb->control.intercept_dr_read,
- nested_vmcb->control.intercept_dr_write,
- nested_vmcb->control.intercept_exceptions,
- nested_vmcb->control.intercept,
- nested_vmcb->control.msrpm_base_pa,
- nested_vmcb->control.iopm_base_pa,
- nested_vmcb->control.tsc_offset };
+ nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0);
+ if (!nested_vmcb)
+ return 1;
/* Give the current vmcb to the guest */
- memcpy(nested_vmcb, svm->vmcb, sizeof(struct vmcb));
- nested_vmcb->save.cr0 = nested_save[0];
- if (!npt_enabled)
- nested_vmcb->save.cr3 = nested_save[1];
- nested_vmcb->save.cr4 = nested_save[2];
- nested_vmcb->save.efer = nested_save[3];
- nested_vmcb->control.intercept_cr_read = nested_save[4];
- nested_vmcb->control.intercept_cr_write = nested_save[5];
- nested_vmcb->control.intercept_dr_read = nested_save[6];
- nested_vmcb->control.intercept_dr_write = nested_save[7];
- nested_vmcb->control.intercept_exceptions = nested_save[8];
- nested_vmcb->control.intercept = nested_save[9];
- nested_vmcb->control.msrpm_base_pa = nested_save[10];
- nested_vmcb->control.iopm_base_pa = nested_save[11];
- nested_vmcb->control.tsc_offset = nested_save[12];
+ disable_gif(svm);
+
+ nested_vmcb->save.es = vmcb->save.es;
+ nested_vmcb->save.cs = vmcb->save.cs;
+ nested_vmcb->save.ss = vmcb->save.ss;
+ nested_vmcb->save.ds = vmcb->save.ds;
+ nested_vmcb->save.gdtr = vmcb->save.gdtr;
+ nested_vmcb->save.idtr = vmcb->save.idtr;
+ if (npt_enabled)
+ nested_vmcb->save.cr3 = vmcb->save.cr3;
+ nested_vmcb->save.cr2 = vmcb->save.cr2;
+ nested_vmcb->save.rflags = vmcb->save.rflags;
+ nested_vmcb->save.rip = vmcb->save.rip;
+ nested_vmcb->save.rsp = vmcb->save.rsp;
+ nested_vmcb->save.rax = vmcb->save.rax;
+ nested_vmcb->save.dr7 = vmcb->save.dr7;
+ nested_vmcb->save.dr6 = vmcb->save.dr6;
+ nested_vmcb->save.cpl = vmcb->save.cpl;
+
+ nested_vmcb->control.int_ctl = vmcb->control.int_ctl;
+ nested_vmcb->control.int_vector = vmcb->control.int_vector;
+ nested_vmcb->control.int_state = vmcb->control.int_state;
+ nested_vmcb->control.exit_code = vmcb->control.exit_code;
+ nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi;
+ nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1;
+ nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
+ nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
+ nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
+ nested_vmcb->control.tlb_ctl = 0;
+ nested_vmcb->control.event_inj = 0;
+ nested_vmcb->control.event_inj_err = 0;
/* We always set V_INTR_MASKING and remember the old value in hflags */
if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
- if ((nested_vmcb->control.int_ctl & V_IRQ_MASK) &&
- (nested_vmcb->control.int_vector)) {
- nsvm_printk("WARNING: IRQ 0x%x still enabled on #VMEXIT\n",
- nested_vmcb->control.int_vector);
- }
-
/* Restore the original control entries */
- svm->vmcb->control = hsave->control;
+ copy_vmcb_control_area(vmcb, hsave);
/* Kill any pending exceptions */
if (svm->vcpu.arch.exception.pending == true)
nsvm_printk("WARNING: Pending Exception\n");
- svm->vcpu.arch.exception.pending = false;
+
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
/* Restore selected save entries */
svm->vmcb->save.es = hsave->save.es;
@@ -1603,19 +1659,10 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1,
svm->vmcb->save.cpl = 0;
svm->vmcb->control.exit_int_info = 0;
- svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
/* Exit nested SVM mode */
- svm->nested_vmcb = 0;
-
- return 0;
-}
+ svm->nested.vmcb = 0;
-static int nested_svm_vmexit(struct vcpu_svm *svm)
-{
- nsvm_printk("VMexit\n");
- if (nested_svm_do(svm, svm->nested_vmcb, 0,
- NULL, nested_svm_vmexit_real))
- return 1;
+ nested_svm_unmap(nested_vmcb, KM_USER0);
kvm_mmu_reset_context(&svm->vcpu);
kvm_mmu_load(&svm->vcpu);
@@ -1623,38 +1670,63 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
return 0;
}
-static int nested_svm_vmrun_msrpm(struct vcpu_svm *svm, void *arg1,
- void *arg2, void *opaque)
+static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
{
+ u32 *nested_msrpm;
int i;
- u32 *nested_msrpm = (u32*)arg1;
+
+ nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0);
+ if (!nested_msrpm)
+ return false;
+
for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++)
- svm->nested_msrpm[i] = svm->msrpm[i] | nested_msrpm[i];
- svm->vmcb->control.msrpm_base_pa = __pa(svm->nested_msrpm);
+ svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i];
- return 0;
+ svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
+
+ nested_svm_unmap(nested_msrpm, KM_USER0);
+
+ return true;
}
-static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1,
- void *arg2, void *opaque)
+static bool nested_svm_vmrun(struct vcpu_svm *svm)
{
- struct vmcb *nested_vmcb = (struct vmcb *)arg1;
- struct vmcb *hsave = svm->hsave;
+ struct vmcb *nested_vmcb;
+ struct vmcb *hsave = svm->nested.hsave;
+ struct vmcb *vmcb = svm->vmcb;
+
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
+ if (!nested_vmcb)
+ return false;
/* nested_vmcb is our indicator if nested SVM is activated */
- svm->nested_vmcb = svm->vmcb->save.rax;
+ svm->nested.vmcb = svm->vmcb->save.rax;
/* Clear internal status */
- svm->vcpu.arch.exception.pending = false;
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
/* Save the old vmcb, so we don't need to pick what we save, but
can restore everything when a VMEXIT occurs */
- memcpy(hsave, svm->vmcb, sizeof(struct vmcb));
- /* We need to remember the original CR3 in the SPT case */
- if (!npt_enabled)
- hsave->save.cr3 = svm->vcpu.arch.cr3;
- hsave->save.cr4 = svm->vcpu.arch.cr4;
- hsave->save.rip = svm->next_rip;
+ hsave->save.es = vmcb->save.es;
+ hsave->save.cs = vmcb->save.cs;
+ hsave->save.ss = vmcb->save.ss;
+ hsave->save.ds = vmcb->save.ds;
+ hsave->save.gdtr = vmcb->save.gdtr;
+ hsave->save.idtr = vmcb->save.idtr;
+ hsave->save.efer = svm->vcpu.arch.shadow_efer;
+ hsave->save.cr0 = svm->vcpu.arch.cr0;
+ hsave->save.cr4 = svm->vcpu.arch.cr4;
+ hsave->save.rflags = vmcb->save.rflags;
+ hsave->save.rip = svm->next_rip;
+ hsave->save.rsp = vmcb->save.rsp;
+ hsave->save.rax = vmcb->save.rax;
+ if (npt_enabled)
+ hsave->save.cr3 = vmcb->save.cr3;
+ else
+ hsave->save.cr3 = svm->vcpu.arch.cr3;
+
+ copy_vmcb_control_area(hsave, vmcb);
if (svm->vmcb->save.rflags & X86_EFLAGS_IF)
svm->vcpu.arch.hflags |= HF_HIF_MASK;
@@ -1679,7 +1751,7 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1,
kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
kvm_mmu_reset_context(&svm->vcpu);
}
- svm->vmcb->save.cr2 = nested_vmcb->save.cr2;
+ svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
@@ -1706,7 +1778,15 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1,
svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
- svm->nested_vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
+ svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
+
+ /* cache intercepts */
+ svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read;
+ svm->nested.intercept_cr_write = nested_vmcb->control.intercept_cr_write;
+ svm->nested.intercept_dr_read = nested_vmcb->control.intercept_dr_read;
+ svm->nested.intercept_dr_write = nested_vmcb->control.intercept_dr_write;
+ svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
+ svm->nested.intercept = nested_vmcb->control.intercept;
force_new_asid(&svm->vcpu);
svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info;
@@ -1734,12 +1814,14 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1,
svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
- svm->vcpu.arch.hflags |= HF_GIF_MASK;
+ nested_svm_unmap(nested_vmcb, KM_USER0);
- return 0;
+ enable_gif(svm);
+
+ return true;
}
-static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
+static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
{
to_vmcb->save.fs = from_vmcb->save.fs;
to_vmcb->save.gs = from_vmcb->save.gs;
@@ -1753,44 +1835,44 @@ static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
-
- return 1;
-}
-
-static int nested_svm_vmload(struct vcpu_svm *svm, void *nested_vmcb,
- void *arg2, void *opaque)
-{
- return nested_svm_vmloadsave((struct vmcb *)nested_vmcb, svm->vmcb);
-}
-
-static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb,
- void *arg2, void *opaque)
-{
- return nested_svm_vmloadsave(svm->vmcb, (struct vmcb *)nested_vmcb);
}
static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
+ struct vmcb *nested_vmcb;
+
if (nested_svm_check_permissions(svm))
return 1;
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
- nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmload);
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
+ if (!nested_vmcb)
+ return 1;
+
+ nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
+ nested_svm_unmap(nested_vmcb, KM_USER0);
return 1;
}
static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
+ struct vmcb *nested_vmcb;
+
if (nested_svm_check_permissions(svm))
return 1;
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
- nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmsave);
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
+ if (!nested_vmcb)
+ return 1;
+
+ nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
+ nested_svm_unmap(nested_vmcb, KM_USER0);
return 1;
}
@@ -1798,19 +1880,29 @@ static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
nsvm_printk("VMrun\n");
+
if (nested_svm_check_permissions(svm))
return 1;
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
- if (nested_svm_do(svm, svm->vmcb->save.rax, 0,
- NULL, nested_svm_vmrun))
+ if (!nested_svm_vmrun(svm))
return 1;
- if (nested_svm_do(svm, svm->nested_vmcb_msrpm, 0,
- NULL, nested_svm_vmrun_msrpm))
- return 1;
+ if (!nested_svm_vmrun_msrpm(svm))
+ goto failed;
+
+ return 1;
+
+failed:
+
+ svm->vmcb->control.exit_code = SVM_EXIT_ERR;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
+
+ nested_svm_vmexit(svm);
return 1;
}
@@ -1823,7 +1915,7 @@ static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
- svm->vcpu.arch.hflags |= HF_GIF_MASK;
+ enable_gif(svm);
return 1;
}
@@ -1836,7 +1928,7 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
- svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
+ disable_gif(svm);
/* After a CLGI no interrupts should come */
svm_clear_vintr(svm);
@@ -1845,6 +1937,19 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
return 1;
}
+static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ nsvm_printk("INVLPGA\n");
+
+ /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
+ kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
+
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ skip_emulated_instruction(&svm->vcpu);
+ return 1;
+}
+
static int invalid_op_interception(struct vcpu_svm *svm,
struct kvm_run *kvm_run)
{
@@ -1953,11 +2058,15 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
struct vcpu_svm *svm = to_svm(vcpu);
switch (ecx) {
- case MSR_IA32_TIME_STAMP_COUNTER: {
- u64 tsc;
+ case MSR_IA32_TSC: {
+ u64 tsc_offset;
+
+ if (is_nested(svm))
+ tsc_offset = svm->nested.hsave->control.tsc_offset;
+ else
+ tsc_offset = svm->vmcb->control.tsc_offset;
- rdtscll(tsc);
- *data = svm->vmcb->control.tsc_offset + tsc;
+ *data = tsc_offset + native_read_tsc();
break;
}
case MSR_K6_STAR:
@@ -1981,10 +2090,10 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
*data = svm->vmcb->save.sysenter_cs;
break;
case MSR_IA32_SYSENTER_EIP:
- *data = svm->vmcb->save.sysenter_eip;
+ *data = svm->sysenter_eip;
break;
case MSR_IA32_SYSENTER_ESP:
- *data = svm->vmcb->save.sysenter_esp;
+ *data = svm->sysenter_esp;
break;
/* Nobody will change the following 5 values in the VMCB so
we can safely return them on rdmsr. They will always be 0
@@ -2005,7 +2114,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
*data = svm->vmcb->save.last_excp_to;
break;
case MSR_VM_HSAVE_PA:
- *data = svm->hsave_msr;
+ *data = svm->nested.hsave_msr;
break;
case MSR_VM_CR:
*data = 0;
@@ -2027,8 +2136,7 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
if (svm_get_msr(&svm->vcpu, ecx, &data))
kvm_inject_gp(&svm->vcpu, 0);
else {
- KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data,
- (u32)(data >> 32), handler);
+ trace_kvm_msr_read(ecx, data);
svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
@@ -2043,11 +2151,18 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
struct vcpu_svm *svm = to_svm(vcpu);
switch (ecx) {
- case MSR_IA32_TIME_STAMP_COUNTER: {
- u64 tsc;
+ case MSR_IA32_TSC: {
+ u64 tsc_offset = data - native_read_tsc();
+ u64 g_tsc_offset = 0;
+
+ if (is_nested(svm)) {
+ g_tsc_offset = svm->vmcb->control.tsc_offset -
+ svm->nested.hsave->control.tsc_offset;
+ svm->nested.hsave->control.tsc_offset = tsc_offset;
+ }
+
+ svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset;
- rdtscll(tsc);
- svm->vmcb->control.tsc_offset = data - tsc;
break;
}
case MSR_K6_STAR:
@@ -2071,9 +2186,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
svm->vmcb->save.sysenter_cs = data;
break;
case MSR_IA32_SYSENTER_EIP:
+ svm->sysenter_eip = data;
svm->vmcb->save.sysenter_eip = data;
break;
case MSR_IA32_SYSENTER_ESP:
+ svm->sysenter_esp = data;
svm->vmcb->save.sysenter_esp = data;
break;
case MSR_IA32_DEBUGCTLMSR:
@@ -2091,24 +2208,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
else
svm_disable_lbrv(svm);
break;
- case MSR_K7_EVNTSEL0:
- case MSR_K7_EVNTSEL1:
- case MSR_K7_EVNTSEL2:
- case MSR_K7_EVNTSEL3:
- case MSR_K7_PERFCTR0:
- case MSR_K7_PERFCTR1:
- case MSR_K7_PERFCTR2:
- case MSR_K7_PERFCTR3:
- /*
- * Just discard all writes to the performance counters; this
- * should keep both older linux and windows 64-bit guests
- * happy
- */
- pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", ecx, data);
-
- break;
case MSR_VM_HSAVE_PA:
- svm->hsave_msr = data;
+ svm->nested.hsave_msr = data;
+ break;
+ case MSR_VM_CR:
+ case MSR_VM_IGNNE:
+ pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
break;
default:
return kvm_set_msr_common(vcpu, ecx, data);
@@ -2122,8 +2227,7 @@ static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
| ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
- KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32),
- handler);
+ trace_kvm_msr_write(ecx, data);
svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
if (svm_set_msr(&svm->vcpu, ecx, data))
@@ -2144,8 +2248,6 @@ static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
static int interrupt_window_interception(struct vcpu_svm *svm,
struct kvm_run *kvm_run)
{
- KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler);
-
svm_clear_vintr(svm);
svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
/*
@@ -2201,7 +2303,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
[SVM_EXIT_INVD] = emulate_on_interception,
[SVM_EXIT_HLT] = halt_interception,
[SVM_EXIT_INVLPG] = invlpg_interception,
- [SVM_EXIT_INVLPGA] = invalid_op_interception,
+ [SVM_EXIT_INVLPGA] = invlpga_interception,
[SVM_EXIT_IOIO] = io_interception,
[SVM_EXIT_MSR] = msr_interception,
[SVM_EXIT_TASK_SWITCH] = task_switch_interception,
@@ -2224,20 +2326,26 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
u32 exit_code = svm->vmcb->control.exit_code;
- KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip,
- (u32)((u64)svm->vmcb->save.rip >> 32), entryexit);
+ trace_kvm_exit(exit_code, svm->vmcb->save.rip);
if (is_nested(svm)) {
+ int vmexit;
+
nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n",
exit_code, svm->vmcb->control.exit_info_1,
svm->vmcb->control.exit_info_2, svm->vmcb->save.rip);
- if (nested_svm_exit_handled(svm, true)) {
- nested_svm_vmexit(svm);
- nsvm_printk("-> #VMEXIT\n");
+
+ vmexit = nested_svm_exit_special(svm);
+
+ if (vmexit == NESTED_EXIT_CONTINUE)
+ vmexit = nested_svm_exit_handled(svm);
+
+ if (vmexit == NESTED_EXIT_DONE)
return 1;
- }
}
+ svm_complete_interrupts(svm);
+
if (npt_enabled) {
int mmu_reload = 0;
if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
@@ -2246,12 +2354,6 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
}
vcpu->arch.cr0 = svm->vmcb->save.cr0;
vcpu->arch.cr3 = svm->vmcb->save.cr3;
- if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
- if (!load_pdptrs(vcpu, vcpu->arch.cr3)) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
- }
if (mmu_reload) {
kvm_mmu_reset_context(vcpu);
kvm_mmu_load(vcpu);
@@ -2319,7 +2421,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
{
struct vmcb_control_area *control;
- KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler);
+ trace_kvm_inj_virq(irq);
++svm->vcpu.stat.irq_injections;
control = &svm->vmcb->control;
@@ -2329,21 +2431,14 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
}
-static void svm_queue_irq(struct kvm_vcpu *vcpu, unsigned nr)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->control.event_inj = nr |
- SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
-}
-
static void svm_set_irq(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- nested_svm_intr(svm);
+ BUG_ON(!(gif_set(svm)));
- svm_queue_irq(vcpu, vcpu->arch.interrupt.nr);
+ svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
+ SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
@@ -2371,13 +2466,25 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
struct vmcb *vmcb = svm->vmcb;
return (vmcb->save.rflags & X86_EFLAGS_IF) &&
!(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
- (svm->vcpu.arch.hflags & HF_GIF_MASK);
+ gif_set(svm) &&
+ !(is_nested(svm) && (svm->vcpu.arch.hflags & HF_VINTR_MASK));
}
static void enable_irq_window(struct kvm_vcpu *vcpu)
{
- svm_set_vintr(to_svm(vcpu));
- svm_inject_irq(to_svm(vcpu), 0x0);
+ struct vcpu_svm *svm = to_svm(vcpu);
+ nsvm_printk("Trying to open IRQ window\n");
+
+ nested_svm_intr(svm);
+
+ /* In case GIF=0 we can't rely on the CPU to tell us when
+ * GIF becomes 1, because that's a separate STGI/VMRUN intercept.
+ * The next time we get that intercept, this function will be
+ * called again though and we'll get the vintr intercept. */
+ if (gif_set(svm)) {
+ svm_set_vintr(svm);
+ svm_inject_irq(svm, 0x0);
+ }
}
static void enable_nmi_window(struct kvm_vcpu *vcpu)
@@ -2456,6 +2563,8 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
case SVM_EXITINTINFO_TYPE_EXEPT:
/* In case of software exception do not reinject an exception
vector, but re-execute and instruction instead */
+ if (is_nested(svm))
+ break;
if (kvm_exception_is_soft(vector))
break;
if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
@@ -2498,9 +2607,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
fs_selector = kvm_read_fs();
gs_selector = kvm_read_gs();
ldt_selector = kvm_read_ldt();
- svm->host_cr2 = kvm_read_cr2();
- if (!is_nested(svm))
- svm->vmcb->save.cr2 = vcpu->arch.cr2;
+ svm->vmcb->save.cr2 = vcpu->arch.cr2;
/* required for live migration with NPT */
if (npt_enabled)
svm->vmcb->save.cr3 = vcpu->arch.cr3;
@@ -2585,8 +2692,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
- kvm_write_cr2(svm->host_cr2);
-
kvm_load_fs(fs_selector);
kvm_load_gs(gs_selector);
kvm_load_ldt(ldt_selector);
@@ -2602,7 +2707,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
svm->next_rip = 0;
- svm_complete_interrupts(svm);
+ if (npt_enabled) {
+ vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
+ vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
+ }
}
#undef R
@@ -2673,6 +2781,64 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
return 0;
}
+static const struct trace_print_flags svm_exit_reasons_str[] = {
+ { SVM_EXIT_READ_CR0, "read_cr0" },
+ { SVM_EXIT_READ_CR3, "read_cr3" },
+ { SVM_EXIT_READ_CR4, "read_cr4" },
+ { SVM_EXIT_READ_CR8, "read_cr8" },
+ { SVM_EXIT_WRITE_CR0, "write_cr0" },
+ { SVM_EXIT_WRITE_CR3, "write_cr3" },
+ { SVM_EXIT_WRITE_CR4, "write_cr4" },
+ { SVM_EXIT_WRITE_CR8, "write_cr8" },
+ { SVM_EXIT_READ_DR0, "read_dr0" },
+ { SVM_EXIT_READ_DR1, "read_dr1" },
+ { SVM_EXIT_READ_DR2, "read_dr2" },
+ { SVM_EXIT_READ_DR3, "read_dr3" },
+ { SVM_EXIT_WRITE_DR0, "write_dr0" },
+ { SVM_EXIT_WRITE_DR1, "write_dr1" },
+ { SVM_EXIT_WRITE_DR2, "write_dr2" },
+ { SVM_EXIT_WRITE_DR3, "write_dr3" },
+ { SVM_EXIT_WRITE_DR5, "write_dr5" },
+ { SVM_EXIT_WRITE_DR7, "write_dr7" },
+ { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" },
+ { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" },
+ { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" },
+ { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" },
+ { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" },
+ { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" },
+ { SVM_EXIT_INTR, "interrupt" },
+ { SVM_EXIT_NMI, "nmi" },
+ { SVM_EXIT_SMI, "smi" },
+ { SVM_EXIT_INIT, "init" },
+ { SVM_EXIT_VINTR, "vintr" },
+ { SVM_EXIT_CPUID, "cpuid" },
+ { SVM_EXIT_INVD, "invd" },
+ { SVM_EXIT_HLT, "hlt" },
+ { SVM_EXIT_INVLPG, "invlpg" },
+ { SVM_EXIT_INVLPGA, "invlpga" },
+ { SVM_EXIT_IOIO, "io" },
+ { SVM_EXIT_MSR, "msr" },
+ { SVM_EXIT_TASK_SWITCH, "task_switch" },
+ { SVM_EXIT_SHUTDOWN, "shutdown" },
+ { SVM_EXIT_VMRUN, "vmrun" },
+ { SVM_EXIT_VMMCALL, "hypercall" },
+ { SVM_EXIT_VMLOAD, "vmload" },
+ { SVM_EXIT_VMSAVE, "vmsave" },
+ { SVM_EXIT_STGI, "stgi" },
+ { SVM_EXIT_CLGI, "clgi" },
+ { SVM_EXIT_SKINIT, "skinit" },
+ { SVM_EXIT_WBINVD, "wbinvd" },
+ { SVM_EXIT_MONITOR, "monitor" },
+ { SVM_EXIT_MWAIT, "mwait" },
+ { SVM_EXIT_NPF, "npf" },
+ { -1, NULL }
+};
+
+static bool svm_gb_page_enable(void)
+{
+ return true;
+}
+
static struct kvm_x86_ops svm_x86_ops = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -2710,6 +2876,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.set_gdt = svm_set_gdt,
.get_dr = svm_get_dr,
.set_dr = svm_set_dr,
+ .cache_reg = svm_cache_reg,
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
@@ -2733,6 +2900,9 @@ static struct kvm_x86_ops svm_x86_ops = {
.set_tss_addr = svm_set_tss_addr,
.get_tdp_level = get_npt_level,
.get_mt_mask = svm_get_mt_mask,
+
+ .exit_reasons_str = svm_exit_reasons_str,
+ .gb_page_enable = svm_gb_page_enable,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index 86dbac072d0..eea40439066 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -9,12 +9,16 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
int restart_timer = 0;
wait_queue_head_t *q = &vcpu->wq;
- /* FIXME: this code should not know anything about vcpus */
- if (!atomic_inc_and_test(&ktimer->pending))
+ /*
+ * There is a race window between reading and incrementing, but we do
+ * not care about potentially loosing timer events in the !reinject
+ * case anyway.
+ */
+ if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
+ atomic_inc(&ktimer->pending);
+ /* FIXME: this code should not know anything about vcpus */
set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
-
- if (!ktimer->reinject)
- atomic_set(&ktimer->pending, 1);
+ }
if (waitqueue_active(q))
wake_up_interruptible(q);
@@ -33,7 +37,7 @@ enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
struct kvm_vcpu *vcpu;
struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
- vcpu = ktimer->kvm->vcpus[ktimer->vcpu_id];
+ vcpu = ktimer->vcpu;
if (!vcpu)
return HRTIMER_NORESTART;
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
new file mode 100644
index 00000000000..0d480e77eac
--- /dev/null
+++ b/arch/x86/kvm/trace.h
@@ -0,0 +1,355 @@
+#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVM_H
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+#define TRACE_INCLUDE_PATH arch/x86/kvm
+#define TRACE_INCLUDE_FILE trace
+
+/*
+ * Tracepoint for guest mode entry.
+ */
+TRACE_EVENT(kvm_entry,
+ TP_PROTO(unsigned int vcpu_id),
+ TP_ARGS(vcpu_id),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ ),
+
+ TP_printk("vcpu %u", __entry->vcpu_id)
+);
+
+/*
+ * Tracepoint for hypercall.
+ */
+TRACE_EVENT(kvm_hypercall,
+ TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1,
+ unsigned long a2, unsigned long a3),
+ TP_ARGS(nr, a0, a1, a2, a3),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, nr )
+ __field( unsigned long, a0 )
+ __field( unsigned long, a1 )
+ __field( unsigned long, a2 )
+ __field( unsigned long, a3 )
+ ),
+
+ TP_fast_assign(
+ __entry->nr = nr;
+ __entry->a0 = a0;
+ __entry->a1 = a1;
+ __entry->a2 = a2;
+ __entry->a3 = a3;
+ ),
+
+ TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx",
+ __entry->nr, __entry->a0, __entry->a1, __entry->a2,
+ __entry->a3)
+);
+
+/*
+ * Tracepoint for PIO.
+ */
+TRACE_EVENT(kvm_pio,
+ TP_PROTO(unsigned int rw, unsigned int port, unsigned int size,
+ unsigned int count),
+ TP_ARGS(rw, port, size, count),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, rw )
+ __field( unsigned int, port )
+ __field( unsigned int, size )
+ __field( unsigned int, count )
+ ),
+
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->port = port;
+ __entry->size = size;
+ __entry->count = count;
+ ),
+
+ TP_printk("pio_%s at 0x%x size %d count %d",
+ __entry->rw ? "write" : "read",
+ __entry->port, __entry->size, __entry->count)
+);
+
+/*
+ * Tracepoint for cpuid.
+ */
+TRACE_EVENT(kvm_cpuid,
+ TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx,
+ unsigned long rcx, unsigned long rdx),
+ TP_ARGS(function, rax, rbx, rcx, rdx),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, function )
+ __field( unsigned long, rax )
+ __field( unsigned long, rbx )
+ __field( unsigned long, rcx )
+ __field( unsigned long, rdx )
+ ),
+
+ TP_fast_assign(
+ __entry->function = function;
+ __entry->rax = rax;
+ __entry->rbx = rbx;
+ __entry->rcx = rcx;
+ __entry->rdx = rdx;
+ ),
+
+ TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx",
+ __entry->function, __entry->rax,
+ __entry->rbx, __entry->rcx, __entry->rdx)
+);
+
+#define AREG(x) { APIC_##x, "APIC_" #x }
+
+#define kvm_trace_symbol_apic \
+ AREG(ID), AREG(LVR), AREG(TASKPRI), AREG(ARBPRI), AREG(PROCPRI), \
+ AREG(EOI), AREG(RRR), AREG(LDR), AREG(DFR), AREG(SPIV), AREG(ISR), \
+ AREG(TMR), AREG(IRR), AREG(ESR), AREG(ICR), AREG(ICR2), AREG(LVTT), \
+ AREG(LVTTHMR), AREG(LVTPC), AREG(LVT0), AREG(LVT1), AREG(LVTERR), \
+ AREG(TMICT), AREG(TMCCT), AREG(TDCR), AREG(SELF_IPI), AREG(EFEAT), \
+ AREG(ECTRL)
+/*
+ * Tracepoint for apic access.
+ */
+TRACE_EVENT(kvm_apic,
+ TP_PROTO(unsigned int rw, unsigned int reg, unsigned int val),
+ TP_ARGS(rw, reg, val),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, rw )
+ __field( unsigned int, reg )
+ __field( unsigned int, val )
+ ),
+
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->reg = reg;
+ __entry->val = val;
+ ),
+
+ TP_printk("apic_%s %s = 0x%x",
+ __entry->rw ? "write" : "read",
+ __print_symbolic(__entry->reg, kvm_trace_symbol_apic),
+ __entry->val)
+);
+
+#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val)
+#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val)
+
+/*
+ * Tracepoint for kvm guest exit:
+ */
+TRACE_EVENT(kvm_exit,
+ TP_PROTO(unsigned int exit_reason, unsigned long guest_rip),
+ TP_ARGS(exit_reason, guest_rip),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, exit_reason )
+ __field( unsigned long, guest_rip )
+ ),
+
+ TP_fast_assign(
+ __entry->exit_reason = exit_reason;
+ __entry->guest_rip = guest_rip;
+ ),
+
+ TP_printk("reason %s rip 0x%lx",
+ ftrace_print_symbols_seq(p, __entry->exit_reason,
+ kvm_x86_ops->exit_reasons_str),
+ __entry->guest_rip)
+);
+
+/*
+ * Tracepoint for kvm interrupt injection:
+ */
+TRACE_EVENT(kvm_inj_virq,
+ TP_PROTO(unsigned int irq),
+ TP_ARGS(irq),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, irq )
+ ),
+
+ TP_fast_assign(
+ __entry->irq = irq;
+ ),
+
+ TP_printk("irq %u", __entry->irq)
+);
+
+/*
+ * Tracepoint for page fault.
+ */
+TRACE_EVENT(kvm_page_fault,
+ TP_PROTO(unsigned long fault_address, unsigned int error_code),
+ TP_ARGS(fault_address, error_code),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, fault_address )
+ __field( unsigned int, error_code )
+ ),
+
+ TP_fast_assign(
+ __entry->fault_address = fault_address;
+ __entry->error_code = error_code;
+ ),
+
+ TP_printk("address %lx error_code %x",
+ __entry->fault_address, __entry->error_code)
+);
+
+/*
+ * Tracepoint for guest MSR access.
+ */
+TRACE_EVENT(kvm_msr,
+ TP_PROTO(unsigned int rw, unsigned int ecx, unsigned long data),
+ TP_ARGS(rw, ecx, data),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, rw )
+ __field( unsigned int, ecx )
+ __field( unsigned long, data )
+ ),
+
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->ecx = ecx;
+ __entry->data = data;
+ ),
+
+ TP_printk("msr_%s %x = 0x%lx",
+ __entry->rw ? "write" : "read",
+ __entry->ecx, __entry->data)
+);
+
+#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data)
+#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data)
+
+/*
+ * Tracepoint for guest CR access.
+ */
+TRACE_EVENT(kvm_cr,
+ TP_PROTO(unsigned int rw, unsigned int cr, unsigned long val),
+ TP_ARGS(rw, cr, val),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, rw )
+ __field( unsigned int, cr )
+ __field( unsigned long, val )
+ ),
+
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->cr = cr;
+ __entry->val = val;
+ ),
+
+ TP_printk("cr_%s %x = 0x%lx",
+ __entry->rw ? "write" : "read",
+ __entry->cr, __entry->val)
+);
+
+#define trace_kvm_cr_read(cr, val) trace_kvm_cr(0, cr, val)
+#define trace_kvm_cr_write(cr, val) trace_kvm_cr(1, cr, val)
+
+TRACE_EVENT(kvm_pic_set_irq,
+ TP_PROTO(__u8 chip, __u8 pin, __u8 elcr, __u8 imr, bool coalesced),
+ TP_ARGS(chip, pin, elcr, imr, coalesced),
+
+ TP_STRUCT__entry(
+ __field( __u8, chip )
+ __field( __u8, pin )
+ __field( __u8, elcr )
+ __field( __u8, imr )
+ __field( bool, coalesced )
+ ),
+
+ TP_fast_assign(
+ __entry->chip = chip;
+ __entry->pin = pin;
+ __entry->elcr = elcr;
+ __entry->imr = imr;
+ __entry->coalesced = coalesced;
+ ),
+
+ TP_printk("chip %u pin %u (%s%s)%s",
+ __entry->chip, __entry->pin,
+ (__entry->elcr & (1 << __entry->pin)) ? "level":"edge",
+ (__entry->imr & (1 << __entry->pin)) ? "|masked":"",
+ __entry->coalesced ? " (coalesced)" : "")
+);
+
+#define kvm_apic_dst_shorthand \
+ {0x0, "dst"}, \
+ {0x1, "self"}, \
+ {0x2, "all"}, \
+ {0x3, "all-but-self"}
+
+TRACE_EVENT(kvm_apic_ipi,
+ TP_PROTO(__u32 icr_low, __u32 dest_id),
+ TP_ARGS(icr_low, dest_id),
+
+ TP_STRUCT__entry(
+ __field( __u32, icr_low )
+ __field( __u32, dest_id )
+ ),
+
+ TP_fast_assign(
+ __entry->icr_low = icr_low;
+ __entry->dest_id = dest_id;
+ ),
+
+ TP_printk("dst %x vec %u (%s|%s|%s|%s|%s)",
+ __entry->dest_id, (u8)__entry->icr_low,
+ __print_symbolic((__entry->icr_low >> 8 & 0x7),
+ kvm_deliver_mode),
+ (__entry->icr_low & (1<<11)) ? "logical" : "physical",
+ (__entry->icr_low & (1<<14)) ? "assert" : "de-assert",
+ (__entry->icr_low & (1<<15)) ? "level" : "edge",
+ __print_symbolic((__entry->icr_low >> 18 & 0x3),
+ kvm_apic_dst_shorthand))
+);
+
+TRACE_EVENT(kvm_apic_accept_irq,
+ TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec, bool coalesced),
+ TP_ARGS(apicid, dm, tm, vec, coalesced),
+
+ TP_STRUCT__entry(
+ __field( __u32, apicid )
+ __field( __u16, dm )
+ __field( __u8, tm )
+ __field( __u8, vec )
+ __field( bool, coalesced )
+ ),
+
+ TP_fast_assign(
+ __entry->apicid = apicid;
+ __entry->dm = dm;
+ __entry->tm = tm;
+ __entry->vec = vec;
+ __entry->coalesced = coalesced;
+ ),
+
+ TP_printk("apicid %x vec %u (%s|%s)%s",
+ __entry->apicid, __entry->vec,
+ __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode),
+ __entry->tm ? "level" : "edge",
+ __entry->coalesced ? " (coalesced)" : "")
+);
+
+#endif /* _TRACE_KVM_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 29f912927a5..ed53b42caba 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -25,6 +25,7 @@
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/moduleparam.h>
+#include <linux/ftrace_event.h>
#include "kvm_cache_regs.h"
#include "x86.h"
@@ -34,6 +35,8 @@
#include <asm/virtext.h>
#include <asm/mce.h>
+#include "trace.h"
+
#define __ex(x) __kvm_handle_fault_on_reboot(x)
MODULE_AUTHOR("Qumranet");
@@ -51,6 +54,10 @@ module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
static int __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, S_IRUGO);
+static int __read_mostly enable_unrestricted_guest = 1;
+module_param_named(unrestricted_guest,
+ enable_unrestricted_guest, bool, S_IRUGO);
+
static int __read_mostly emulate_invalid_guest_state = 0;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);
@@ -84,6 +91,14 @@ struct vcpu_vmx {
int guest_efer_loaded;
} host_state;
struct {
+ int vm86_active;
+ u8 save_iopl;
+ struct kvm_save_segment {
+ u16 selector;
+ unsigned long base;
+ u32 limit;
+ u32 ar;
+ } tr, es, ds, fs, gs;
struct {
bool pending;
u8 vector;
@@ -161,6 +176,8 @@ static struct kvm_vmx_segment_field {
VMX_SEGMENT_FIELD(LDTR),
};
+static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
+
/*
* Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
* away by decrementing the array size.
@@ -256,6 +273,26 @@ static inline bool cpu_has_vmx_flexpriority(void)
cpu_has_vmx_virtualize_apic_accesses();
}
+static inline bool cpu_has_vmx_ept_execute_only(void)
+{
+ return !!(vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT);
+}
+
+static inline bool cpu_has_vmx_eptp_uncacheable(void)
+{
+ return !!(vmx_capability.ept & VMX_EPTP_UC_BIT);
+}
+
+static inline bool cpu_has_vmx_eptp_writeback(void)
+{
+ return !!(vmx_capability.ept & VMX_EPTP_WB_BIT);
+}
+
+static inline bool cpu_has_vmx_ept_2m_page(void)
+{
+ return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
+}
+
static inline int cpu_has_vmx_invept_individual_addr(void)
{
return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
@@ -277,6 +314,12 @@ static inline int cpu_has_vmx_ept(void)
SECONDARY_EXEC_ENABLE_EPT;
}
+static inline int cpu_has_vmx_unrestricted_guest(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_UNRESTRICTED_GUEST;
+}
+
static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
{
return flexpriority_enabled &&
@@ -497,14 +540,16 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR);
if (!vcpu->fpu_active)
eb |= 1u << NM_VECTOR;
+ /*
+ * Unconditionally intercept #DB so we can maintain dr6 without
+ * reading it every exit.
+ */
+ eb |= 1u << DB_VECTOR;
if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
- if (vcpu->guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
- eb |= 1u << DB_VECTOR;
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
eb |= 1u << BP_VECTOR;
}
- if (vcpu->arch.rmode.vm86_active)
+ if (to_vmx(vcpu)->rmode.vm86_active)
eb = ~0;
if (enable_ept)
eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
@@ -528,12 +573,15 @@ static void reload_tss(void)
static void load_transition_efer(struct vcpu_vmx *vmx)
{
int efer_offset = vmx->msr_offset_efer;
- u64 host_efer = vmx->host_msrs[efer_offset].data;
- u64 guest_efer = vmx->guest_msrs[efer_offset].data;
+ u64 host_efer;
+ u64 guest_efer;
u64 ignore_bits;
if (efer_offset < 0)
return;
+ host_efer = vmx->host_msrs[efer_offset].data;
+ guest_efer = vmx->guest_msrs[efer_offset].data;
+
/*
* NX is emulated; LMA and LME handled by hardware; SCE meaninless
* outside long mode
@@ -661,7 +709,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (vcpu->cpu != cpu) {
vcpu_clear(vmx);
kvm_migrate_timers(vcpu);
- vpid_sync_vcpu_all(vmx);
+ set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
local_irq_disable();
list_add(&vmx->local_vcpus_link,
&per_cpu(vcpus_on_cpu, cpu));
@@ -735,12 +783,17 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
- return vmcs_readl(GUEST_RFLAGS);
+ unsigned long rflags;
+
+ rflags = vmcs_readl(GUEST_RFLAGS);
+ if (to_vmx(vcpu)->rmode.vm86_active)
+ rflags &= ~(unsigned long)(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
+ return rflags;
}
static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
- if (vcpu->arch.rmode.vm86_active)
+ if (to_vmx(vcpu)->rmode.vm86_active)
rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
vmcs_writel(GUEST_RFLAGS, rflags);
}
@@ -797,12 +850,13 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
}
- if (vcpu->arch.rmode.vm86_active) {
+ if (vmx->rmode.vm86_active) {
vmx->rmode.irq.pending = true;
vmx->rmode.irq.vector = nr;
vmx->rmode.irq.rip = kvm_rip_read(vcpu);
- if (nr == BP_VECTOR || nr == OF_VECTOR)
- vmx->rmode.irq.rip++;
+ if (kvm_exception_is_soft(nr))
+ vmx->rmode.irq.rip +=
+ vmx->vcpu.arch.event_exit_inst_len;
intr_info |= INTR_TYPE_SOFT_INTR;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
@@ -940,7 +994,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
case MSR_EFER:
return kvm_get_msr_common(vcpu, msr_index, pdata);
#endif
- case MSR_IA32_TIME_STAMP_COUNTER:
+ case MSR_IA32_TSC:
data = guest_read_tsc();
break;
case MSR_IA32_SYSENTER_CS:
@@ -953,9 +1007,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
data = vmcs_readl(GUEST_SYSENTER_ESP);
break;
default:
- vmx_load_host_state(to_vmx(vcpu));
msr = find_msr_entry(to_vmx(vcpu), msr_index);
if (msr) {
+ vmx_load_host_state(to_vmx(vcpu));
data = msr->data;
break;
}
@@ -1000,22 +1054,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
case MSR_IA32_SYSENTER_ESP:
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
- case MSR_IA32_TIME_STAMP_COUNTER:
+ case MSR_IA32_TSC:
rdtscll(host_tsc);
guest_write_tsc(data, host_tsc);
break;
- case MSR_P6_PERFCTR0:
- case MSR_P6_PERFCTR1:
- case MSR_P6_EVNTSEL0:
- case MSR_P6_EVNTSEL1:
- /*
- * Just discard all writes to the performance counters; this
- * should keep both older linux and windows 64-bit guests
- * happy
- */
- pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data);
-
- break;
case MSR_IA32_CR_PAT:
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
vmcs_write64(GUEST_IA32_PAT, data);
@@ -1024,9 +1066,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
}
/* Otherwise falls through to kvm_set_msr_common */
default:
- vmx_load_host_state(vmx);
msr = find_msr_entry(vmx, msr_index);
if (msr) {
+ vmx_load_host_state(vmx);
msr->data = data;
break;
}
@@ -1046,6 +1088,10 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
case VCPU_REGS_RIP:
vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
break;
+ case VCPU_EXREG_PDPTR:
+ if (enable_ept)
+ ept_save_pdptrs(vcpu);
+ break;
default:
break;
}
@@ -1203,7 +1249,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_ENABLE_VPID |
- SECONDARY_EXEC_ENABLE_EPT;
+ SECONDARY_EXEC_ENABLE_EPT |
+ SECONDARY_EXEC_UNRESTRICTED_GUEST;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -1217,12 +1264,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
enabled */
- min &= ~(CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING |
- CPU_BASED_INVLPG_EXITING);
- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
- &_cpu_based_exec_control) < 0)
- return -EIO;
+ _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING |
+ CPU_BASED_INVLPG_EXITING);
rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
vmx_capability.ept, vmx_capability.vpid);
}
@@ -1333,8 +1377,13 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_vpid())
enable_vpid = 0;
- if (!cpu_has_vmx_ept())
+ if (!cpu_has_vmx_ept()) {
enable_ept = 0;
+ enable_unrestricted_guest = 0;
+ }
+
+ if (!cpu_has_vmx_unrestricted_guest())
+ enable_unrestricted_guest = 0;
if (!cpu_has_vmx_flexpriority())
flexpriority_enabled = 0;
@@ -1342,6 +1391,9 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_tpr_shadow())
kvm_x86_ops->update_cr8_intercept = NULL;
+ if (enable_ept && !cpu_has_vmx_ept_2m_page())
+ kvm_disable_largepages();
+
return alloc_kvm_area();
}
@@ -1372,15 +1424,15 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
vmx->emulation_required = 1;
- vcpu->arch.rmode.vm86_active = 0;
+ vmx->rmode.vm86_active = 0;
- vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
- vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
- vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar);
+ vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
+ vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
+ vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
flags = vmcs_readl(GUEST_RFLAGS);
flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
- flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT);
+ flags |= (vmx->rmode.save_iopl << IOPL_SHIFT);
vmcs_writel(GUEST_RFLAGS, flags);
vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
@@ -1391,10 +1443,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
if (emulate_invalid_guest_state)
return;
- fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
- fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
- fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
- fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
+ fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es);
+ fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds);
+ fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs);
+ fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs);
vmcs_write16(GUEST_SS_SELECTOR, 0);
vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
@@ -1433,20 +1485,23 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
unsigned long flags;
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (enable_unrestricted_guest)
+ return;
+
vmx->emulation_required = 1;
- vcpu->arch.rmode.vm86_active = 1;
+ vmx->rmode.vm86_active = 1;
- vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
+ vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
- vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
+ vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
- vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
+ vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
flags = vmcs_readl(GUEST_RFLAGS);
- vcpu->arch.rmode.save_iopl
+ vmx->rmode.save_iopl
= (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
@@ -1468,10 +1523,10 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_CS_BASE, 0xf0000);
vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
- fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
- fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
- fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
- fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
+ fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
+ fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
+ fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
+ fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
continue_rmode:
kvm_mmu_reset_context(vcpu);
@@ -1545,11 +1600,11 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
{
+ if (!test_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_dirty))
+ return;
+
if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
- if (!load_pdptrs(vcpu, vcpu->arch.cr3)) {
- printk(KERN_ERR "EPT: Fail to load pdptrs!\n");
- return;
- }
vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]);
vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]);
vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]);
@@ -1557,6 +1612,21 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
}
}
+static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
+{
+ if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
+ vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
+ vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
+ vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
+ vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
+ }
+
+ __set_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail);
+ __set_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_dirty);
+}
+
static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
@@ -1571,8 +1641,6 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
CPU_BASED_CR3_STORE_EXITING));
vcpu->arch.cr0 = cr0;
vmx_set_cr4(vcpu, vcpu->arch.cr4);
- *hw_cr0 |= X86_CR0_PE | X86_CR0_PG;
- *hw_cr0 &= ~X86_CR0_WP;
} else if (!is_paging(vcpu)) {
/* From nonpaging to paging */
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -1581,9 +1649,10 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
CPU_BASED_CR3_STORE_EXITING));
vcpu->arch.cr0 = cr0;
vmx_set_cr4(vcpu, vcpu->arch.cr4);
- if (!(vcpu->arch.cr0 & X86_CR0_WP))
- *hw_cr0 &= ~X86_CR0_WP;
}
+
+ if (!(cr0 & X86_CR0_WP))
+ *hw_cr0 &= ~X86_CR0_WP;
}
static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
@@ -1598,15 +1667,21 @@ static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
- unsigned long hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) |
- KVM_VM_CR0_ALWAYS_ON;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long hw_cr0;
+
+ if (enable_unrestricted_guest)
+ hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST)
+ | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
+ else
+ hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON;
vmx_fpu_deactivate(vcpu);
- if (vcpu->arch.rmode.vm86_active && (cr0 & X86_CR0_PE))
+ if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
enter_pmode(vcpu);
- if (!vcpu->arch.rmode.vm86_active && !(cr0 & X86_CR0_PE))
+ if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
enter_rmode(vcpu);
#ifdef CONFIG_X86_64
@@ -1650,10 +1725,8 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
if (enable_ept) {
eptp = construct_eptp(cr3);
vmcs_write64(EPT_POINTER, eptp);
- ept_sync_context(eptp);
- ept_load_pdptrs(vcpu);
guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
- VMX_EPT_IDENTITY_PAGETABLE_ADDR;
+ vcpu->kvm->arch.ept_identity_map_addr;
}
vmx_flush_tlb(vcpu);
@@ -1664,7 +1737,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
- unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.vm86_active ?
+ unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
vcpu->arch.cr4 = cr4;
@@ -1707,16 +1780,13 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
static int vmx_get_cpl(struct kvm_vcpu *vcpu)
{
- struct kvm_segment kvm_seg;
-
if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */
return 0;
if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
return 3;
- vmx_get_segment(vcpu, &kvm_seg, VCPU_SREG_CS);
- return kvm_seg.selector & 3;
+ return vmcs_read16(GUEST_CS_SELECTOR) & 3;
}
static u32 vmx_segment_access_rights(struct kvm_segment *var)
@@ -1744,20 +1814,21 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
u32 ar;
- if (vcpu->arch.rmode.vm86_active && seg == VCPU_SREG_TR) {
- vcpu->arch.rmode.tr.selector = var->selector;
- vcpu->arch.rmode.tr.base = var->base;
- vcpu->arch.rmode.tr.limit = var->limit;
- vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var);
+ if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
+ vmx->rmode.tr.selector = var->selector;
+ vmx->rmode.tr.base = var->base;
+ vmx->rmode.tr.limit = var->limit;
+ vmx->rmode.tr.ar = vmx_segment_access_rights(var);
return;
}
vmcs_writel(sf->base, var->base);
vmcs_write32(sf->limit, var->limit);
vmcs_write16(sf->selector, var->selector);
- if (vcpu->arch.rmode.vm86_active && var->s) {
+ if (vmx->rmode.vm86_active && var->s) {
/*
* Hack real-mode segments into vm86 compatibility.
*/
@@ -1766,6 +1837,21 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
ar = 0xf3;
} else
ar = vmx_segment_access_rights(var);
+
+ /*
+ * Fix the "Accessed" bit in AR field of segment registers for older
+ * qemu binaries.
+ * IA32 arch specifies that at the time of processor reset the
+ * "Accessed" bit in the AR field of segment registers is 1. And qemu
+ * is setting it to 0 in the usedland code. This causes invalid guest
+ * state vmexit when "unrestricted guest" mode is turned on.
+ * Fix for this setup issue in cpu_reset is being pushed in the qemu
+ * tree. Newer qemu binaries with that qemu fix would not need this
+ * kvm hack.
+ */
+ if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
+ ar |= 0x1; /* Accessed */
+
vmcs_write32(sf->ar_bytes, ar);
}
@@ -2040,7 +2126,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
if (likely(kvm->arch.ept_identity_pagetable_done))
return 1;
ret = 0;
- identity_map_pfn = VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT;
+ identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
if (r < 0)
goto out;
@@ -2062,11 +2148,19 @@ out:
static void seg_setup(int seg)
{
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ unsigned int ar;
vmcs_write16(sf->selector, 0);
vmcs_writel(sf->base, 0);
vmcs_write32(sf->limit, 0xffff);
- vmcs_write32(sf->ar_bytes, 0xf3);
+ if (enable_unrestricted_guest) {
+ ar = 0x93;
+ if (seg == VCPU_SREG_CS)
+ ar |= 0x08; /* code segment */
+ } else
+ ar = 0xf3;
+
+ vmcs_write32(sf->ar_bytes, ar);
}
static int alloc_apic_access_page(struct kvm *kvm)
@@ -2101,14 +2195,15 @@ static int alloc_identity_pagetable(struct kvm *kvm)
goto out;
kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
kvm_userspace_mem.flags = 0;
- kvm_userspace_mem.guest_phys_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
+ kvm_userspace_mem.guest_phys_addr =
+ kvm->arch.ept_identity_map_addr;
kvm_userspace_mem.memory_size = PAGE_SIZE;
r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
if (r)
goto out;
kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
- VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT);
+ kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
out:
up_write(&kvm->slots_lock);
return r;
@@ -2209,6 +2304,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
if (!enable_ept)
exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+ if (!enable_unrestricted_guest)
+ exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
}
@@ -2326,14 +2423,14 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
goto out;
}
- vmx->vcpu.arch.rmode.vm86_active = 0;
+ vmx->rmode.vm86_active = 0;
vmx->soft_vnmi_blocked = 0;
vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
kvm_set_cr8(&vmx->vcpu, 0);
msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
- if (vmx->vcpu.vcpu_id == 0)
+ if (kvm_vcpu_is_bsp(&vmx->vcpu))
msr |= MSR_IA32_APICBASE_BSP;
kvm_set_apic_base(&vmx->vcpu, msr);
@@ -2344,7 +2441,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
* GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
* insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
*/
- if (vmx->vcpu.vcpu_id == 0) {
+ if (kvm_vcpu_is_bsp(&vmx->vcpu)) {
vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
vmcs_writel(GUEST_CS_BASE, 0x000f0000);
} else {
@@ -2373,7 +2470,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_SYSENTER_EIP, 0);
vmcs_writel(GUEST_RFLAGS, 0x02);
- if (vmx->vcpu.vcpu_id == 0)
+ if (kvm_vcpu_is_bsp(&vmx->vcpu))
kvm_rip_write(vcpu, 0xfff0);
else
kvm_rip_write(vcpu, 0);
@@ -2461,13 +2558,16 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
uint32_t intr;
int irq = vcpu->arch.interrupt.nr;
- KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
+ trace_kvm_inj_virq(irq);
++vcpu->stat.irq_injections;
- if (vcpu->arch.rmode.vm86_active) {
+ if (vmx->rmode.vm86_active) {
vmx->rmode.irq.pending = true;
vmx->rmode.irq.vector = irq;
vmx->rmode.irq.rip = kvm_rip_read(vcpu);
+ if (vcpu->arch.interrupt.soft)
+ vmx->rmode.irq.rip +=
+ vmx->vcpu.arch.event_exit_inst_len;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
@@ -2502,7 +2602,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
}
++vcpu->stat.nmi_injections;
- if (vcpu->arch.rmode.vm86_active) {
+ if (vmx->rmode.vm86_active) {
vmx->rmode.irq.pending = true;
vmx->rmode.irq.vector = NMI_VECTOR;
vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -2659,14 +2759,14 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (enable_ept)
BUG();
cr2 = vmcs_readl(EXIT_QUALIFICATION);
- KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
- (u32)((u64)cr2 >> 32), handler);
+ trace_kvm_page_fault(cr2, error_code);
+
if (kvm_event_needs_reinjection(vcpu))
kvm_mmu_unprotect_page_virt(vcpu, cr2);
return kvm_mmu_page_fault(vcpu, cr2, error_code);
}
- if (vcpu->arch.rmode.vm86_active &&
+ if (vmx->rmode.vm86_active &&
handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
error_code)) {
if (vcpu->arch.halt_request) {
@@ -2707,7 +2807,6 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu,
struct kvm_run *kvm_run)
{
++vcpu->stat.irq_exits;
- KVMTRACE_1D(INTR, vcpu, vmcs_read32(VM_EXIT_INTR_INFO), handler);
return 1;
}
@@ -2755,7 +2854,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
- unsigned long exit_qualification;
+ unsigned long exit_qualification, val;
int cr;
int reg;
@@ -2764,21 +2863,19 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
reg = (exit_qualification >> 8) & 15;
switch ((exit_qualification >> 4) & 3) {
case 0: /* mov to cr */
- KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr,
- (u32)kvm_register_read(vcpu, reg),
- (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
- handler);
+ val = kvm_register_read(vcpu, reg);
+ trace_kvm_cr_write(cr, val);
switch (cr) {
case 0:
- kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg));
+ kvm_set_cr0(vcpu, val);
skip_emulated_instruction(vcpu);
return 1;
case 3:
- kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg));
+ kvm_set_cr3(vcpu, val);
skip_emulated_instruction(vcpu);
return 1;
case 4:
- kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg));
+ kvm_set_cr4(vcpu, val);
skip_emulated_instruction(vcpu);
return 1;
case 8: {
@@ -2800,23 +2897,19 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu->arch.cr0 &= ~X86_CR0_TS;
vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
vmx_fpu_activate(vcpu);
- KVMTRACE_0D(CLTS, vcpu, handler);
skip_emulated_instruction(vcpu);
return 1;
case 1: /*mov from cr*/
switch (cr) {
case 3:
kvm_register_write(vcpu, reg, vcpu->arch.cr3);
- KVMTRACE_3D(CR_READ, vcpu, (u32)cr,
- (u32)kvm_register_read(vcpu, reg),
- (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
- handler);
+ trace_kvm_cr_read(cr, vcpu->arch.cr3);
skip_emulated_instruction(vcpu);
return 1;
case 8:
- kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu));
- KVMTRACE_2D(CR_READ, vcpu, (u32)cr,
- (u32)kvm_register_read(vcpu, reg), handler);
+ val = kvm_get_cr8(vcpu);
+ kvm_register_write(vcpu, reg, val);
+ trace_kvm_cr_read(cr, val);
skip_emulated_instruction(vcpu);
return 1;
}
@@ -2841,6 +2934,8 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
unsigned long val;
int dr, reg;
+ if (!kvm_require_cpl(vcpu, 0))
+ return 1;
dr = vmcs_readl(GUEST_DR7);
if (dr & DR7_GD) {
/*
@@ -2884,7 +2979,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
val = 0;
}
kvm_register_write(vcpu, reg, val);
- KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
} else {
val = vcpu->arch.regs[reg];
switch (dr) {
@@ -2917,7 +3011,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
break;
}
- KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)val, handler);
}
skip_emulated_instruction(vcpu);
return 1;
@@ -2939,8 +3032,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 1;
}
- KVMTRACE_3D(MSR_READ, vcpu, ecx, (u32)data, (u32)(data >> 32),
- handler);
+ trace_kvm_msr_read(ecx, data);
/* FIXME: handling of bits 32:63 of rax, rdx */
vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
@@ -2955,8 +3047,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
| ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
- KVMTRACE_3D(MSR_WRITE, vcpu, ecx, (u32)data, (u32)(data >> 32),
- handler);
+ trace_kvm_msr_write(ecx, data);
if (vmx_set_msr(vcpu, ecx, data) != 0) {
kvm_inject_gp(vcpu, 0);
@@ -2983,7 +3074,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
- KVMTRACE_0D(PEND_INTR, vcpu, handler);
++vcpu->stat.irq_window_exits;
/*
@@ -3049,7 +3139,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
printk(KERN_ERR
"Fail to handle apic access vmexit! Offset is 0x%lx\n",
offset);
- return -ENOTSUPP;
+ return -ENOEXEC;
}
return 1;
}
@@ -3118,7 +3208,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (exit_qualification & (1 << 6)) {
printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
- return -ENOTSUPP;
+ return -EINVAL;
}
gla_validity = (exit_qualification >> 7) & 0x3;
@@ -3130,14 +3220,98 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
(long unsigned int)exit_qualification);
kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
- kvm_run->hw.hardware_exit_reason = 0;
- return -ENOTSUPP;
+ kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
+ return 0;
}
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+ trace_kvm_page_fault(gpa, exit_qualification);
return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
}
+static u64 ept_rsvd_mask(u64 spte, int level)
+{
+ int i;
+ u64 mask = 0;
+
+ for (i = 51; i > boot_cpu_data.x86_phys_bits; i--)
+ mask |= (1ULL << i);
+
+ if (level > 2)
+ /* bits 7:3 reserved */
+ mask |= 0xf8;
+ else if (level == 2) {
+ if (spte & (1ULL << 7))
+ /* 2MB ref, bits 20:12 reserved */
+ mask |= 0x1ff000;
+ else
+ /* bits 6:3 reserved */
+ mask |= 0x78;
+ }
+
+ return mask;
+}
+
+static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
+ int level)
+{
+ printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level);
+
+ /* 010b (write-only) */
+ WARN_ON((spte & 0x7) == 0x2);
+
+ /* 110b (write/execute) */
+ WARN_ON((spte & 0x7) == 0x6);
+
+ /* 100b (execute-only) and value not supported by logical processor */
+ if (!cpu_has_vmx_ept_execute_only())
+ WARN_ON((spte & 0x7) == 0x4);
+
+ /* not 000b */
+ if ((spte & 0x7)) {
+ u64 rsvd_bits = spte & ept_rsvd_mask(spte, level);
+
+ if (rsvd_bits != 0) {
+ printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n",
+ __func__, rsvd_bits);
+ WARN_ON(1);
+ }
+
+ if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) {
+ u64 ept_mem_type = (spte & 0x38) >> 3;
+
+ if (ept_mem_type == 2 || ept_mem_type == 3 ||
+ ept_mem_type == 7) {
+ printk(KERN_ERR "%s: ept_mem_type=0x%llx\n",
+ __func__, ept_mem_type);
+ WARN_ON(1);
+ }
+ }
+ }
+}
+
+static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ u64 sptes[4];
+ int nr_sptes, i;
+ gpa_t gpa;
+
+ gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+
+ printk(KERN_ERR "EPT: Misconfiguration.\n");
+ printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
+
+ nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes);
+
+ for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
+ ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
+
+ kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
+ kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
+
+ return 0;
+}
+
static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
u32 cpu_based_vm_exec_control;
@@ -3217,8 +3391,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
[EXIT_REASON_WBINVD] = handle_wbinvd,
[EXIT_REASON_TASK_SWITCH] = handle_task_switch,
- [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
[EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
+ [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
+ [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
};
static const int kvm_vmx_max_exit_handlers =
@@ -3234,8 +3409,7 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
u32 exit_reason = vmx->exit_reason;
u32 vectoring_info = vmx->idt_vectoring_info;
- KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
- (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
+ trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
/* If we need to emulate an MMIO from handle_invalid_guest_state
* we just return 0 */
@@ -3247,10 +3421,8 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
/* Access CR3 don't cause VMExit in paging mode, so we need
* to sync with guest real CR3. */
- if (enable_ept && is_paging(vcpu)) {
+ if (enable_ept && is_paging(vcpu))
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
- ept_load_pdptrs(vcpu);
- }
if (unlikely(vmx->fail)) {
kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -3326,10 +3498,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
/* We need to handle NMIs before interrupts are enabled */
if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
- (exit_intr_info & INTR_INFO_VALID_MASK)) {
- KVMTRACE_0D(NMI, &vmx->vcpu, handler);
+ (exit_intr_info & INTR_INFO_VALID_MASK))
asm("int $2");
- }
idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
@@ -3434,6 +3604,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (enable_ept && is_paging(vcpu)) {
+ vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
+ ept_load_pdptrs(vcpu);
+ }
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
vmx->entry_time = ktime_get();
@@ -3449,12 +3623,21 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+ /* When single-stepping over STI and MOV SS, we must clear the
+ * corresponding interruptibility bits in the guest state. Otherwise
+ * vmentry fails as it then expects bit 14 (BS) in pending debug
+ * exceptions being set, but that's not correct for the guest debugging
+ * case. */
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ vmx_set_interrupt_shadow(vcpu, 0);
+
/*
* Loading guest fpu may have cleared host cr0.ts
*/
vmcs_writel(HOST_CR0, read_cr0());
- set_debugreg(vcpu->arch.dr6, 6);
+ if (vcpu->arch.switch_db_regs)
+ set_debugreg(vcpu->arch.dr6, 6);
asm(
/* Store host registers */
@@ -3465,11 +3648,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
"mov %%"R"sp, %c[host_rsp](%0) \n\t"
__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
"1: \n\t"
+ /* Reload cr2 if changed */
+ "mov %c[cr2](%0), %%"R"ax \n\t"
+ "mov %%cr2, %%"R"dx \n\t"
+ "cmp %%"R"ax, %%"R"dx \n\t"
+ "je 2f \n\t"
+ "mov %%"R"ax, %%cr2 \n\t"
+ "2: \n\t"
/* Check if vmlaunch of vmresume is needed */
"cmpl $0, %c[launched](%0) \n\t"
/* Load guest registers. Don't clobber flags. */
- "mov %c[cr2](%0), %%"R"ax \n\t"
- "mov %%"R"ax, %%cr2 \n\t"
"mov %c[rax](%0), %%"R"ax \n\t"
"mov %c[rbx](%0), %%"R"bx \n\t"
"mov %c[rdx](%0), %%"R"dx \n\t"
@@ -3547,10 +3735,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
#endif
);
- vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
+ vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
+ | (1 << VCPU_EXREG_PDPTR));
vcpu->arch.regs_dirty = 0;
- get_debugreg(vcpu->arch.dr6, 6);
+ if (vcpu->arch.switch_db_regs)
+ get_debugreg(vcpu->arch.dr6, 6);
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
if (vmx->rmode.irq.pending)
@@ -3633,9 +3823,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
if (alloc_apic_access_page(kvm) != 0)
goto free_vmcs;
- if (enable_ept)
+ if (enable_ept) {
+ if (!kvm->arch.ept_identity_map_addr)
+ kvm->arch.ept_identity_map_addr =
+ VMX_EPT_IDENTITY_PAGETABLE_ADDR;
if (alloc_identity_pagetable(kvm) != 0)
goto free_vmcs;
+ }
return &vmx->vcpu;
@@ -3699,6 +3893,34 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
return ret;
}
+static const struct trace_print_flags vmx_exit_reasons_str[] = {
+ { EXIT_REASON_EXCEPTION_NMI, "exception" },
+ { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" },
+ { EXIT_REASON_TRIPLE_FAULT, "triple_fault" },
+ { EXIT_REASON_NMI_WINDOW, "nmi_window" },
+ { EXIT_REASON_IO_INSTRUCTION, "io_instruction" },
+ { EXIT_REASON_CR_ACCESS, "cr_access" },
+ { EXIT_REASON_DR_ACCESS, "dr_access" },
+ { EXIT_REASON_CPUID, "cpuid" },
+ { EXIT_REASON_MSR_READ, "rdmsr" },
+ { EXIT_REASON_MSR_WRITE, "wrmsr" },
+ { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" },
+ { EXIT_REASON_HLT, "halt" },
+ { EXIT_REASON_INVLPG, "invlpg" },
+ { EXIT_REASON_VMCALL, "hypercall" },
+ { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" },
+ { EXIT_REASON_APIC_ACCESS, "apic_access" },
+ { EXIT_REASON_WBINVD, "wbinvd" },
+ { EXIT_REASON_TASK_SWITCH, "task_switch" },
+ { EXIT_REASON_EPT_VIOLATION, "ept_violation" },
+ { -1, NULL }
+};
+
+static bool vmx_gb_page_enable(void)
+{
+ return false;
+}
+
static struct kvm_x86_ops vmx_x86_ops = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -3758,6 +3980,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
.set_tss_addr = vmx_set_tss_addr,
.get_tdp_level = get_ept_level,
.get_mt_mask = vmx_get_mt_mask,
+
+ .exit_reasons_str = vmx_exit_reasons_str,
+ .gb_page_enable = vmx_gb_page_enable,
};
static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 633ccc7400a..4fc80174191 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -37,11 +37,17 @@
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/cpufreq.h>
+#include <trace/events/kvm.h>
+#undef TRACE_INCLUDE_FILE
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+#include <asm/debugreg.h>
#include <asm/uaccess.h>
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mtrr.h>
+#include <asm/mce.h>
#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS \
@@ -55,6 +61,10 @@
| X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
+
+#define KVM_MAX_MCE_BANKS 32
+#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
+
/* EFER defaults:
* - enable syscall per default because its emulated by KVM
* - enable LME and LMA per default on 64 bit KVM
@@ -68,14 +78,16 @@ static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
+static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries);
-struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
- u32 function, u32 index);
struct kvm_x86_ops *kvm_x86_ops;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
+int ignore_msrs = 0;
+module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
+
struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "pf_fixed", VCPU_STAT(pf_fixed) },
{ "pf_guest", VCPU_STAT(pf_guest) },
@@ -122,18 +134,16 @@ unsigned long segment_base(u16 selector)
if (selector == 0)
return 0;
- asm("sgdt %0" : "=m"(gdt));
+ kvm_get_gdt(&gdt);
table_base = gdt.base;
if (selector & 4) { /* from ldt */
- u16 ldt_selector;
+ u16 ldt_selector = kvm_read_ldt();
- asm("sldt %0" : "=g"(ldt_selector));
table_base = segment_base(ldt_selector);
}
d = (struct desc_struct *)(table_base + (selector & ~7));
- v = d->base0 | ((unsigned long)d->base1 << 16) |
- ((unsigned long)d->base2 << 24);
+ v = get_desc_base(d);
#ifdef CONFIG_X86_64
if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
@@ -176,16 +186,22 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
++vcpu->stat.pf_guest;
if (vcpu->arch.exception.pending) {
- if (vcpu->arch.exception.nr == PF_VECTOR) {
- printk(KERN_DEBUG "kvm: inject_page_fault:"
- " double fault 0x%lx\n", addr);
- vcpu->arch.exception.nr = DF_VECTOR;
- vcpu->arch.exception.error_code = 0;
- } else if (vcpu->arch.exception.nr == DF_VECTOR) {
+ switch(vcpu->arch.exception.nr) {
+ case DF_VECTOR:
/* triple fault -> shutdown */
set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ return;
+ case PF_VECTOR:
+ vcpu->arch.exception.nr = DF_VECTOR;
+ vcpu->arch.exception.error_code = 0;
+ return;
+ default:
+ /* replace previous exception with a new one in a hope
+ that instruction re-execution will regenerate lost
+ exception */
+ vcpu->arch.exception.pending = false;
+ break;
}
- return;
}
vcpu->arch.cr2 = addr;
kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
@@ -207,12 +223,18 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
-static void __queue_exception(struct kvm_vcpu *vcpu)
+/*
+ * Checks if cpl <= required_cpl; if true, return true. Otherwise queue
+ * a #GP and return false.
+ */
+bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
{
- kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
- vcpu->arch.exception.has_error_code,
- vcpu->arch.exception.error_code);
+ if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
+ return true;
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return false;
}
+EXPORT_SYMBOL_GPL(kvm_require_cpl);
/*
* Load the pae pdptrs. Return true is they are all valid.
@@ -232,7 +254,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
goto out;
}
for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
- if (is_present_pte(pdpte[i]) &&
+ if (is_present_gpte(pdpte[i]) &&
(pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
ret = 0;
goto out;
@@ -241,6 +263,10 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
ret = 1;
memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
+ __set_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail);
+ __set_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_dirty);
out:
return ret;
@@ -256,6 +282,10 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
if (is_long_mode(vcpu) || !is_pae(vcpu))
return false;
+ if (!test_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail))
+ return true;
+
r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
if (r < 0)
goto out;
@@ -328,9 +358,6 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
- KVMTRACE_1D(LMSW, vcpu,
- (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),
- handler);
}
EXPORT_SYMBOL_GPL(kvm_lmsw);
@@ -466,7 +493,7 @@ static u32 msrs_to_save[] = {
#ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
- MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+ MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
};
@@ -644,8 +671,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
- kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
- &vcpu->hv_clock.tsc_timestamp);
+ kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
ktime_get_ts(&ts);
local_irq_restore(flags);
@@ -778,23 +804,60 @@ static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
return 0;
}
+static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+
+ switch (msr) {
+ case MSR_IA32_MCG_STATUS:
+ vcpu->arch.mcg_status = data;
+ break;
+ case MSR_IA32_MCG_CTL:
+ if (!(mcg_cap & MCG_CTL_P))
+ return 1;
+ if (data != 0 && data != ~(u64)0)
+ return -1;
+ vcpu->arch.mcg_ctl = data;
+ break;
+ default:
+ if (msr >= MSR_IA32_MC0_CTL &&
+ msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+ u32 offset = msr - MSR_IA32_MC0_CTL;
+ /* only 0 or all 1s can be written to IA32_MCi_CTL */
+ if ((offset & 0x3) == 0 &&
+ data != 0 && data != ~(u64)0)
+ return -1;
+ vcpu->arch.mce_banks[offset] = data;
+ break;
+ }
+ return 1;
+ }
+ return 0;
+}
+
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
switch (msr) {
case MSR_EFER:
set_efer(vcpu, data);
break;
- case MSR_IA32_MC0_STATUS:
- pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
- __func__, data);
+ case MSR_K7_HWCR:
+ data &= ~(u64)0x40; /* ignore flush filter disable */
+ if (data != 0) {
+ pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
+ data);
+ return 1;
+ }
break;
- case MSR_IA32_MCG_STATUS:
- pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
- __func__, data);
+ case MSR_FAM10H_MMIO_CONF_BASE:
+ if (data != 0) {
+ pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
+ "0x%llx\n", data);
+ return 1;
+ }
break;
- case MSR_IA32_MCG_CTL:
- pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
- __func__, data);
+ case MSR_AMD64_NB_CFG:
break;
case MSR_IA32_DEBUGCTLMSR:
if (!data) {
@@ -811,12 +874,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
case MSR_IA32_UCODE_REV:
case MSR_IA32_UCODE_WRITE:
case MSR_VM_HSAVE_PA:
+ case MSR_AMD64_PATCH_LOADER:
break;
case 0x200 ... 0x2ff:
return set_msr_mtrr(vcpu, msr, data);
case MSR_IA32_APICBASE:
kvm_set_apic_base(vcpu, data);
break;
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
+ return kvm_x2apic_msr_write(vcpu, msr, data);
case MSR_IA32_MISC_ENABLE:
vcpu->arch.ia32_misc_enable_msr = data;
break;
@@ -850,9 +916,50 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
kvm_request_guest_time_update(vcpu);
break;
}
+ case MSR_IA32_MCG_CTL:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
+ return set_msr_mce(vcpu, msr, data);
+
+ /* Performance counters are not protected by a CPUID bit,
+ * so we should check all of them in the generic path for the sake of
+ * cross vendor migration.
+ * Writing a zero into the event select MSRs disables them,
+ * which we perfectly emulate ;-). Any other value should be at least
+ * reported, some guests depend on them.
+ */
+ case MSR_P6_EVNTSEL0:
+ case MSR_P6_EVNTSEL1:
+ case MSR_K7_EVNTSEL0:
+ case MSR_K7_EVNTSEL1:
+ case MSR_K7_EVNTSEL2:
+ case MSR_K7_EVNTSEL3:
+ if (data != 0)
+ pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
+ "0x%x data 0x%llx\n", msr, data);
+ break;
+ /* at least RHEL 4 unconditionally writes to the perfctr registers,
+ * so we ignore writes to make it happy.
+ */
+ case MSR_P6_PERFCTR0:
+ case MSR_P6_PERFCTR1:
+ case MSR_K7_PERFCTR0:
+ case MSR_K7_PERFCTR1:
+ case MSR_K7_PERFCTR2:
+ case MSR_K7_PERFCTR3:
+ pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
+ "0x%x data 0x%llx\n", msr, data);
+ break;
default:
- pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
- return 1;
+ if (!ignore_msrs) {
+ pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
+ msr, data);
+ return 1;
+ } else {
+ pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
+ msr, data);
+ break;
+ }
}
return 0;
}
@@ -905,26 +1012,47 @@ static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
return 0;
}
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
u64 data;
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
switch (msr) {
- case 0xc0010010: /* SYSCFG */
- case 0xc0010015: /* HWCR */
- case MSR_IA32_PLATFORM_ID:
case MSR_IA32_P5_MC_ADDR:
case MSR_IA32_P5_MC_TYPE:
- case MSR_IA32_MC0_CTL:
- case MSR_IA32_MCG_STATUS:
+ data = 0;
+ break;
case MSR_IA32_MCG_CAP:
+ data = vcpu->arch.mcg_cap;
+ break;
case MSR_IA32_MCG_CTL:
- case MSR_IA32_MC0_MISC:
- case MSR_IA32_MC0_MISC+4:
- case MSR_IA32_MC0_MISC+8:
- case MSR_IA32_MC0_MISC+12:
- case MSR_IA32_MC0_MISC+16:
- case MSR_IA32_MC0_MISC+20:
+ if (!(mcg_cap & MCG_CTL_P))
+ return 1;
+ data = vcpu->arch.mcg_ctl;
+ break;
+ case MSR_IA32_MCG_STATUS:
+ data = vcpu->arch.mcg_status;
+ break;
+ default:
+ if (msr >= MSR_IA32_MC0_CTL &&
+ msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+ u32 offset = msr - MSR_IA32_MC0_CTL;
+ data = vcpu->arch.mce_banks[offset];
+ break;
+ }
+ return 1;
+ }
+ *pdata = data;
+ return 0;
+}
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ u64 data;
+
+ switch (msr) {
+ case MSR_IA32_PLATFORM_ID:
case MSR_IA32_UCODE_REV:
case MSR_IA32_EBL_CR_POWERON:
case MSR_IA32_DEBUGCTLMSR:
@@ -932,10 +1060,18 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_LASTBRANCHTOIP:
case MSR_IA32_LASTINTFROMIP:
case MSR_IA32_LASTINTTOIP:
+ case MSR_K8_SYSCFG:
+ case MSR_K7_HWCR:
case MSR_VM_HSAVE_PA:
+ case MSR_P6_PERFCTR0:
+ case MSR_P6_PERFCTR1:
case MSR_P6_EVNTSEL0:
case MSR_P6_EVNTSEL1:
case MSR_K7_EVNTSEL0:
+ case MSR_K7_PERFCTR0:
+ case MSR_K8_INT_PENDING_MSG:
+ case MSR_AMD64_NB_CFG:
+ case MSR_FAM10H_MMIO_CONF_BASE:
data = 0;
break;
case MSR_MTRRcap:
@@ -949,6 +1085,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_APICBASE:
data = kvm_get_apic_base(vcpu);
break;
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
+ return kvm_x2apic_msr_read(vcpu, msr, pdata);
+ break;
case MSR_IA32_MISC_ENABLE:
data = vcpu->arch.ia32_misc_enable_msr;
break;
@@ -967,9 +1106,22 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_KVM_SYSTEM_TIME:
data = vcpu->arch.time;
break;
+ case MSR_IA32_P5_MC_ADDR:
+ case MSR_IA32_P5_MC_TYPE:
+ case MSR_IA32_MCG_CAP:
+ case MSR_IA32_MCG_CTL:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
+ return get_msr_mce(vcpu, msr, pdata);
default:
- pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
- return 1;
+ if (!ignore_msrs) {
+ pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
+ return 1;
+ } else {
+ pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
+ data = 0;
+ }
+ break;
}
*pdata = data;
return 0;
@@ -1068,6 +1220,11 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_REINJECT_CONTROL:
case KVM_CAP_IRQ_INJECT_STATUS:
case KVM_CAP_ASSIGN_DEV_IRQ:
+ case KVM_CAP_IRQFD:
+ case KVM_CAP_IOEVENTFD:
+ case KVM_CAP_PIT2:
+ case KVM_CAP_PIT_STATE2:
+ case KVM_CAP_SET_IDENTITY_MAP_ADDR:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -1088,6 +1245,9 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_IOMMU:
r = iommu_found();
break;
+ case KVM_CAP_MCE:
+ r = KVM_MAX_MCE_BANKS;
+ break;
default:
r = 0;
break;
@@ -1147,6 +1307,16 @@ long kvm_arch_dev_ioctl(struct file *filp,
r = 0;
break;
}
+ case KVM_X86_GET_MCE_CAP_SUPPORTED: {
+ u64 mce_cap;
+
+ mce_cap = KVM_MCE_CAP_SUPPORTED;
+ r = -EFAULT;
+ if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
+ goto out;
+ r = 0;
+ break;
+ }
default:
r = -EINVAL;
}
@@ -1227,6 +1397,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
vcpu->arch.cpuid_nent = cpuid->nent;
cpuid_fix_nx_cap(vcpu);
r = 0;
+ kvm_apic_set_version(vcpu);
out_free:
vfree(cpuid_entries);
@@ -1248,6 +1419,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
goto out;
vcpu->arch.cpuid_nent = cpuid->nent;
+ kvm_apic_set_version(vcpu);
return 0;
out:
@@ -1290,6 +1462,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
u32 index, int *nent, int maxnent)
{
unsigned f_nx = is_efer_nx() ? F(NX) : 0;
+ unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0;
#ifdef CONFIG_X86_64
unsigned f_lm = F(LM);
#else
@@ -1314,7 +1487,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
F(PAT) | F(PSE36) | 0 /* Reserved */ |
f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
- F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ |
+ F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ |
0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
/* cpuid 1.ecx */
const u32 kvm_supported_word4_x86_features =
@@ -1323,7 +1496,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
0 /* Reserved, DCA */ | F(XMM4_1) |
- F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) |
+ F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
0 /* Reserved, XSAVE, OSXSAVE */;
/* cpuid 0x80000001.ecx */
const u32 kvm_supported_word6_x86_features =
@@ -1344,6 +1517,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
case 1:
entry->edx &= kvm_supported_word0_x86_features;
entry->ecx &= kvm_supported_word4_x86_features;
+ /* we support x2apic emulation even if host does not support
+ * it since we emulate x2apic in software */
+ entry->ecx |= F(X2APIC);
break;
/* function 2 entries are STATEFUL. That is, repeated cpuid commands
* may return different values. This forces us to get_cpu() before
@@ -1416,6 +1592,8 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
if (cpuid->nent < 1)
goto out;
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ cpuid->nent = KVM_MAX_CPUID_ENTRIES;
r = -ENOMEM;
cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
if (!cpuid_entries)
@@ -1435,6 +1613,10 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
do_cpuid_ent(&cpuid_entries[nent], func, 0,
&nent, cpuid->nent);
+ r = -E2BIG;
+ if (nent >= cpuid->nent)
+ goto out_free;
+
r = -EFAULT;
if (copy_to_user(entries, cpuid_entries,
nent * sizeof(struct kvm_cpuid_entry2)))
@@ -1464,6 +1646,7 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
vcpu_load(vcpu);
memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
kvm_apic_post_state_restore(vcpu);
+ update_cr8_intercept(vcpu);
vcpu_put(vcpu);
return 0;
@@ -1503,6 +1686,80 @@ static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
return 0;
}
+static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
+ u64 mcg_cap)
+{
+ int r;
+ unsigned bank_num = mcg_cap & 0xff, bank;
+
+ r = -EINVAL;
+ if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
+ goto out;
+ if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
+ goto out;
+ r = 0;
+ vcpu->arch.mcg_cap = mcg_cap;
+ /* Init IA32_MCG_CTL to all 1s */
+ if (mcg_cap & MCG_CTL_P)
+ vcpu->arch.mcg_ctl = ~(u64)0;
+ /* Init IA32_MCi_CTL to all 1s */
+ for (bank = 0; bank < bank_num; bank++)
+ vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+out:
+ return r;
+}
+
+static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
+ struct kvm_x86_mce *mce)
+{
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+ u64 *banks = vcpu->arch.mce_banks;
+
+ if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
+ return -EINVAL;
+ /*
+ * if IA32_MCG_CTL is not all 1s, the uncorrected error
+ * reporting is disabled
+ */
+ if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
+ vcpu->arch.mcg_ctl != ~(u64)0)
+ return 0;
+ banks += 4 * mce->bank;
+ /*
+ * if IA32_MCi_CTL is not all 1s, the uncorrected error
+ * reporting is disabled for the bank
+ */
+ if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
+ return 0;
+ if (mce->status & MCI_STATUS_UC) {
+ if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
+ !(vcpu->arch.cr4 & X86_CR4_MCE)) {
+ printk(KERN_DEBUG "kvm: set_mce: "
+ "injects mce exception while "
+ "previous one is in progress!\n");
+ set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ return 0;
+ }
+ if (banks[1] & MCI_STATUS_VAL)
+ mce->status |= MCI_STATUS_OVER;
+ banks[2] = mce->addr;
+ banks[3] = mce->misc;
+ vcpu->arch.mcg_status = mce->mcg_status;
+ banks[1] = mce->status;
+ kvm_queue_exception(vcpu, MC_VECTOR);
+ } else if (!(banks[1] & MCI_STATUS_VAL)
+ || !(banks[1] & MCI_STATUS_UC)) {
+ if (banks[1] & MCI_STATUS_VAL)
+ mce->status |= MCI_STATUS_OVER;
+ banks[2] = mce->addr;
+ banks[3] = mce->misc;
+ banks[1] = mce->status;
+ } else
+ banks[1] |= MCI_STATUS_OVER;
+ return 0;
+}
+
long kvm_arch_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -1636,6 +1893,24 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
break;
}
+ case KVM_X86_SETUP_MCE: {
+ u64 mcg_cap;
+
+ r = -EFAULT;
+ if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
+ goto out;
+ r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
+ break;
+ }
+ case KVM_X86_SET_MCE: {
+ struct kvm_x86_mce mce;
+
+ r = -EFAULT;
+ if (copy_from_user(&mce, argp, sizeof mce))
+ goto out;
+ r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
+ break;
+ }
default:
r = -EINVAL;
}
@@ -1654,6 +1929,13 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
return ret;
}
+static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
+ u64 ident_addr)
+{
+ kvm->arch.ept_identity_map_addr = ident_addr;
+ return 0;
+}
+
static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
u32 kvm_nr_mmu_pages)
{
@@ -1775,19 +2057,25 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
r = 0;
switch (chip->chip_id) {
case KVM_IRQCHIP_PIC_MASTER:
+ spin_lock(&pic_irqchip(kvm)->lock);
memcpy(&pic_irqchip(kvm)->pics[0],
&chip->chip.pic,
sizeof(struct kvm_pic_state));
+ spin_unlock(&pic_irqchip(kvm)->lock);
break;
case KVM_IRQCHIP_PIC_SLAVE:
+ spin_lock(&pic_irqchip(kvm)->lock);
memcpy(&pic_irqchip(kvm)->pics[1],
&chip->chip.pic,
sizeof(struct kvm_pic_state));
+ spin_unlock(&pic_irqchip(kvm)->lock);
break;
case KVM_IRQCHIP_IOAPIC:
+ mutex_lock(&kvm->irq_lock);
memcpy(ioapic_irqchip(kvm),
&chip->chip.ioapic,
sizeof(struct kvm_ioapic_state));
+ mutex_unlock(&kvm->irq_lock);
break;
default:
r = -EINVAL;
@@ -1801,7 +2089,9 @@ static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
int r = 0;
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
return r;
}
@@ -1809,8 +2099,39 @@ static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
int r = 0;
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
- kvm_pit_load_count(kvm, 0, ps->channels[0].count);
+ kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ return r;
+}
+
+static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+ int r = 0;
+
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
+ memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
+ sizeof(ps->channels));
+ ps->flags = kvm->arch.vpit->pit_state.flags;
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ return r;
+}
+
+static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+ int r = 0, start = 0;
+ u32 prev_legacy, cur_legacy;
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
+ prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ if (!prev_legacy && cur_legacy)
+ start = 1;
+ memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
+ sizeof(kvm->arch.vpit->pit_state.channels));
+ kvm->arch.vpit->pit_state.flags = ps->flags;
+ kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
return r;
}
@@ -1819,7 +2140,9 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
{
if (!kvm->arch.vpit)
return -ENXIO;
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
return 0;
}
@@ -1845,7 +2168,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
spin_lock(&kvm->mmu_lock);
kvm_mmu_slot_remove_write_access(kvm, log->slot);
spin_unlock(&kvm->mmu_lock);
- kvm_flush_remote_tlbs(kvm);
memslot = &kvm->memslots[log->slot];
n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
memset(memslot->dirty_bitmap, 0, n);
@@ -1869,7 +2191,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
*/
union {
struct kvm_pit_state ps;
+ struct kvm_pit_state2 ps2;
struct kvm_memory_alias alias;
+ struct kvm_pit_config pit_config;
} u;
switch (ioctl) {
@@ -1878,6 +2202,17 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (r < 0)
goto out;
break;
+ case KVM_SET_IDENTITY_MAP_ADDR: {
+ u64 ident_addr;
+
+ r = -EFAULT;
+ if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
+ goto out;
+ r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
+ if (r < 0)
+ goto out;
+ break;
+ }
case KVM_SET_MEMORY_REGION: {
struct kvm_memory_region kvm_mem;
struct kvm_userspace_memory_region kvm_userspace_mem;
@@ -1930,16 +2265,24 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
break;
case KVM_CREATE_PIT:
- mutex_lock(&kvm->lock);
+ u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
+ goto create_pit;
+ case KVM_CREATE_PIT2:
+ r = -EFAULT;
+ if (copy_from_user(&u.pit_config, argp,
+ sizeof(struct kvm_pit_config)))
+ goto out;
+ create_pit:
+ down_write(&kvm->slots_lock);
r = -EEXIST;
if (kvm->arch.vpit)
goto create_pit_unlock;
r = -ENOMEM;
- kvm->arch.vpit = kvm_create_pit(kvm);
+ kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
if (kvm->arch.vpit)
r = 0;
create_pit_unlock:
- mutex_unlock(&kvm->lock);
+ up_write(&kvm->slots_lock);
break;
case KVM_IRQ_LINE_STATUS:
case KVM_IRQ_LINE: {
@@ -1950,10 +2293,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
goto out;
if (irqchip_in_kernel(kvm)) {
__s32 status;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->irq_lock);
status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
irq_event.irq, irq_event.level);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->irq_lock);
if (ioctl == KVM_IRQ_LINE_STATUS) {
irq_event.status = status;
if (copy_to_user(argp, &irq_event,
@@ -2042,6 +2385,32 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = 0;
break;
}
+ case KVM_GET_PIT2: {
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto out;
+ r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_PIT2: {
+ r = -EFAULT;
+ if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
+ goto out;
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto out;
+ r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
case KVM_REINJECT_CONTROL: {
struct kvm_reinject_control control;
r = -EFAULT;
@@ -2075,35 +2444,23 @@ static void kvm_init_msr_list(void)
num_msrs_to_save = j;
}
-/*
- * Only apic need an MMIO device hook, so shortcut now..
- */
-static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
- gpa_t addr, int len,
- int is_write)
+static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
+ const void *v)
{
- struct kvm_io_device *dev;
+ if (vcpu->arch.apic &&
+ !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
+ return 0;
- if (vcpu->arch.apic) {
- dev = &vcpu->arch.apic->dev;
- if (dev->in_range(dev, addr, len, is_write))
- return dev;
- }
- return NULL;
+ return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v);
}
-
-static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
- gpa_t addr, int len,
- int is_write)
+static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
{
- struct kvm_io_device *dev;
+ if (vcpu->arch.apic &&
+ !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
+ return 0;
- dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write);
- if (dev == NULL)
- dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len,
- is_write);
- return dev;
+ return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v);
}
static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
@@ -2172,11 +2529,12 @@ static int emulator_read_emulated(unsigned long addr,
unsigned int bytes,
struct kvm_vcpu *vcpu)
{
- struct kvm_io_device *mmio_dev;
gpa_t gpa;
if (vcpu->mmio_read_completed) {
memcpy(val, vcpu->mmio_data, bytes);
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
+ vcpu->mmio_phys_addr, *(u64 *)val);
vcpu->mmio_read_completed = 0;
return X86EMUL_CONTINUE;
}
@@ -2197,14 +2555,12 @@ mmio:
/*
* Is this MMIO handled locally?
*/
- mutex_lock(&vcpu->kvm->lock);
- mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);
- if (mmio_dev) {
- kvm_iodevice_read(mmio_dev, gpa, bytes, val);
- mutex_unlock(&vcpu->kvm->lock);
+ if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) {
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val);
return X86EMUL_CONTINUE;
}
- mutex_unlock(&vcpu->kvm->lock);
+
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
vcpu->mmio_needed = 1;
vcpu->mmio_phys_addr = gpa;
@@ -2231,7 +2587,6 @@ static int emulator_write_emulated_onepage(unsigned long addr,
unsigned int bytes,
struct kvm_vcpu *vcpu)
{
- struct kvm_io_device *mmio_dev;
gpa_t gpa;
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
@@ -2249,17 +2604,12 @@ static int emulator_write_emulated_onepage(unsigned long addr,
return X86EMUL_CONTINUE;
mmio:
+ trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
/*
* Is this MMIO handled locally?
*/
- mutex_lock(&vcpu->kvm->lock);
- mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
- if (mmio_dev) {
- kvm_iodevice_write(mmio_dev, gpa, bytes, val);
- mutex_unlock(&vcpu->kvm->lock);
+ if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
return X86EMUL_CONTINUE;
- }
- mutex_unlock(&vcpu->kvm->lock);
vcpu->mmio_needed = 1;
vcpu->mmio_phys_addr = gpa;
@@ -2343,7 +2693,6 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
int emulate_clts(struct kvm_vcpu *vcpu)
{
- KVMTRACE_0D(CLTS, vcpu, handler);
kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
return X86EMUL_CONTINUE;
}
@@ -2420,7 +2769,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
kvm_clear_exception_queue(vcpu);
vcpu->arch.mmio_fault_cr2 = cr2;
/*
- * TODO: fix x86_emulate.c to use guest_read/write_register
+ * TODO: fix emulate.c to use guest_read/write_register
* instead of direct ->regs accesses, can save hundred cycles
* on Intel for instructions that don't read/change RSP, for
* for example.
@@ -2444,14 +2793,33 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
- /* Reject the instructions other than VMCALL/VMMCALL when
- * try to emulate invalid opcode */
+ /* Only allow emulation of specific instructions on #UD
+ * (namely VMMCALL, sysenter, sysexit, syscall)*/
c = &vcpu->arch.emulate_ctxt.decode;
- if ((emulation_type & EMULTYPE_TRAP_UD) &&
- (!(c->twobyte && c->b == 0x01 &&
- (c->modrm_reg == 0 || c->modrm_reg == 3) &&
- c->modrm_mod == 3 && c->modrm_rm == 1)))
- return EMULATE_FAIL;
+ if (emulation_type & EMULTYPE_TRAP_UD) {
+ if (!c->twobyte)
+ return EMULATE_FAIL;
+ switch (c->b) {
+ case 0x01: /* VMMCALL */
+ if (c->modrm_mod != 3 || c->modrm_rm != 1)
+ return EMULATE_FAIL;
+ break;
+ case 0x34: /* sysenter */
+ case 0x35: /* sysexit */
+ if (c->modrm_mod != 0 || c->modrm_rm != 0)
+ return EMULATE_FAIL;
+ break;
+ case 0x05: /* syscall */
+ if (c->modrm_mod != 0 || c->modrm_rm != 0)
+ return EMULATE_FAIL;
+ break;
+ default:
+ return EMULATE_FAIL;
+ }
+
+ if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
+ return EMULATE_FAIL;
+ }
++vcpu->stat.insn_emulation;
if (r) {
@@ -2571,52 +2939,40 @@ int complete_pio(struct kvm_vcpu *vcpu)
return 0;
}
-static void kernel_pio(struct kvm_io_device *pio_dev,
- struct kvm_vcpu *vcpu,
- void *pd)
+static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
{
/* TODO: String I/O for in kernel device */
+ int r;
- mutex_lock(&vcpu->kvm->lock);
if (vcpu->arch.pio.in)
- kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
- vcpu->arch.pio.size,
- pd);
+ r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
+ vcpu->arch.pio.size, pd);
else
- kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
- vcpu->arch.pio.size,
- pd);
- mutex_unlock(&vcpu->kvm->lock);
+ r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
+ vcpu->arch.pio.size, pd);
+ return r;
}
-static void pio_string_write(struct kvm_io_device *pio_dev,
- struct kvm_vcpu *vcpu)
+static int pio_string_write(struct kvm_vcpu *vcpu)
{
struct kvm_pio_request *io = &vcpu->arch.pio;
void *pd = vcpu->arch.pio_data;
- int i;
+ int i, r = 0;
- mutex_lock(&vcpu->kvm->lock);
for (i = 0; i < io->cur_count; i++) {
- kvm_iodevice_write(pio_dev, io->port,
- io->size,
- pd);
+ if (kvm_io_bus_write(&vcpu->kvm->pio_bus,
+ io->port, io->size, pd)) {
+ r = -EOPNOTSUPP;
+ break;
+ }
pd += io->size;
}
- mutex_unlock(&vcpu->kvm->lock);
-}
-
-static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
- gpa_t addr, int len,
- int is_write)
-{
- return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write);
+ return r;
}
int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
int size, unsigned port)
{
- struct kvm_io_device *pio_dev;
unsigned long val;
vcpu->run->exit_reason = KVM_EXIT_IO;
@@ -2630,19 +2986,13 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
vcpu->arch.pio.down = 0;
vcpu->arch.pio.rep = 0;
- if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
- KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
- handler);
- else
- KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
- handler);
+ trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
+ size, 1);
val = kvm_register_read(vcpu, VCPU_REGS_RAX);
memcpy(vcpu->arch.pio_data, &val, 4);
- pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
- if (pio_dev) {
- kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
+ if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
complete_pio(vcpu);
return 1;
}
@@ -2656,7 +3006,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
{
unsigned now, in_page;
int ret = 0;
- struct kvm_io_device *pio_dev;
vcpu->run->exit_reason = KVM_EXIT_IO;
vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
@@ -2669,12 +3018,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
vcpu->arch.pio.down = down;
vcpu->arch.pio.rep = rep;
- if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
- KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
- handler);
- else
- KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
- handler);
+ trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
+ size, count);
if (!count) {
kvm_x86_ops->skip_emulated_instruction(vcpu);
@@ -2704,9 +3049,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
vcpu->arch.pio.guest_gva = address;
- pio_dev = vcpu_find_pio_dev(vcpu, port,
- vcpu->arch.pio.cur_count,
- !vcpu->arch.pio.in);
if (!vcpu->arch.pio.in) {
/* string PIO write */
ret = pio_copy_data(vcpu);
@@ -2714,16 +3056,13 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
kvm_inject_gp(vcpu, 0);
return 1;
}
- if (ret == 0 && pio_dev) {
- pio_string_write(pio_dev, vcpu);
+ if (ret == 0 && !pio_string_write(vcpu)) {
complete_pio(vcpu);
if (vcpu->arch.pio.count == 0)
ret = 1;
}
- } else if (pio_dev)
- pr_unimpl(vcpu, "no string pio read support yet, "
- "port %x size %d count %ld\n",
- port, size, count);
+ }
+ /* no string PIO read support yet */
return ret;
}
@@ -2756,10 +3095,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- vcpu = kvm->vcpus[i];
- if (!vcpu)
- continue;
+ kvm_for_each_vcpu(i, vcpu, kvm) {
if (vcpu->cpu != freq->cpu)
continue;
if (!kvm_request_guest_time_update(vcpu))
@@ -2852,7 +3188,6 @@ void kvm_arch_exit(void)
int kvm_emulate_halt(struct kvm_vcpu *vcpu)
{
++vcpu->stat.halt_exits;
- KVMTRACE_0D(HLT, vcpu, handler);
if (irqchip_in_kernel(vcpu->kvm)) {
vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
return 1;
@@ -2883,7 +3218,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
- KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
+ trace_kvm_hypercall(nr, a0, a1, a2, a3);
if (!is_long_mode(vcpu)) {
nr &= 0xFFFFFFFF;
@@ -2893,6 +3228,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
a3 &= 0xFFFFFFFF;
}
+ if (kvm_x86_ops->get_cpl(vcpu) != 0) {
+ ret = -KVM_EPERM;
+ goto out;
+ }
+
switch (nr) {
case KVM_HC_VAPIC_POLL_IRQ:
ret = 0;
@@ -2904,6 +3244,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
ret = -KVM_ENOSYS;
break;
}
+out:
kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
++vcpu->stat.hypercalls;
return r;
@@ -2983,8 +3324,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
return 0;
}
- KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value,
- (u32)((u64)value >> 32), handler);
return value;
}
@@ -2992,9 +3331,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
unsigned long *rflags)
{
- KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val,
- (u32)((u64)val >> 32), handler);
-
switch (cr) {
case 0:
kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
@@ -3104,11 +3440,11 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
}
kvm_x86_ops->skip_emulated_instruction(vcpu);
- KVMTRACE_5D(CPUID, vcpu, function,
- (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
- (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
- (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
- (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
+ trace_kvm_cpuid(function,
+ kvm_register_read(vcpu, VCPU_REGS_RAX),
+ kvm_register_read(vcpu, VCPU_REGS_RBX),
+ kvm_register_read(vcpu, VCPU_REGS_RCX),
+ kvm_register_read(vcpu, VCPU_REGS_RDX));
}
EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
@@ -3174,6 +3510,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
if (!kvm_x86_ops->update_cr8_intercept)
return;
+ if (!vcpu->arch.apic)
+ return;
+
if (!vcpu->arch.apic->vapic_addr)
max_irr = kvm_lapic_find_highest_irr(vcpu);
else
@@ -3187,12 +3526,16 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
}
-static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- kvm_x86_ops->set_interrupt_shadow(vcpu, 0);
-
/* try to reinject previous events if any */
+ if (vcpu->arch.exception.pending) {
+ kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
+ vcpu->arch.exception.has_error_code,
+ vcpu->arch.exception.error_code);
+ return;
+ }
+
if (vcpu->arch.nmi_injected) {
kvm_x86_ops->set_nmi(vcpu);
return;
@@ -3266,16 +3609,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
smp_mb__after_clear_bit();
if (vcpu->requests || need_resched() || signal_pending(current)) {
+ set_bit(KVM_REQ_KICK, &vcpu->requests);
local_irq_enable();
preempt_enable();
r = 1;
goto out;
}
- if (vcpu->arch.exception.pending)
- __queue_exception(vcpu);
- else
- inject_pending_irq(vcpu, kvm_run);
+ inject_pending_event(vcpu, kvm_run);
/* enable NMI/IRQ window open exits if needed */
if (vcpu->arch.nmi_pending)
@@ -3292,14 +3633,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
kvm_guest_enter();
- get_debugreg(vcpu->arch.host_dr6, 6);
- get_debugreg(vcpu->arch.host_dr7, 7);
if (unlikely(vcpu->arch.switch_db_regs)) {
- get_debugreg(vcpu->arch.host_db[0], 0);
- get_debugreg(vcpu->arch.host_db[1], 1);
- get_debugreg(vcpu->arch.host_db[2], 2);
- get_debugreg(vcpu->arch.host_db[3], 3);
-
set_debugreg(0, 7);
set_debugreg(vcpu->arch.eff_db[0], 0);
set_debugreg(vcpu->arch.eff_db[1], 1);
@@ -3307,18 +3641,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
set_debugreg(vcpu->arch.eff_db[3], 3);
}
- KVMTRACE_0D(VMENTRY, vcpu, entryexit);
+ trace_kvm_entry(vcpu->vcpu_id);
kvm_x86_ops->run(vcpu, kvm_run);
- if (unlikely(vcpu->arch.switch_db_regs)) {
- set_debugreg(0, 7);
- set_debugreg(vcpu->arch.host_db[0], 0);
- set_debugreg(vcpu->arch.host_db[1], 1);
- set_debugreg(vcpu->arch.host_db[2], 2);
- set_debugreg(vcpu->arch.host_db[3], 3);
- }
- set_debugreg(vcpu->arch.host_dr6, 6);
- set_debugreg(vcpu->arch.host_dr7, 7);
+ /*
+ * If the guest has used debug registers, at least dr7
+ * will be disabled while returning to the host.
+ * If we don't have active breakpoints in the host, we don't
+ * care about the messed up debug address registers. But if
+ * we have some of them active, restore the old state.
+ */
+ if (hw_breakpoint_active())
+ hw_breakpoint_restore();
set_bit(KVM_REQ_KICK, &vcpu->requests);
local_irq_enable();
@@ -3648,11 +3982,8 @@ static void kvm_set_segment(struct kvm_vcpu *vcpu,
static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
struct kvm_segment *kvm_desct)
{
- kvm_desct->base = seg_desc->base0;
- kvm_desct->base |= seg_desc->base1 << 16;
- kvm_desct->base |= seg_desc->base2 << 24;
- kvm_desct->limit = seg_desc->limit0;
- kvm_desct->limit |= seg_desc->limit << 16;
+ kvm_desct->base = get_desc_base(seg_desc);
+ kvm_desct->limit = get_desc_limit(seg_desc);
if (seg_desc->g) {
kvm_desct->limit <<= 12;
kvm_desct->limit |= 0xfff;
@@ -3696,7 +4027,6 @@ static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
struct desc_struct *seg_desc)
{
- gpa_t gpa;
struct descriptor_table dtable;
u16 index = selector >> 3;
@@ -3706,16 +4036,13 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
return 1;
}
- gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
- gpa += index * 8;
- return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8);
+ return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
}
/* allowed just for 8 bytes segments */
static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
struct desc_struct *seg_desc)
{
- gpa_t gpa;
struct descriptor_table dtable;
u16 index = selector >> 3;
@@ -3723,19 +4050,13 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
if (dtable.limit < index * 8 + 7)
return 1;
- gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
- gpa += index * 8;
- return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8);
+ return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
}
-static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
+static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu,
struct desc_struct *seg_desc)
{
- u32 base_addr;
-
- base_addr = seg_desc->base0;
- base_addr |= (seg_desc->base1 << 16);
- base_addr |= (seg_desc->base2 << 24);
+ u32 base_addr = get_desc_base(seg_desc);
return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
}
@@ -3780,12 +4101,19 @@ static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int se
return 0;
}
+static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
+{
+ return (seg != VCPU_SREG_LDTR) &&
+ (seg != VCPU_SREG_TR) &&
+ (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_VM);
+}
+
int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
int type_bits, int seg)
{
struct kvm_segment kvm_seg;
- if (!(vcpu->arch.cr0 & X86_CR0_PE))
+ if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE))
return kvm_load_realmode_segment(vcpu, selector, seg);
if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
return 1;
@@ -4024,7 +4352,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
}
}
- if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) {
+ if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) {
kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
return 1;
}
@@ -4094,13 +4422,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
vcpu->arch.cr2 = sregs->cr2;
mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
-
- down_read(&vcpu->kvm->slots_lock);
- if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT))
- vcpu->arch.cr3 = sregs->cr3;
- else
- set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
- up_read(&vcpu->kvm->slots_lock);
+ vcpu->arch.cr3 = sregs->cr3;
kvm_set_cr8(vcpu, sregs->cr8);
@@ -4142,8 +4464,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+ update_cr8_intercept(vcpu);
+
/* Older userspace won't unhalt the vcpu on reset. */
- if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
+ if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
!(vcpu->arch.cr0 & X86_CR0_PE))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -4414,7 +4738,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
kvm = vcpu->kvm;
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
- if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
+ if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
@@ -4436,6 +4760,14 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
goto fail_mmu_destroy;
}
+ vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
+ GFP_KERNEL);
+ if (!vcpu->arch.mce_banks) {
+ r = -ENOMEM;
+ goto fail_mmu_destroy;
+ }
+ vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
+
return 0;
fail_mmu_destroy:
@@ -4483,20 +4815,22 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
static void kvm_free_vcpus(struct kvm *kvm)
{
unsigned int i;
+ struct kvm_vcpu *vcpu;
/*
* Unpin any mmu pages first.
*/
- for (i = 0; i < KVM_MAX_VCPUS; ++i)
- if (kvm->vcpus[i])
- kvm_unload_vcpu_mmu(kvm->vcpus[i]);
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- if (kvm->vcpus[i]) {
- kvm_arch_vcpu_free(kvm->vcpus[i]);
- kvm->vcpus[i] = NULL;
- }
- }
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_unload_vcpu_mmu(vcpu);
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_arch_vcpu_free(vcpu);
+
+ mutex_lock(&kvm->lock);
+ for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
+ kvm->vcpus[i] = NULL;
+ atomic_set(&kvm->online_vcpus, 0);
+ mutex_unlock(&kvm->lock);
}
void kvm_arch_sync_events(struct kvm *kvm)
@@ -4573,7 +4907,6 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
kvm_mmu_slot_remove_write_access(kvm, mem->slot);
spin_unlock(&kvm->mmu_lock);
- kvm_flush_remote_tlbs(kvm);
return 0;
}
@@ -4587,8 +4920,10 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
- || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
- || vcpu->arch.nmi_pending;
+ || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
+ || vcpu->arch.nmi_pending ||
+ (kvm_arch_interrupt_allowed(vcpu) &&
+ kvm_cpu_has_interrupt(vcpu));
}
void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
@@ -4612,3 +4947,9 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
{
return kvm_x86_ops->interrupt_allowed(vcpu);
}
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 4c8e10af78e..5eadea585d2 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -31,4 +31,8 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
{
return (nr == BP_VECTOR) || (nr == OF_VECTOR);
}
+
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+ u32 function, u32 index);
+
#endif
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index d677fa9ca65..7e59dc1d3fc 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1135,11 +1135,6 @@ static struct notifier_block paniced = {
/* Setting up memory is fairly easy. */
static __init char *lguest_memory_setup(void)
{
- /* We do this here and not earlier because lockcheck used to barf if we
- * did it before start_kernel(). I think we fixed that, so it'd be
- * nice to move it back to lguest_init. Patch welcome... */
- atomic_notifier_chain_register(&panic_notifier_list, &paniced);
-
/*
*The Linux bootloader header contains an "e820" memory map: the
* Launcher populated the first entry with our memory limit.
@@ -1262,7 +1257,6 @@ __init void lguest_init(void)
*/
/* Interrupt-related operations */
- pv_irq_ops.init_IRQ = lguest_init_IRQ;
pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
@@ -1270,7 +1264,6 @@ __init void lguest_init(void)
pv_irq_ops.safe_halt = lguest_safe_halt;
/* Setup operations */
- pv_init_ops.memory_setup = lguest_memory_setup;
pv_init_ops.patch = lguest_patch;
/* Intercepts of various CPU instructions */
@@ -1320,10 +1313,11 @@ __init void lguest_init(void)
set_lguest_basic_apic_ops();
#endif
- /* Time operations */
- pv_time_ops.get_wallclock = lguest_get_wallclock;
- pv_time_ops.time_init = lguest_time_init;
- pv_time_ops.get_tsc_khz = lguest_tsc_khz;
+ x86_init.resources.memory_setup = lguest_memory_setup;
+ x86_init.irqs.intr_init = lguest_init_IRQ;
+ x86_init.timers.timer_init = lguest_time_init;
+ x86_platform.calibrate_tsc = lguest_tsc_khz;
+ x86_platform.get_wallclock = lguest_get_wallclock;
/*
* Now is a good time to look at the implementations of these functions
@@ -1365,10 +1359,13 @@ __init void lguest_init(void)
/*
* If we don't initialize the lock dependency checker now, it crashes
- * paravirt_disable_iospace.
+ * atomic_notifier_chain_register, then paravirt_disable_iospace.
*/
lockdep_init();
+ /* Hook in our special panic hypercall code. */
+ atomic_notifier_chain_register(&panic_notifier_list, &paniced);
+
/*
* The IDE code spends about 3 seconds probing for disks: if we reserve
* all the I/O ports up front it can't get them and so doesn't probe.
diff --git a/arch/x86/lib/.gitignore b/arch/x86/lib/.gitignore
new file mode 100644
index 00000000000..8df89f0a3fe
--- /dev/null
+++ b/arch/x86/lib/.gitignore
@@ -0,0 +1 @@
+inat-tables.c
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 9e609206fac..a2d6472895f 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -2,12 +2,25 @@
# Makefile for x86 specific library files.
#
+inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
+inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
+quiet_cmd_inat_tables = GEN $@
+ cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@
+
+$(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps)
+ $(call cmd,inat_tables)
+
+$(obj)/inat.o: $(obj)/inat-tables.c
+
+clean-files := inat-tables.c
+
obj-$(CONFIG_SMP) := msr.o
lib-y := delay.o
lib-y += thunk_$(BITS).o
lib-y += usercopy_$(BITS).o getuser.o putuser.o
lib-y += memcpy_$(BITS).o
+lib-y += insn.o inat.o
obj-y += msr-reg.o msr-reg-export.o
@@ -16,7 +29,9 @@ ifeq ($(CONFIG_X86_32),y)
lib-y += checksum_32.o
lib-y += strstr_32.o
lib-y += semaphore_32.o string_32.o
-
+ifneq ($(CONFIG_X86_CMPXCHG64),y)
+ lib-y += cmpxchg8b_emu.o
+endif
lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
else
obj-y += io_64.o iomap_copy_64.o
diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S
new file mode 100644
index 00000000000..828cb710dec
--- /dev/null
+++ b/arch/x86/lib/cmpxchg8b_emu.S
@@ -0,0 +1,57 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/frame.h>
+#include <asm/dwarf2.h>
+
+
+.text
+
+/*
+ * Inputs:
+ * %esi : memory location to compare
+ * %eax : low 32 bits of old value
+ * %edx : high 32 bits of old value
+ * %ebx : low 32 bits of new value
+ * %ecx : high 32 bits of new value
+ */
+ENTRY(cmpxchg8b_emu)
+CFI_STARTPROC
+
+#
+# Emulate 'cmpxchg8b (%esi)' on UP except we don't
+# set the whole ZF thing (caller will just compare
+# eax:edx with the expected value)
+#
+cmpxchg8b_emu:
+ pushfl
+ cli
+
+ cmpl (%esi), %eax
+ jne not_same
+ cmpl 4(%esi), %edx
+ jne half_same
+
+ movl %ebx, (%esi)
+ movl %ecx, 4(%esi)
+
+ popfl
+ ret
+
+ not_same:
+ movl (%esi), %eax
+ half_same:
+ movl 4(%esi), %edx
+
+ popfl
+ ret
+
+CFI_ENDPROC
+ENDPROC(cmpxchg8b_emu)
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 6ba0f7bb85e..cf889d4e076 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -65,7 +65,7 @@
.endm
/* Standard copy_to_user with segment limit checking */
-ENTRY(copy_to_user)
+ENTRY(_copy_to_user)
CFI_STARTPROC
GET_THREAD_INFO(%rax)
movq %rdi,%rcx
@@ -75,10 +75,10 @@ ENTRY(copy_to_user)
jae bad_to_user
ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
CFI_ENDPROC
-ENDPROC(copy_to_user)
+ENDPROC(_copy_to_user)
/* Standard copy_from_user with segment limit checking */
-ENTRY(copy_from_user)
+ENTRY(_copy_from_user)
CFI_STARTPROC
GET_THREAD_INFO(%rax)
movq %rsi,%rcx
@@ -88,7 +88,7 @@ ENTRY(copy_from_user)
jae bad_from_user
ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
CFI_ENDPROC
-ENDPROC(copy_from_user)
+ENDPROC(_copy_from_user)
ENTRY(copy_user_generic)
CFI_STARTPROC
@@ -96,12 +96,6 @@ ENTRY(copy_user_generic)
CFI_ENDPROC
ENDPROC(copy_user_generic)
-ENTRY(__copy_from_user_inatomic)
- CFI_STARTPROC
- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
- CFI_ENDPROC
-ENDPROC(__copy_from_user_inatomic)
-
.section .fixup,"ax"
/* must zero dest */
ENTRY(bad_from_user)
diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c
new file mode 100644
index 00000000000..46fc4ee09fc
--- /dev/null
+++ b/arch/x86/lib/inat.c
@@ -0,0 +1,90 @@
+/*
+ * x86 instruction attribute tables
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+#include <asm/insn.h>
+
+/* Attribute tables are generated from opcode map */
+#include "inat-tables.c"
+
+/* Attribute search APIs */
+insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode)
+{
+ return inat_primary_table[opcode];
+}
+
+insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, insn_byte_t last_pfx,
+ insn_attr_t esc_attr)
+{
+ const insn_attr_t *table;
+ insn_attr_t lpfx_attr;
+ int n, m = 0;
+
+ n = inat_escape_id(esc_attr);
+ if (last_pfx) {
+ lpfx_attr = inat_get_opcode_attribute(last_pfx);
+ m = inat_last_prefix_id(lpfx_attr);
+ }
+ table = inat_escape_tables[n][0];
+ if (!table)
+ return 0;
+ if (inat_has_variant(table[opcode]) && m) {
+ table = inat_escape_tables[n][m];
+ if (!table)
+ return 0;
+ }
+ return table[opcode];
+}
+
+insn_attr_t inat_get_group_attribute(insn_byte_t modrm, insn_byte_t last_pfx,
+ insn_attr_t grp_attr)
+{
+ const insn_attr_t *table;
+ insn_attr_t lpfx_attr;
+ int n, m = 0;
+
+ n = inat_group_id(grp_attr);
+ if (last_pfx) {
+ lpfx_attr = inat_get_opcode_attribute(last_pfx);
+ m = inat_last_prefix_id(lpfx_attr);
+ }
+ table = inat_group_tables[n][0];
+ if (!table)
+ return inat_group_common_attribute(grp_attr);
+ if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && m) {
+ table = inat_group_tables[n][m];
+ if (!table)
+ return inat_group_common_attribute(grp_attr);
+ }
+ return table[X86_MODRM_REG(modrm)] |
+ inat_group_common_attribute(grp_attr);
+}
+
+insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m,
+ insn_byte_t vex_p)
+{
+ const insn_attr_t *table;
+ if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX)
+ return 0;
+ table = inat_avx_tables[vex_m][vex_p];
+ if (!table)
+ return 0;
+ return table[opcode];
+}
+
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
new file mode 100644
index 00000000000..9f33b984d0e
--- /dev/null
+++ b/arch/x86/lib/insn.c
@@ -0,0 +1,516 @@
+/*
+ * x86 instruction analysis
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004, 2009
+ */
+
+#include <linux/string.h>
+#include <asm/inat.h>
+#include <asm/insn.h>
+
+#define get_next(t, insn) \
+ ({t r; r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; })
+
+#define peek_next(t, insn) \
+ ({t r; r = *(t*)insn->next_byte; r; })
+
+#define peek_nbyte_next(t, insn, n) \
+ ({t r; r = *(t*)((insn)->next_byte + n); r; })
+
+/**
+ * insn_init() - initialize struct insn
+ * @insn: &struct insn to be initialized
+ * @kaddr: address (in kernel memory) of instruction (or copy thereof)
+ * @x86_64: !0 for 64-bit kernel or 64-bit app
+ */
+void insn_init(struct insn *insn, const void *kaddr, int x86_64)
+{
+ memset(insn, 0, sizeof(*insn));
+ insn->kaddr = kaddr;
+ insn->next_byte = kaddr;
+ insn->x86_64 = x86_64 ? 1 : 0;
+ insn->opnd_bytes = 4;
+ if (x86_64)
+ insn->addr_bytes = 8;
+ else
+ insn->addr_bytes = 4;
+}
+
+/**
+ * insn_get_prefixes - scan x86 instruction prefix bytes
+ * @insn: &struct insn containing instruction
+ *
+ * Populates the @insn->prefixes bitmap, and updates @insn->next_byte
+ * to point to the (first) opcode. No effect if @insn->prefixes.got
+ * is already set.
+ */
+void insn_get_prefixes(struct insn *insn)
+{
+ struct insn_field *prefixes = &insn->prefixes;
+ insn_attr_t attr;
+ insn_byte_t b, lb;
+ int i, nb;
+
+ if (prefixes->got)
+ return;
+
+ nb = 0;
+ lb = 0;
+ b = peek_next(insn_byte_t, insn);
+ attr = inat_get_opcode_attribute(b);
+ while (inat_is_legacy_prefix(attr)) {
+ /* Skip if same prefix */
+ for (i = 0; i < nb; i++)
+ if (prefixes->bytes[i] == b)
+ goto found;
+ if (nb == 4)
+ /* Invalid instruction */
+ break;
+ prefixes->bytes[nb++] = b;
+ if (inat_is_address_size_prefix(attr)) {
+ /* address size switches 2/4 or 4/8 */
+ if (insn->x86_64)
+ insn->addr_bytes ^= 12;
+ else
+ insn->addr_bytes ^= 6;
+ } else if (inat_is_operand_size_prefix(attr)) {
+ /* oprand size switches 2/4 */
+ insn->opnd_bytes ^= 6;
+ }
+found:
+ prefixes->nbytes++;
+ insn->next_byte++;
+ lb = b;
+ b = peek_next(insn_byte_t, insn);
+ attr = inat_get_opcode_attribute(b);
+ }
+ /* Set the last prefix */
+ if (lb && lb != insn->prefixes.bytes[3]) {
+ if (unlikely(insn->prefixes.bytes[3])) {
+ /* Swap the last prefix */
+ b = insn->prefixes.bytes[3];
+ for (i = 0; i < nb; i++)
+ if (prefixes->bytes[i] == lb)
+ prefixes->bytes[i] = b;
+ }
+ insn->prefixes.bytes[3] = lb;
+ }
+
+ /* Decode REX prefix */
+ if (insn->x86_64) {
+ b = peek_next(insn_byte_t, insn);
+ attr = inat_get_opcode_attribute(b);
+ if (inat_is_rex_prefix(attr)) {
+ insn->rex_prefix.value = b;
+ insn->rex_prefix.nbytes = 1;
+ insn->next_byte++;
+ if (X86_REX_W(b))
+ /* REX.W overrides opnd_size */
+ insn->opnd_bytes = 8;
+ }
+ }
+ insn->rex_prefix.got = 1;
+
+ /* Decode VEX prefix */
+ b = peek_next(insn_byte_t, insn);
+ attr = inat_get_opcode_attribute(b);
+ if (inat_is_vex_prefix(attr)) {
+ insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1);
+ if (!insn->x86_64) {
+ /*
+ * In 32-bits mode, if the [7:6] bits (mod bits of
+ * ModRM) on the second byte are not 11b, it is
+ * LDS or LES.
+ */
+ if (X86_MODRM_MOD(b2) != 3)
+ goto vex_end;
+ }
+ insn->vex_prefix.bytes[0] = b;
+ insn->vex_prefix.bytes[1] = b2;
+ if (inat_is_vex3_prefix(attr)) {
+ b2 = peek_nbyte_next(insn_byte_t, insn, 2);
+ insn->vex_prefix.bytes[2] = b2;
+ insn->vex_prefix.nbytes = 3;
+ insn->next_byte += 3;
+ if (insn->x86_64 && X86_VEX_W(b2))
+ /* VEX.W overrides opnd_size */
+ insn->opnd_bytes = 8;
+ } else {
+ insn->vex_prefix.nbytes = 2;
+ insn->next_byte += 2;
+ }
+ }
+vex_end:
+ insn->vex_prefix.got = 1;
+
+ prefixes->got = 1;
+ return;
+}
+
+/**
+ * insn_get_opcode - collect opcode(s)
+ * @insn: &struct insn containing instruction
+ *
+ * Populates @insn->opcode, updates @insn->next_byte to point past the
+ * opcode byte(s), and set @insn->attr (except for groups).
+ * If necessary, first collects any preceding (prefix) bytes.
+ * Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got
+ * is already 1.
+ */
+void insn_get_opcode(struct insn *insn)
+{
+ struct insn_field *opcode = &insn->opcode;
+ insn_byte_t op, pfx;
+ if (opcode->got)
+ return;
+ if (!insn->prefixes.got)
+ insn_get_prefixes(insn);
+
+ /* Get first opcode */
+ op = get_next(insn_byte_t, insn);
+ opcode->bytes[0] = op;
+ opcode->nbytes = 1;
+
+ /* Check if there is VEX prefix or not */
+ if (insn_is_avx(insn)) {
+ insn_byte_t m, p;
+ m = insn_vex_m_bits(insn);
+ p = insn_vex_p_bits(insn);
+ insn->attr = inat_get_avx_attribute(op, m, p);
+ if (!inat_accept_vex(insn->attr))
+ insn->attr = 0; /* This instruction is bad */
+ goto end; /* VEX has only 1 byte for opcode */
+ }
+
+ insn->attr = inat_get_opcode_attribute(op);
+ while (inat_is_escape(insn->attr)) {
+ /* Get escaped opcode */
+ op = get_next(insn_byte_t, insn);
+ opcode->bytes[opcode->nbytes++] = op;
+ pfx = insn_last_prefix(insn);
+ insn->attr = inat_get_escape_attribute(op, pfx, insn->attr);
+ }
+ if (inat_must_vex(insn->attr))
+ insn->attr = 0; /* This instruction is bad */
+end:
+ opcode->got = 1;
+}
+
+/**
+ * insn_get_modrm - collect ModRM byte, if any
+ * @insn: &struct insn containing instruction
+ *
+ * Populates @insn->modrm and updates @insn->next_byte to point past the
+ * ModRM byte, if any. If necessary, first collects the preceding bytes
+ * (prefixes and opcode(s)). No effect if @insn->modrm.got is already 1.
+ */
+void insn_get_modrm(struct insn *insn)
+{
+ struct insn_field *modrm = &insn->modrm;
+ insn_byte_t pfx, mod;
+ if (modrm->got)
+ return;
+ if (!insn->opcode.got)
+ insn_get_opcode(insn);
+
+ if (inat_has_modrm(insn->attr)) {
+ mod = get_next(insn_byte_t, insn);
+ modrm->value = mod;
+ modrm->nbytes = 1;
+ if (inat_is_group(insn->attr)) {
+ pfx = insn_last_prefix(insn);
+ insn->attr = inat_get_group_attribute(mod, pfx,
+ insn->attr);
+ }
+ }
+
+ if (insn->x86_64 && inat_is_force64(insn->attr))
+ insn->opnd_bytes = 8;
+ modrm->got = 1;
+}
+
+
+/**
+ * insn_rip_relative() - Does instruction use RIP-relative addressing mode?
+ * @insn: &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * ModRM byte. No effect if @insn->x86_64 is 0.
+ */
+int insn_rip_relative(struct insn *insn)
+{
+ struct insn_field *modrm = &insn->modrm;
+
+ if (!insn->x86_64)
+ return 0;
+ if (!modrm->got)
+ insn_get_modrm(insn);
+ /*
+ * For rip-relative instructions, the mod field (top 2 bits)
+ * is zero and the r/m field (bottom 3 bits) is 0x5.
+ */
+ return (modrm->nbytes && (modrm->value & 0xc7) == 0x5);
+}
+
+/**
+ * insn_get_sib() - Get the SIB byte of instruction
+ * @insn: &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * ModRM byte.
+ */
+void insn_get_sib(struct insn *insn)
+{
+ insn_byte_t modrm;
+
+ if (insn->sib.got)
+ return;
+ if (!insn->modrm.got)
+ insn_get_modrm(insn);
+ if (insn->modrm.nbytes) {
+ modrm = (insn_byte_t)insn->modrm.value;
+ if (insn->addr_bytes != 2 &&
+ X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) {
+ insn->sib.value = get_next(insn_byte_t, insn);
+ insn->sib.nbytes = 1;
+ }
+ }
+ insn->sib.got = 1;
+}
+
+
+/**
+ * insn_get_displacement() - Get the displacement of instruction
+ * @insn: &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * SIB byte.
+ * Displacement value is sign-expanded.
+ */
+void insn_get_displacement(struct insn *insn)
+{
+ insn_byte_t mod, rm, base;
+
+ if (insn->displacement.got)
+ return;
+ if (!insn->sib.got)
+ insn_get_sib(insn);
+ if (insn->modrm.nbytes) {
+ /*
+ * Interpreting the modrm byte:
+ * mod = 00 - no displacement fields (exceptions below)
+ * mod = 01 - 1-byte displacement field
+ * mod = 10 - displacement field is 4 bytes, or 2 bytes if
+ * address size = 2 (0x67 prefix in 32-bit mode)
+ * mod = 11 - no memory operand
+ *
+ * If address size = 2...
+ * mod = 00, r/m = 110 - displacement field is 2 bytes
+ *
+ * If address size != 2...
+ * mod != 11, r/m = 100 - SIB byte exists
+ * mod = 00, SIB base = 101 - displacement field is 4 bytes
+ * mod = 00, r/m = 101 - rip-relative addressing, displacement
+ * field is 4 bytes
+ */
+ mod = X86_MODRM_MOD(insn->modrm.value);
+ rm = X86_MODRM_RM(insn->modrm.value);
+ base = X86_SIB_BASE(insn->sib.value);
+ if (mod == 3)
+ goto out;
+ if (mod == 1) {
+ insn->displacement.value = get_next(char, insn);
+ insn->displacement.nbytes = 1;
+ } else if (insn->addr_bytes == 2) {
+ if ((mod == 0 && rm == 6) || mod == 2) {
+ insn->displacement.value =
+ get_next(short, insn);
+ insn->displacement.nbytes = 2;
+ }
+ } else {
+ if ((mod == 0 && rm == 5) || mod == 2 ||
+ (mod == 0 && base == 5)) {
+ insn->displacement.value = get_next(int, insn);
+ insn->displacement.nbytes = 4;
+ }
+ }
+ }
+out:
+ insn->displacement.got = 1;
+}
+
+/* Decode moffset16/32/64 */
+static void __get_moffset(struct insn *insn)
+{
+ switch (insn->addr_bytes) {
+ case 2:
+ insn->moffset1.value = get_next(short, insn);
+ insn->moffset1.nbytes = 2;
+ break;
+ case 4:
+ insn->moffset1.value = get_next(int, insn);
+ insn->moffset1.nbytes = 4;
+ break;
+ case 8:
+ insn->moffset1.value = get_next(int, insn);
+ insn->moffset1.nbytes = 4;
+ insn->moffset2.value = get_next(int, insn);
+ insn->moffset2.nbytes = 4;
+ break;
+ }
+ insn->moffset1.got = insn->moffset2.got = 1;
+}
+
+/* Decode imm v32(Iz) */
+static void __get_immv32(struct insn *insn)
+{
+ switch (insn->opnd_bytes) {
+ case 2:
+ insn->immediate.value = get_next(short, insn);
+ insn->immediate.nbytes = 2;
+ break;
+ case 4:
+ case 8:
+ insn->immediate.value = get_next(int, insn);
+ insn->immediate.nbytes = 4;
+ break;
+ }
+}
+
+/* Decode imm v64(Iv/Ov) */
+static void __get_immv(struct insn *insn)
+{
+ switch (insn->opnd_bytes) {
+ case 2:
+ insn->immediate1.value = get_next(short, insn);
+ insn->immediate1.nbytes = 2;
+ break;
+ case 4:
+ insn->immediate1.value = get_next(int, insn);
+ insn->immediate1.nbytes = 4;
+ break;
+ case 8:
+ insn->immediate1.value = get_next(int, insn);
+ insn->immediate1.nbytes = 4;
+ insn->immediate2.value = get_next(int, insn);
+ insn->immediate2.nbytes = 4;
+ break;
+ }
+ insn->immediate1.got = insn->immediate2.got = 1;
+}
+
+/* Decode ptr16:16/32(Ap) */
+static void __get_immptr(struct insn *insn)
+{
+ switch (insn->opnd_bytes) {
+ case 2:
+ insn->immediate1.value = get_next(short, insn);
+ insn->immediate1.nbytes = 2;
+ break;
+ case 4:
+ insn->immediate1.value = get_next(int, insn);
+ insn->immediate1.nbytes = 4;
+ break;
+ case 8:
+ /* ptr16:64 is not exist (no segment) */
+ return;
+ }
+ insn->immediate2.value = get_next(unsigned short, insn);
+ insn->immediate2.nbytes = 2;
+ insn->immediate1.got = insn->immediate2.got = 1;
+}
+
+/**
+ * insn_get_immediate() - Get the immediates of instruction
+ * @insn: &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * displacement bytes.
+ * Basically, most of immediates are sign-expanded. Unsigned-value can be
+ * get by bit masking with ((1 << (nbytes * 8)) - 1)
+ */
+void insn_get_immediate(struct insn *insn)
+{
+ if (insn->immediate.got)
+ return;
+ if (!insn->displacement.got)
+ insn_get_displacement(insn);
+
+ if (inat_has_moffset(insn->attr)) {
+ __get_moffset(insn);
+ goto done;
+ }
+
+ if (!inat_has_immediate(insn->attr))
+ /* no immediates */
+ goto done;
+
+ switch (inat_immediate_size(insn->attr)) {
+ case INAT_IMM_BYTE:
+ insn->immediate.value = get_next(char, insn);
+ insn->immediate.nbytes = 1;
+ break;
+ case INAT_IMM_WORD:
+ insn->immediate.value = get_next(short, insn);
+ insn->immediate.nbytes = 2;
+ break;
+ case INAT_IMM_DWORD:
+ insn->immediate.value = get_next(int, insn);
+ insn->immediate.nbytes = 4;
+ break;
+ case INAT_IMM_QWORD:
+ insn->immediate1.value = get_next(int, insn);
+ insn->immediate1.nbytes = 4;
+ insn->immediate2.value = get_next(int, insn);
+ insn->immediate2.nbytes = 4;
+ break;
+ case INAT_IMM_PTR:
+ __get_immptr(insn);
+ break;
+ case INAT_IMM_VWORD32:
+ __get_immv32(insn);
+ break;
+ case INAT_IMM_VWORD:
+ __get_immv(insn);
+ break;
+ default:
+ break;
+ }
+ if (inat_has_second_immediate(insn->attr)) {
+ insn->immediate2.value = get_next(char, insn);
+ insn->immediate2.nbytes = 1;
+ }
+done:
+ insn->immediate.got = 1;
+}
+
+/**
+ * insn_get_length() - Get the length of instruction
+ * @insn: &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * immediates bytes.
+ */
+void insn_get_length(struct insn *insn)
+{
+ if (insn->length)
+ return;
+ if (!insn->immediate.got)
+ insn_get_immediate(insn);
+ insn->length = (unsigned char)((unsigned long)insn->next_byte
+ - (unsigned long)insn->kaddr);
+}
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 1f118d462ac..e218d5df85f 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -874,7 +874,7 @@ EXPORT_SYMBOL(copy_to_user);
* data to the requested size using zero bytes.
*/
unsigned long
-copy_from_user(void *to, const void __user *from, unsigned long n)
+_copy_from_user(void *to, const void __user *from, unsigned long n)
{
if (access_ok(VERIFY_READ, from, n))
n = __copy_from_user(to, from, n);
@@ -882,4 +882,10 @@ copy_from_user(void *to, const void __user *from, unsigned long n)
memset(to, 0, n);
return n;
}
-EXPORT_SYMBOL(copy_from_user);
+EXPORT_SYMBOL(_copy_from_user);
+
+void copy_from_user_overflow(void)
+{
+ WARN(1, "Buffer overflow detected!\n");
+}
+EXPORT_SYMBOL(copy_from_user_overflow);
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
new file mode 100644
index 00000000000..a793da5e560
--- /dev/null
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -0,0 +1,893 @@
+# x86 Opcode Maps
+#
+#<Opcode maps>
+# Table: table-name
+# Referrer: escaped-name
+# AVXcode: avx-code
+# opcode: mnemonic|GrpXXX [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...]
+# (or)
+# opcode: escape # escaped-name
+# EndTable
+#
+#<group maps>
+# GrpTable: GrpXXX
+# reg: mnemonic [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...]
+# EndTable
+#
+# AVX Superscripts
+# (VEX): this opcode can accept VEX prefix.
+# (oVEX): this opcode requires VEX prefix.
+# (o128): this opcode only supports 128bit VEX.
+# (o256): this opcode only supports 256bit VEX.
+#
+
+Table: one byte opcode
+Referrer:
+AVXcode:
+# 0x00 - 0x0f
+00: ADD Eb,Gb
+01: ADD Ev,Gv
+02: ADD Gb,Eb
+03: ADD Gv,Ev
+04: ADD AL,Ib
+05: ADD rAX,Iz
+06: PUSH ES (i64)
+07: POP ES (i64)
+08: OR Eb,Gb
+09: OR Ev,Gv
+0a: OR Gb,Eb
+0b: OR Gv,Ev
+0c: OR AL,Ib
+0d: OR rAX,Iz
+0e: PUSH CS (i64)
+0f: escape # 2-byte escape
+# 0x10 - 0x1f
+10: ADC Eb,Gb
+11: ADC Ev,Gv
+12: ADC Gb,Eb
+13: ADC Gv,Ev
+14: ADC AL,Ib
+15: ADC rAX,Iz
+16: PUSH SS (i64)
+17: POP SS (i64)
+18: SBB Eb,Gb
+19: SBB Ev,Gv
+1a: SBB Gb,Eb
+1b: SBB Gv,Ev
+1c: SBB AL,Ib
+1d: SBB rAX,Iz
+1e: PUSH DS (i64)
+1f: POP DS (i64)
+# 0x20 - 0x2f
+20: AND Eb,Gb
+21: AND Ev,Gv
+22: AND Gb,Eb
+23: AND Gv,Ev
+24: AND AL,Ib
+25: AND rAx,Iz
+26: SEG=ES (Prefix)
+27: DAA (i64)
+28: SUB Eb,Gb
+29: SUB Ev,Gv
+2a: SUB Gb,Eb
+2b: SUB Gv,Ev
+2c: SUB AL,Ib
+2d: SUB rAX,Iz
+2e: SEG=CS (Prefix)
+2f: DAS (i64)
+# 0x30 - 0x3f
+30: XOR Eb,Gb
+31: XOR Ev,Gv
+32: XOR Gb,Eb
+33: XOR Gv,Ev
+34: XOR AL,Ib
+35: XOR rAX,Iz
+36: SEG=SS (Prefix)
+37: AAA (i64)
+38: CMP Eb,Gb
+39: CMP Ev,Gv
+3a: CMP Gb,Eb
+3b: CMP Gv,Ev
+3c: CMP AL,Ib
+3d: CMP rAX,Iz
+3e: SEG=DS (Prefix)
+3f: AAS (i64)
+# 0x40 - 0x4f
+40: INC eAX (i64) | REX (o64)
+41: INC eCX (i64) | REX.B (o64)
+42: INC eDX (i64) | REX.X (o64)
+43: INC eBX (i64) | REX.XB (o64)
+44: INC eSP (i64) | REX.R (o64)
+45: INC eBP (i64) | REX.RB (o64)
+46: INC eSI (i64) | REX.RX (o64)
+47: INC eDI (i64) | REX.RXB (o64)
+48: DEC eAX (i64) | REX.W (o64)
+49: DEC eCX (i64) | REX.WB (o64)
+4a: DEC eDX (i64) | REX.WX (o64)
+4b: DEC eBX (i64) | REX.WXB (o64)
+4c: DEC eSP (i64) | REX.WR (o64)
+4d: DEC eBP (i64) | REX.WRB (o64)
+4e: DEC eSI (i64) | REX.WRX (o64)
+4f: DEC eDI (i64) | REX.WRXB (o64)
+# 0x50 - 0x5f
+50: PUSH rAX/r8 (d64)
+51: PUSH rCX/r9 (d64)
+52: PUSH rDX/r10 (d64)
+53: PUSH rBX/r11 (d64)
+54: PUSH rSP/r12 (d64)
+55: PUSH rBP/r13 (d64)
+56: PUSH rSI/r14 (d64)
+57: PUSH rDI/r15 (d64)
+58: POP rAX/r8 (d64)
+59: POP rCX/r9 (d64)
+5a: POP rDX/r10 (d64)
+5b: POP rBX/r11 (d64)
+5c: POP rSP/r12 (d64)
+5d: POP rBP/r13 (d64)
+5e: POP rSI/r14 (d64)
+5f: POP rDI/r15 (d64)
+# 0x60 - 0x6f
+60: PUSHA/PUSHAD (i64)
+61: POPA/POPAD (i64)
+62: BOUND Gv,Ma (i64)
+63: ARPL Ew,Gw (i64) | MOVSXD Gv,Ev (o64)
+64: SEG=FS (Prefix)
+65: SEG=GS (Prefix)
+66: Operand-Size (Prefix)
+67: Address-Size (Prefix)
+68: PUSH Iz (d64)
+69: IMUL Gv,Ev,Iz
+6a: PUSH Ib (d64)
+6b: IMUL Gv,Ev,Ib
+6c: INS/INSB Yb,DX
+6d: INS/INSW/INSD Yz,DX
+6e: OUTS/OUTSB DX,Xb
+6f: OUTS/OUTSW/OUTSD DX,Xz
+# 0x70 - 0x7f
+70: JO Jb
+71: JNO Jb
+72: JB/JNAE/JC Jb
+73: JNB/JAE/JNC Jb
+74: JZ/JE Jb
+75: JNZ/JNE Jb
+76: JBE/JNA Jb
+77: JNBE/JA Jb
+78: JS Jb
+79: JNS Jb
+7a: JP/JPE Jb
+7b: JNP/JPO Jb
+7c: JL/JNGE Jb
+7d: JNL/JGE Jb
+7e: JLE/JNG Jb
+7f: JNLE/JG Jb
+# 0x80 - 0x8f
+80: Grp1 Eb,Ib (1A)
+81: Grp1 Ev,Iz (1A)
+82: Grp1 Eb,Ib (1A),(i64)
+83: Grp1 Ev,Ib (1A)
+84: TEST Eb,Gb
+85: TEST Ev,Gv
+86: XCHG Eb,Gb
+87: XCHG Ev,Gv
+88: MOV Eb,Gb
+89: MOV Ev,Gv
+8a: MOV Gb,Eb
+8b: MOV Gv,Ev
+8c: MOV Ev,Sw
+8d: LEA Gv,M
+8e: MOV Sw,Ew
+8f: Grp1A (1A) | POP Ev (d64)
+# 0x90 - 0x9f
+90: NOP | PAUSE (F3) | XCHG r8,rAX
+91: XCHG rCX/r9,rAX
+92: XCHG rDX/r10,rAX
+93: XCHG rBX/r11,rAX
+94: XCHG rSP/r12,rAX
+95: XCHG rBP/r13,rAX
+96: XCHG rSI/r14,rAX
+97: XCHG rDI/r15,rAX
+98: CBW/CWDE/CDQE
+99: CWD/CDQ/CQO
+9a: CALLF Ap (i64)
+9b: FWAIT/WAIT
+9c: PUSHF/D/Q Fv (d64)
+9d: POPF/D/Q Fv (d64)
+9e: SAHF
+9f: LAHF
+# 0xa0 - 0xaf
+a0: MOV AL,Ob
+a1: MOV rAX,Ov
+a2: MOV Ob,AL
+a3: MOV Ov,rAX
+a4: MOVS/B Xb,Yb
+a5: MOVS/W/D/Q Xv,Yv
+a6: CMPS/B Xb,Yb
+a7: CMPS/W/D Xv,Yv
+a8: TEST AL,Ib
+a9: TEST rAX,Iz
+aa: STOS/B Yb,AL
+ab: STOS/W/D/Q Yv,rAX
+ac: LODS/B AL,Xb
+ad: LODS/W/D/Q rAX,Xv
+ae: SCAS/B AL,Yb
+af: SCAS/W/D/Q rAX,Xv
+# 0xb0 - 0xbf
+b0: MOV AL/R8L,Ib
+b1: MOV CL/R9L,Ib
+b2: MOV DL/R10L,Ib
+b3: MOV BL/R11L,Ib
+b4: MOV AH/R12L,Ib
+b5: MOV CH/R13L,Ib
+b6: MOV DH/R14L,Ib
+b7: MOV BH/R15L,Ib
+b8: MOV rAX/r8,Iv
+b9: MOV rCX/r9,Iv
+ba: MOV rDX/r10,Iv
+bb: MOV rBX/r11,Iv
+bc: MOV rSP/r12,Iv
+bd: MOV rBP/r13,Iv
+be: MOV rSI/r14,Iv
+bf: MOV rDI/r15,Iv
+# 0xc0 - 0xcf
+c0: Grp2 Eb,Ib (1A)
+c1: Grp2 Ev,Ib (1A)
+c2: RETN Iw (f64)
+c3: RETN
+c4: LES Gz,Mp (i64) | 3bytes-VEX (Prefix)
+c5: LDS Gz,Mp (i64) | 2bytes-VEX (Prefix)
+c6: Grp11 Eb,Ib (1A)
+c7: Grp11 Ev,Iz (1A)
+c8: ENTER Iw,Ib
+c9: LEAVE (d64)
+ca: RETF Iw
+cb: RETF
+cc: INT3
+cd: INT Ib
+ce: INTO (i64)
+cf: IRET/D/Q
+# 0xd0 - 0xdf
+d0: Grp2 Eb,1 (1A)
+d1: Grp2 Ev,1 (1A)
+d2: Grp2 Eb,CL (1A)
+d3: Grp2 Ev,CL (1A)
+d4: AAM Ib (i64)
+d5: AAD Ib (i64)
+d6:
+d7: XLAT/XLATB
+d8: ESC
+d9: ESC
+da: ESC
+db: ESC
+dc: ESC
+dd: ESC
+de: ESC
+df: ESC
+# 0xe0 - 0xef
+e0: LOOPNE/LOOPNZ Jb (f64)
+e1: LOOPE/LOOPZ Jb (f64)
+e2: LOOP Jb (f64)
+e3: JrCXZ Jb (f64)
+e4: IN AL,Ib
+e5: IN eAX,Ib
+e6: OUT Ib,AL
+e7: OUT Ib,eAX
+e8: CALL Jz (f64)
+e9: JMP-near Jz (f64)
+ea: JMP-far Ap (i64)
+eb: JMP-short Jb (f64)
+ec: IN AL,DX
+ed: IN eAX,DX
+ee: OUT DX,AL
+ef: OUT DX,eAX
+# 0xf0 - 0xff
+f0: LOCK (Prefix)
+f1:
+f2: REPNE (Prefix)
+f3: REP/REPE (Prefix)
+f4: HLT
+f5: CMC
+f6: Grp3_1 Eb (1A)
+f7: Grp3_2 Ev (1A)
+f8: CLC
+f9: STC
+fa: CLI
+fb: STI
+fc: CLD
+fd: STD
+fe: Grp4 (1A)
+ff: Grp5 (1A)
+EndTable
+
+Table: 2-byte opcode (0x0f)
+Referrer: 2-byte escape
+AVXcode: 1
+# 0x0f 0x00-0x0f
+00: Grp6 (1A)
+01: Grp7 (1A)
+02: LAR Gv,Ew
+03: LSL Gv,Ew
+04:
+05: SYSCALL (o64)
+06: CLTS
+07: SYSRET (o64)
+08: INVD
+09: WBINVD
+0a:
+0b: UD2 (1B)
+0c:
+0d: NOP Ev | GrpP
+0e: FEMMS
+# 3DNow! uses the last imm byte as opcode extension.
+0f: 3DNow! Pq,Qq,Ib
+# 0x0f 0x10-0x1f
+10: movups Vps,Wps (VEX) | movss Vss,Wss (F3),(VEX),(o128) | movupd Vpd,Wpd (66),(VEX) | movsd Vsd,Wsd (F2),(VEX),(o128)
+11: movups Wps,Vps (VEX) | movss Wss,Vss (F3),(VEX),(o128) | movupd Wpd,Vpd (66),(VEX) | movsd Wsd,Vsd (F2),(VEX),(o128)
+12: movlps Vq,Mq (VEX),(o128) | movlpd Vq,Mq (66),(VEX),(o128) | movhlps Vq,Uq (VEX),(o128) | movddup Vq,Wq (F2),(VEX) | movsldup Vq,Wq (F3),(VEX)
+13: mpvlps Mq,Vq (VEX),(o128) | movlpd Mq,Vq (66),(VEX),(o128)
+14: unpcklps Vps,Wq (VEX) | unpcklpd Vpd,Wq (66),(VEX)
+15: unpckhps Vps,Wq (VEX) | unpckhpd Vpd,Wq (66),(VEX)
+16: movhps Vq,Mq (VEX),(o128) | movhpd Vq,Mq (66),(VEX),(o128) | movlsps Vq,Uq (VEX),(o128) | movshdup Vq,Wq (F3),(VEX)
+17: movhps Mq,Vq (VEX),(o128) | movhpd Mq,Vq (66),(VEX),(o128)
+18: Grp16 (1A)
+19:
+1a:
+1b:
+1c:
+1d:
+1e:
+1f: NOP Ev
+# 0x0f 0x20-0x2f
+20: MOV Rd,Cd
+21: MOV Rd,Dd
+22: MOV Cd,Rd
+23: MOV Dd,Rd
+24:
+25:
+26:
+27:
+28: movaps Vps,Wps (VEX) | movapd Vpd,Wpd (66),(VEX)
+29: movaps Wps,Vps (VEX) | movapd Wpd,Vpd (66),(VEX)
+2a: cvtpi2ps Vps,Qpi | cvtsi2ss Vss,Ed/q (F3),(VEX),(o128) | cvtpi2pd Vpd,Qpi (66) | cvtsi2sd Vsd,Ed/q (F2),(VEX),(o128)
+2b: movntps Mps,Vps (VEX) | movntpd Mpd,Vpd (66),(VEX)
+2c: cvttps2pi Ppi,Wps | cvttss2si Gd/q,Wss (F3),(VEX),(o128) | cvttpd2pi Ppi,Wpd (66) | cvttsd2si Gd/q,Wsd (F2),(VEX),(o128)
+2d: cvtps2pi Ppi,Wps | cvtss2si Gd/q,Wss (F3),(VEX),(o128) | cvtpd2pi Qpi,Wpd (66) | cvtsd2si Gd/q,Wsd (F2),(VEX),(o128)
+2e: ucomiss Vss,Wss (VEX),(o128) | ucomisd Vsd,Wsd (66),(VEX),(o128)
+2f: comiss Vss,Wss (VEX),(o128) | comisd Vsd,Wsd (66),(VEX),(o128)
+# 0x0f 0x30-0x3f
+30: WRMSR
+31: RDTSC
+32: RDMSR
+33: RDPMC
+34: SYSENTER
+35: SYSEXIT
+36:
+37: GETSEC
+38: escape # 3-byte escape 1
+39:
+3a: escape # 3-byte escape 2
+3b:
+3c:
+3d:
+3e:
+3f:
+# 0x0f 0x40-0x4f
+40: CMOVO Gv,Ev
+41: CMOVNO Gv,Ev
+42: CMOVB/C/NAE Gv,Ev
+43: CMOVAE/NB/NC Gv,Ev
+44: CMOVE/Z Gv,Ev
+45: CMOVNE/NZ Gv,Ev
+46: CMOVBE/NA Gv,Ev
+47: CMOVA/NBE Gv,Ev
+48: CMOVS Gv,Ev
+49: CMOVNS Gv,Ev
+4a: CMOVP/PE Gv,Ev
+4b: CMOVNP/PO Gv,Ev
+4c: CMOVL/NGE Gv,Ev
+4d: CMOVNL/GE Gv,Ev
+4e: CMOVLE/NG Gv,Ev
+4f: CMOVNLE/G Gv,Ev
+# 0x0f 0x50-0x5f
+50: movmskps Gd/q,Ups (VEX) | movmskpd Gd/q,Upd (66),(VEX)
+51: sqrtps Vps,Wps (VEX) | sqrtss Vss,Wss (F3),(VEX),(o128) | sqrtpd Vpd,Wpd (66),(VEX) | sqrtsd Vsd,Wsd (F2),(VEX),(o128)
+52: rsqrtps Vps,Wps (VEX) | rsqrtss Vss,Wss (F3),(VEX),(o128)
+53: rcpps Vps,Wps (VEX) | rcpss Vss,Wss (F3),(VEX),(o128)
+54: andps Vps,Wps (VEX) | andpd Vpd,Wpd (66),(VEX)
+55: andnps Vps,Wps (VEX) | andnpd Vpd,Wpd (66),(VEX)
+56: orps Vps,Wps (VEX) | orpd Vpd,Wpd (66),(VEX)
+57: xorps Vps,Wps (VEX) | xorpd Vpd,Wpd (66),(VEX)
+58: addps Vps,Wps (VEX) | addss Vss,Wss (F3),(VEX),(o128) | addpd Vpd,Wpd (66),(VEX) | addsd Vsd,Wsd (F2),(VEX),(o128)
+59: mulps Vps,Wps (VEX) | mulss Vss,Wss (F3),(VEX),(o128) | mulpd Vpd,Wpd (66),(VEX) | mulsd Vsd,Wsd (F2),(VEX),(o128)
+5a: cvtps2pd Vpd,Wps (VEX) | cvtss2sd Vsd,Wss (F3),(VEX),(o128) | cvtpd2ps Vps,Wpd (66),(VEX) | cvtsd2ss Vsd,Wsd (F2),(VEX),(o128)
+5b: cvtdq2ps Vps,Wdq (VEX) | cvtps2dq Vdq,Wps (66),(VEX) | cvttps2dq Vdq,Wps (F3),(VEX)
+5c: subps Vps,Wps (VEX) | subss Vss,Wss (F3),(VEX),(o128) | subpd Vpd,Wpd (66),(VEX) | subsd Vsd,Wsd (F2),(VEX),(o128)
+5d: minps Vps,Wps (VEX) | minss Vss,Wss (F3),(VEX),(o128) | minpd Vpd,Wpd (66),(VEX) | minsd Vsd,Wsd (F2),(VEX),(o128)
+5e: divps Vps,Wps (VEX) | divss Vss,Wss (F3),(VEX),(o128) | divpd Vpd,Wpd (66),(VEX) | divsd Vsd,Wsd (F2),(VEX),(o128)
+5f: maxps Vps,Wps (VEX) | maxss Vss,Wss (F3),(VEX),(o128) | maxpd Vpd,Wpd (66),(VEX) | maxsd Vsd,Wsd (F2),(VEX),(o128)
+# 0x0f 0x60-0x6f
+60: punpcklbw Pq,Qd | punpcklbw Vdq,Wdq (66),(VEX),(o128)
+61: punpcklwd Pq,Qd | punpcklwd Vdq,Wdq (66),(VEX),(o128)
+62: punpckldq Pq,Qd | punpckldq Vdq,Wdq (66),(VEX),(o128)
+63: packsswb Pq,Qq | packsswb Vdq,Wdq (66),(VEX),(o128)
+64: pcmpgtb Pq,Qq | pcmpgtb Vdq,Wdq (66),(VEX),(o128)
+65: pcmpgtw Pq,Qq | pcmpgtw Vdq,Wdq (66),(VEX),(o128)
+66: pcmpgtd Pq,Qq | pcmpgtd Vdq,Wdq (66),(VEX),(o128)
+67: packuswb Pq,Qq | packuswb Vdq,Wdq (66),(VEX),(o128)
+68: punpckhbw Pq,Qd | punpckhbw Vdq,Wdq (66),(VEX),(o128)
+69: punpckhwd Pq,Qd | punpckhwd Vdq,Wdq (66),(VEX),(o128)
+6a: punpckhdq Pq,Qd | punpckhdq Vdq,Wdq (66),(VEX),(o128)
+6b: packssdw Pq,Qd | packssdw Vdq,Wdq (66),(VEX),(o128)
+6c: punpcklqdq Vdq,Wdq (66),(VEX),(o128)
+6d: punpckhqdq Vdq,Wdq (66),(VEX),(o128)
+6e: movd/q/ Pd,Ed/q | movd/q Vdq,Ed/q (66),(VEX),(o128)
+6f: movq Pq,Qq | movdqa Vdq,Wdq (66),(VEX) | movdqu Vdq,Wdq (F3),(VEX)
+# 0x0f 0x70-0x7f
+70: pshufw Pq,Qq,Ib | pshufd Vdq,Wdq,Ib (66),(VEX),(o128) | pshufhw Vdq,Wdq,Ib (F3),(VEX),(o128) | pshuflw VdqWdq,Ib (F2),(VEX),(o128)
+71: Grp12 (1A)
+72: Grp13 (1A)
+73: Grp14 (1A)
+74: pcmpeqb Pq,Qq | pcmpeqb Vdq,Wdq (66),(VEX),(o128)
+75: pcmpeqw Pq,Qq | pcmpeqw Vdq,Wdq (66),(VEX),(o128)
+76: pcmpeqd Pq,Qq | pcmpeqd Vdq,Wdq (66),(VEX),(o128)
+77: emms/vzeroupper/vzeroall (VEX)
+78: VMREAD Ed/q,Gd/q
+79: VMWRITE Gd/q,Ed/q
+7a:
+7b:
+7c: haddps Vps,Wps (F2),(VEX) | haddpd Vpd,Wpd (66),(VEX)
+7d: hsubps Vps,Wps (F2),(VEX) | hsubpd Vpd,Wpd (66),(VEX)
+7e: movd/q Ed/q,Pd | movd/q Ed/q,Vdq (66),(VEX),(o128) | movq Vq,Wq (F3),(VEX),(o128)
+7f: movq Qq,Pq | movdqa Wdq,Vdq (66),(VEX) | movdqu Wdq,Vdq (F3),(VEX)
+# 0x0f 0x80-0x8f
+80: JO Jz (f64)
+81: JNO Jz (f64)
+82: JB/JNAE/JC Jz (f64)
+83: JNB/JAE/JNC Jz (f64)
+84: JZ/JE Jz (f64)
+85: JNZ/JNE Jz (f64)
+86: JBE/JNA Jz (f64)
+87: JNBE/JA Jz (f64)
+88: JS Jz (f64)
+89: JNS Jz (f64)
+8a: JP/JPE Jz (f64)
+8b: JNP/JPO Jz (f64)
+8c: JL/JNGE Jz (f64)
+8d: JNL/JGE Jz (f64)
+8e: JLE/JNG Jz (f64)
+8f: JNLE/JG Jz (f64)
+# 0x0f 0x90-0x9f
+90: SETO Eb
+91: SETNO Eb
+92: SETB/C/NAE Eb
+93: SETAE/NB/NC Eb
+94: SETE/Z Eb
+95: SETNE/NZ Eb
+96: SETBE/NA Eb
+97: SETA/NBE Eb
+98: SETS Eb
+99: SETNS Eb
+9a: SETP/PE Eb
+9b: SETNP/PO Eb
+9c: SETL/NGE Eb
+9d: SETNL/GE Eb
+9e: SETLE/NG Eb
+9f: SETNLE/G Eb
+# 0x0f 0xa0-0xaf
+a0: PUSH FS (d64)
+a1: POP FS (d64)
+a2: CPUID
+a3: BT Ev,Gv
+a4: SHLD Ev,Gv,Ib
+a5: SHLD Ev,Gv,CL
+a6: GrpPDLK
+a7: GrpRNG
+a8: PUSH GS (d64)
+a9: POP GS (d64)
+aa: RSM
+ab: BTS Ev,Gv
+ac: SHRD Ev,Gv,Ib
+ad: SHRD Ev,Gv,CL
+ae: Grp15 (1A),(1C)
+af: IMUL Gv,Ev
+# 0x0f 0xb0-0xbf
+b0: CMPXCHG Eb,Gb
+b1: CMPXCHG Ev,Gv
+b2: LSS Gv,Mp
+b3: BTR Ev,Gv
+b4: LFS Gv,Mp
+b5: LGS Gv,Mp
+b6: MOVZX Gv,Eb
+b7: MOVZX Gv,Ew
+b8: JMPE | POPCNT Gv,Ev (F3)
+b9: Grp10 (1A)
+ba: Grp8 Ev,Ib (1A)
+bb: BTC Ev,Gv
+bc: BSF Gv,Ev
+bd: BSR Gv,Ev
+be: MOVSX Gv,Eb
+bf: MOVSX Gv,Ew
+# 0x0f 0xc0-0xcf
+c0: XADD Eb,Gb
+c1: XADD Ev,Gv
+c2: cmpps Vps,Wps,Ib (VEX) | cmpss Vss,Wss,Ib (F3),(VEX),(o128) | cmppd Vpd,Wpd,Ib (66),(VEX) | cmpsd Vsd,Wsd,Ib (F2),(VEX)
+c3: movnti Md/q,Gd/q
+c4: pinsrw Pq,Rd/q/Mw,Ib | pinsrw Vdq,Rd/q/Mw,Ib (66),(VEX),(o128)
+c5: pextrw Gd,Nq,Ib | pextrw Gd,Udq,Ib (66),(VEX),(o128)
+c6: shufps Vps,Wps,Ib (VEX) | shufpd Vpd,Wpd,Ib (66),(VEX)
+c7: Grp9 (1A)
+c8: BSWAP RAX/EAX/R8/R8D
+c9: BSWAP RCX/ECX/R9/R9D
+ca: BSWAP RDX/EDX/R10/R10D
+cb: BSWAP RBX/EBX/R11/R11D
+cc: BSWAP RSP/ESP/R12/R12D
+cd: BSWAP RBP/EBP/R13/R13D
+ce: BSWAP RSI/ESI/R14/R14D
+cf: BSWAP RDI/EDI/R15/R15D
+# 0x0f 0xd0-0xdf
+d0: addsubps Vps,Wps (F2),(VEX) | addsubpd Vpd,Wpd (66),(VEX)
+d1: psrlw Pq,Qq | psrlw Vdq,Wdq (66),(VEX),(o128)
+d2: psrld Pq,Qq | psrld Vdq,Wdq (66),(VEX),(o128)
+d3: psrlq Pq,Qq | psrlq Vdq,Wdq (66),(VEX),(o128)
+d4: paddq Pq,Qq | paddq Vdq,Wdq (66),(VEX),(o128)
+d5: pmullw Pq,Qq | pmullw Vdq,Wdq (66),(VEX),(o128)
+d6: movq Wq,Vq (66),(VEX),(o128) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2)
+d7: pmovmskb Gd,Nq | pmovmskb Gd,Udq (66),(VEX),(o128)
+d8: psubusb Pq,Qq | psubusb Vdq,Wdq (66),(VEX),(o128)
+d9: psubusw Pq,Qq | psubusw Vdq,Wdq (66),(VEX),(o128)
+da: pminub Pq,Qq | pminub Vdq,Wdq (66),(VEX),(o128)
+db: pand Pq,Qq | pand Vdq,Wdq (66),(VEX),(o128)
+dc: paddusb Pq,Qq | paddusb Vdq,Wdq (66),(VEX),(o128)
+dd: paddusw Pq,Qq | paddusw Vdq,Wdq (66),(VEX),(o128)
+de: pmaxub Pq,Qq | pmaxub Vdq,Wdq (66),(VEX),(o128)
+df: pandn Pq,Qq | pandn Vdq,Wdq (66),(VEX),(o128)
+# 0x0f 0xe0-0xef
+e0: pavgb Pq,Qq | pavgb Vdq,Wdq (66),(VEX),(o128)
+e1: psraw Pq,Qq | psraw Vdq,Wdq (66),(VEX),(o128)
+e2: psrad Pq,Qq | psrad Vdq,Wdq (66),(VEX),(o128)
+e3: pavgw Pq,Qq | pavgw Vdq,Wdq (66),(VEX),(o128)
+e4: pmulhuw Pq,Qq | pmulhuw Vdq,Wdq (66),(VEX),(o128)
+e5: pmulhw Pq,Qq | pmulhw Vdq,Wdq (66),(VEX),(o128)
+e6: cvtpd2dq Vdq,Wpd (F2),(VEX) | cvttpd2dq Vdq,Wpd (66),(VEX) | cvtdq2pd Vpd,Wdq (F3),(VEX)
+e7: movntq Mq,Pq | movntdq Mdq,Vdq (66),(VEX)
+e8: psubsb Pq,Qq | psubsb Vdq,Wdq (66),(VEX),(o128)
+e9: psubsw Pq,Qq | psubsw Vdq,Wdq (66),(VEX),(o128)
+ea: pminsw Pq,Qq | pminsw Vdq,Wdq (66),(VEX),(o128)
+eb: por Pq,Qq | por Vdq,Wdq (66),(VEX),(o128)
+ec: paddsb Pq,Qq | paddsb Vdq,Wdq (66),(VEX),(o128)
+ed: paddsw Pq,Qq | paddsw Vdq,Wdq (66),(VEX),(o128)
+ee: pmaxsw Pq,Qq | pmaxsw Vdq,Wdq (66),(VEX),(o128)
+ef: pxor Pq,Qq | pxor Vdq,Wdq (66),(VEX),(o128)
+# 0x0f 0xf0-0xff
+f0: lddqu Vdq,Mdq (F2),(VEX)
+f1: psllw Pq,Qq | psllw Vdq,Wdq (66),(VEX),(o128)
+f2: pslld Pq,Qq | pslld Vdq,Wdq (66),(VEX),(o128)
+f3: psllq Pq,Qq | psllq Vdq,Wdq (66),(VEX),(o128)
+f4: pmuludq Pq,Qq | pmuludq Vdq,Wdq (66),(VEX),(o128)
+f5: pmaddwd Pq,Qq | pmaddwd Vdq,Wdq (66),(VEX),(o128)
+f6: psadbw Pq,Qq | psadbw Vdq,Wdq (66),(VEX),(o128)
+f7: maskmovq Pq,Nq | maskmovdqu Vdq,Udq (66),(VEX),(o128)
+f8: psubb Pq,Qq | psubb Vdq,Wdq (66),(VEX),(o128)
+f9: psubw Pq,Qq | psubw Vdq,Wdq (66),(VEX),(o128)
+fa: psubd Pq,Qq | psubd Vdq,Wdq (66),(VEX),(o128)
+fb: psubq Pq,Qq | psubq Vdq,Wdq (66),(VEX),(o128)
+fc: paddb Pq,Qq | paddb Vdq,Wdq (66),(VEX),(o128)
+fd: paddw Pq,Qq | paddw Vdq,Wdq (66),(VEX),(o128)
+fe: paddd Pq,Qq | paddd Vdq,Wdq (66),(VEX),(o128)
+ff:
+EndTable
+
+Table: 3-byte opcode 1 (0x0f 0x38)
+Referrer: 3-byte escape 1
+AVXcode: 2
+# 0x0f 0x38 0x00-0x0f
+00: pshufb Pq,Qq | pshufb Vdq,Wdq (66),(VEX),(o128)
+01: phaddw Pq,Qq | phaddw Vdq,Wdq (66),(VEX),(o128)
+02: phaddd Pq,Qq | phaddd Vdq,Wdq (66),(VEX),(o128)
+03: phaddsw Pq,Qq | phaddsw Vdq,Wdq (66),(VEX),(o128)
+04: pmaddubsw Pq,Qq | pmaddubsw Vdq,Wdq (66),(VEX),(o128)
+05: phsubw Pq,Qq | phsubw Vdq,Wdq (66),(VEX),(o128)
+06: phsubd Pq,Qq | phsubd Vdq,Wdq (66),(VEX),(o128)
+07: phsubsw Pq,Qq | phsubsw Vdq,Wdq (66),(VEX),(o128)
+08: psignb Pq,Qq | psignb Vdq,Wdq (66),(VEX),(o128)
+09: psignw Pq,Qq | psignw Vdq,Wdq (66),(VEX),(o128)
+0a: psignd Pq,Qq | psignd Vdq,Wdq (66),(VEX),(o128)
+0b: pmulhrsw Pq,Qq | pmulhrsw Vdq,Wdq (66),(VEX),(o128)
+0c: Vpermilps /r (66),(oVEX)
+0d: Vpermilpd /r (66),(oVEX)
+0e: vtestps /r (66),(oVEX)
+0f: vtestpd /r (66),(oVEX)
+# 0x0f 0x38 0x10-0x1f
+10: pblendvb Vdq,Wdq (66)
+11:
+12:
+13:
+14: blendvps Vdq,Wdq (66)
+15: blendvpd Vdq,Wdq (66)
+16:
+17: ptest Vdq,Wdq (66),(VEX)
+18: vbroadcastss /r (66),(oVEX)
+19: vbroadcastsd /r (66),(oVEX),(o256)
+1a: vbroadcastf128 /r (66),(oVEX),(o256)
+1b:
+1c: pabsb Pq,Qq | pabsb Vdq,Wdq (66),(VEX),(o128)
+1d: pabsw Pq,Qq | pabsw Vdq,Wdq (66),(VEX),(o128)
+1e: pabsd Pq,Qq | pabsd Vdq,Wdq (66),(VEX),(o128)
+1f:
+# 0x0f 0x38 0x20-0x2f
+20: pmovsxbw Vdq,Udq/Mq (66),(VEX),(o128)
+21: pmovsxbd Vdq,Udq/Md (66),(VEX),(o128)
+22: pmovsxbq Vdq,Udq/Mw (66),(VEX),(o128)
+23: pmovsxwd Vdq,Udq/Mq (66),(VEX),(o128)
+24: pmovsxwq Vdq,Udq/Md (66),(VEX),(o128)
+25: pmovsxdq Vdq,Udq/Mq (66),(VEX),(o128)
+26:
+27:
+28: pmuldq Vdq,Wdq (66),(VEX),(o128)
+29: pcmpeqq Vdq,Wdq (66),(VEX),(o128)
+2a: movntdqa Vdq,Mdq (66),(VEX),(o128)
+2b: packusdw Vdq,Wdq (66),(VEX),(o128)
+2c: vmaskmovps(ld) /r (66),(oVEX)
+2d: vmaskmovpd(ld) /r (66),(oVEX)
+2e: vmaskmovps(st) /r (66),(oVEX)
+2f: vmaskmovpd(st) /r (66),(oVEX)
+# 0x0f 0x38 0x30-0x3f
+30: pmovzxbw Vdq,Udq/Mq (66),(VEX),(o128)
+31: pmovzxbd Vdq,Udq/Md (66),(VEX),(o128)
+32: pmovzxbq Vdq,Udq/Mw (66),(VEX),(o128)
+33: pmovzxwd Vdq,Udq/Mq (66),(VEX),(o128)
+34: pmovzxwq Vdq,Udq/Md (66),(VEX),(o128)
+35: pmovzxdq Vdq,Udq/Mq (66),(VEX),(o128)
+36:
+37: pcmpgtq Vdq,Wdq (66),(VEX),(o128)
+38: pminsb Vdq,Wdq (66),(VEX),(o128)
+39: pminsd Vdq,Wdq (66),(VEX),(o128)
+3a: pminuw Vdq,Wdq (66),(VEX),(o128)
+3b: pminud Vdq,Wdq (66),(VEX),(o128)
+3c: pmaxsb Vdq,Wdq (66),(VEX),(o128)
+3d: pmaxsd Vdq,Wdq (66),(VEX),(o128)
+3e: pmaxuw Vdq,Wdq (66),(VEX),(o128)
+3f: pmaxud Vdq,Wdq (66),(VEX),(o128)
+# 0x0f 0x38 0x40-0x8f
+40: pmulld Vdq,Wdq (66),(VEX),(o128)
+41: phminposuw Vdq,Wdq (66),(VEX),(o128)
+80: INVEPT Gd/q,Mdq (66)
+81: INVPID Gd/q,Mdq (66)
+# 0x0f 0x38 0x90-0xbf (FMA)
+96: vfmaddsub132pd/ps /r (66),(VEX)
+97: vfmsubadd132pd/ps /r (66),(VEX)
+98: vfmadd132pd/ps /r (66),(VEX)
+99: vfmadd132sd/ss /r (66),(VEX),(o128)
+9a: vfmsub132pd/ps /r (66),(VEX)
+9b: vfmsub132sd/ss /r (66),(VEX),(o128)
+9c: vfnmadd132pd/ps /r (66),(VEX)
+9d: vfnmadd132sd/ss /r (66),(VEX),(o128)
+9e: vfnmsub132pd/ps /r (66),(VEX)
+9f: vfnmsub132sd/ss /r (66),(VEX),(o128)
+a6: vfmaddsub213pd/ps /r (66),(VEX)
+a7: vfmsubadd213pd/ps /r (66),(VEX)
+a8: vfmadd213pd/ps /r (66),(VEX)
+a9: vfmadd213sd/ss /r (66),(VEX),(o128)
+aa: vfmsub213pd/ps /r (66),(VEX)
+ab: vfmsub213sd/ss /r (66),(VEX),(o128)
+ac: vfnmadd213pd/ps /r (66),(VEX)
+ad: vfnmadd213sd/ss /r (66),(VEX),(o128)
+ae: vfnmsub213pd/ps /r (66),(VEX)
+af: vfnmsub213sd/ss /r (66),(VEX),(o128)
+b6: vfmaddsub231pd/ps /r (66),(VEX)
+b7: vfmsubadd231pd/ps /r (66),(VEX)
+b8: vfmadd231pd/ps /r (66),(VEX)
+b9: vfmadd231sd/ss /r (66),(VEX),(o128)
+ba: vfmsub231pd/ps /r (66),(VEX)
+bb: vfmsub231sd/ss /r (66),(VEX),(o128)
+bc: vfnmadd231pd/ps /r (66),(VEX)
+bd: vfnmadd231sd/ss /r (66),(VEX),(o128)
+be: vfnmsub231pd/ps /r (66),(VEX)
+bf: vfnmsub231sd/ss /r (66),(VEX),(o128)
+# 0x0f 0x38 0xc0-0xff
+db: aesimc Vdq,Wdq (66),(VEX),(o128)
+dc: aesenc Vdq,Wdq (66),(VEX),(o128)
+dd: aesenclast Vdq,Wdq (66),(VEX),(o128)
+de: aesdec Vdq,Wdq (66),(VEX),(o128)
+df: aesdeclast Vdq,Wdq (66),(VEX),(o128)
+f0: MOVBE Gv,Mv | CRC32 Gd,Eb (F2)
+f1: MOVBE Mv,Gv | CRC32 Gd,Ev (F2)
+EndTable
+
+Table: 3-byte opcode 2 (0x0f 0x3a)
+Referrer: 3-byte escape 2
+AVXcode: 3
+# 0x0f 0x3a 0x00-0xff
+04: vpermilps /r,Ib (66),(oVEX)
+05: vpermilpd /r,Ib (66),(oVEX)
+06: vperm2f128 /r,Ib (66),(oVEX),(o256)
+08: roundps Vdq,Wdq,Ib (66),(VEX)
+09: roundpd Vdq,Wdq,Ib (66),(VEX)
+0a: roundss Vss,Wss,Ib (66),(VEX),(o128)
+0b: roundsd Vsd,Wsd,Ib (66),(VEX),(o128)
+0c: blendps Vdq,Wdq,Ib (66),(VEX)
+0d: blendpd Vdq,Wdq,Ib (66),(VEX)
+0e: pblendw Vdq,Wdq,Ib (66),(VEX),(o128)
+0f: palignr Pq,Qq,Ib | palignr Vdq,Wdq,Ib (66),(VEX),(o128)
+14: pextrb Rd/Mb,Vdq,Ib (66),(VEX),(o128)
+15: pextrw Rd/Mw,Vdq,Ib (66),(VEX),(o128)
+16: pextrd/pextrq Ed/q,Vdq,Ib (66),(VEX),(o128)
+17: extractps Ed,Vdq,Ib (66),(VEX),(o128)
+18: vinsertf128 /r,Ib (66),(oVEX),(o256)
+19: vextractf128 /r,Ib (66),(oVEX),(o256)
+20: pinsrb Vdq,Rd/q/Mb,Ib (66),(VEX),(o128)
+21: insertps Vdq,Udq/Md,Ib (66),(VEX),(o128)
+22: pinsrd/pinsrq Vdq,Ed/q,Ib (66),(VEX),(o128)
+40: dpps Vdq,Wdq,Ib (66),(VEX)
+41: dppd Vdq,Wdq,Ib (66),(VEX),(o128)
+42: mpsadbw Vdq,Wdq,Ib (66),(VEX),(o128)
+44: pclmulq Vdq,Wdq,Ib (66),(VEX),(o128)
+4a: vblendvps /r,Ib (66),(oVEX)
+4b: vblendvpd /r,Ib (66),(oVEX)
+4c: vpblendvb /r,Ib (66),(oVEX),(o128)
+60: pcmpestrm Vdq,Wdq,Ib (66),(VEX),(o128)
+61: pcmpestri Vdq,Wdq,Ib (66),(VEX),(o128)
+62: pcmpistrm Vdq,Wdq,Ib (66),(VEX),(o128)
+63: pcmpistri Vdq,Wdq,Ib (66),(VEX),(o128)
+df: aeskeygenassist Vdq,Wdq,Ib (66),(VEX),(o128)
+EndTable
+
+GrpTable: Grp1
+0: ADD
+1: OR
+2: ADC
+3: SBB
+4: AND
+5: SUB
+6: XOR
+7: CMP
+EndTable
+
+GrpTable: Grp1A
+0: POP
+EndTable
+
+GrpTable: Grp2
+0: ROL
+1: ROR
+2: RCL
+3: RCR
+4: SHL/SAL
+5: SHR
+6:
+7: SAR
+EndTable
+
+GrpTable: Grp3_1
+0: TEST Eb,Ib
+1:
+2: NOT Eb
+3: NEG Eb
+4: MUL AL,Eb
+5: IMUL AL,Eb
+6: DIV AL,Eb
+7: IDIV AL,Eb
+EndTable
+
+GrpTable: Grp3_2
+0: TEST Ev,Iz
+1:
+2: NOT Ev
+3: NEG Ev
+4: MUL rAX,Ev
+5: IMUL rAX,Ev
+6: DIV rAX,Ev
+7: IDIV rAX,Ev
+EndTable
+
+GrpTable: Grp4
+0: INC Eb
+1: DEC Eb
+EndTable
+
+GrpTable: Grp5
+0: INC Ev
+1: DEC Ev
+2: CALLN Ev (f64)
+3: CALLF Ep
+4: JMPN Ev (f64)
+5: JMPF Ep
+6: PUSH Ev (d64)
+7:
+EndTable
+
+GrpTable: Grp6
+0: SLDT Rv/Mw
+1: STR Rv/Mw
+2: LLDT Ew
+3: LTR Ew
+4: VERR Ew
+5: VERW Ew
+EndTable
+
+GrpTable: Grp7
+0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B)
+1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001)
+2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B)
+3: LIDT Ms
+4: SMSW Mw/Rv
+5:
+6: LMSW Ew
+7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B)
+EndTable
+
+GrpTable: Grp8
+4: BT
+5: BTS
+6: BTR
+7: BTC
+EndTable
+
+GrpTable: Grp9
+1: CMPXCHG8B/16B Mq/Mdq
+6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3)
+7: VMPTRST Mq
+EndTable
+
+GrpTable: Grp10
+EndTable
+
+GrpTable: Grp11
+0: MOV
+EndTable
+
+GrpTable: Grp12
+2: psrlw Nq,Ib (11B) | psrlw Udq,Ib (66),(11B),(VEX),(o128)
+4: psraw Nq,Ib (11B) | psraw Udq,Ib (66),(11B),(VEX),(o128)
+6: psllw Nq,Ib (11B) | psllw Udq,Ib (66),(11B),(VEX),(o128)
+EndTable
+
+GrpTable: Grp13
+2: psrld Nq,Ib (11B) | psrld Udq,Ib (66),(11B),(VEX),(o128)
+4: psrad Nq,Ib (11B) | psrad Udq,Ib (66),(11B),(VEX),(o128)
+6: pslld Nq,Ib (11B) | pslld Udq,Ib (66),(11B),(VEX),(o128)
+EndTable
+
+GrpTable: Grp14
+2: psrlq Nq,Ib (11B) | psrlq Udq,Ib (66),(11B),(VEX),(o128)
+3: psrldq Udq,Ib (66),(11B),(VEX),(o128)
+6: psllq Nq,Ib (11B) | psllq Udq,Ib (66),(11B),(VEX),(o128)
+7: pslldq Udq,Ib (66),(11B),(VEX),(o128)
+EndTable
+
+GrpTable: Grp15
+0: fxsave
+1: fxstor
+2: ldmxcsr (VEX)
+3: stmxcsr (VEX)
+4: XSAVE
+5: XRSTOR | lfence (11B)
+6: mfence (11B)
+7: clflush | sfence (11B)
+EndTable
+
+GrpTable: Grp16
+0: prefetch NTA
+1: prefetch T0
+2: prefetch T1
+3: prefetch T2
+EndTable
+
+# AMD's Prefetch Group
+GrpTable: GrpP
+0: PREFETCH
+1: PREFETCHW
+EndTable
+
+GrpTable: GrpPDLK
+0: MONTMUL
+1: XSHA1
+2: XSHA2
+EndTable
+
+GrpTable: GrpRNG
+0: xstore-rng
+1: xcrypt-ecb
+2: xcrypt-cbc
+4: xcrypt-cfb
+5: xcrypt-ofb
+EndTable
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index eefdeee8a87..06630d26e56 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,10 @@
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
- pat.o pgtable.o gup.o
+ pat.o pgtable.o physaddr.o gup.o setup_nx.o
+
+# Make sure __phys_addr has no stackprotector
+nostackp := $(call cc-option, -fno-stack-protector)
+CFLAGS_physaddr.o := $(nostackp)
+CFLAGS_setup_nx.o := $(nostackp)
obj-$(CONFIG_SMP) += tlb.o
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 61b41ca3b5a..d0474ad2a6e 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -35,34 +35,3 @@ int fixup_exception(struct pt_regs *regs)
return 0;
}
-
-#ifdef CONFIG_X86_64
-/*
- * Need to defined our own search_extable on X86_64 to work around
- * a B stepping K8 bug.
- */
-const struct exception_table_entry *
-search_extable(const struct exception_table_entry *first,
- const struct exception_table_entry *last,
- unsigned long value)
-{
- /* B stepping K8 bug */
- if ((value >> 32) == 0)
- value |= 0xffffffffUL << 32;
-
- while (first <= last) {
- const struct exception_table_entry *mid;
- long diff;
-
- mid = (last - first) / 2 + first;
- diff = mid->insn - value;
- if (diff == 0)
- return mid;
- else if (diff < 0)
- first = mid+1;
- else
- last = mid-1;
- }
- return NULL;
-}
-#endif
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 775a020990a..8f4e2ac9392 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -10,7 +10,7 @@
#include <linux/bootmem.h> /* max_low_pfn */
#include <linux/kprobes.h> /* __kprobes, ... */
#include <linux/mmiotrace.h> /* kmmio_handler, ... */
-#include <linux/perf_counter.h> /* perf_swcounter_event */
+#include <linux/perf_event.h> /* perf_sw_event */
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
@@ -38,7 +38,8 @@ enum x86_pf_error_code {
* Returns 0 if mmiotrace is disabled, or if the fault is not
* handled by mmiotrace:
*/
-static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
+static inline int __kprobes
+kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
if (unlikely(is_kmmio_active()))
if (kmmio_handler(regs, addr) == 1)
@@ -46,7 +47,7 @@ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
return 0;
}
-static inline int notify_page_fault(struct pt_regs *regs)
+static inline int __kprobes notify_page_fault(struct pt_regs *regs)
{
int ret = 0;
@@ -167,6 +168,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
info.si_errno = 0;
info.si_code = si_code;
info.si_addr = (void __user *)address;
+ info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;
force_sig_info(si_signo, &info, tsk);
}
@@ -239,7 +241,7 @@ void vmalloc_sync_all(void)
*
* Handle a fault on the vmalloc or module mapping area
*/
-static noinline int vmalloc_fault(unsigned long address)
+static noinline __kprobes int vmalloc_fault(unsigned long address)
{
unsigned long pgd_paddr;
pmd_t *pmd_k;
@@ -356,7 +358,7 @@ void vmalloc_sync_all(void)
*
* This assumes no large pages in there.
*/
-static noinline int vmalloc_fault(unsigned long address)
+static noinline __kprobes int vmalloc_fault(unsigned long address)
{
pgd_t *pgd, *pgd_ref;
pud_t *pud, *pud_ref;
@@ -790,10 +792,12 @@ out_of_memory(struct pt_regs *regs, unsigned long error_code,
}
static void
-do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
+ unsigned int fault)
{
struct task_struct *tsk = current;
struct mm_struct *mm = tsk->mm;
+ int code = BUS_ADRERR;
up_read(&mm->mmap_sem);
@@ -809,7 +813,15 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
tsk->thread.error_code = error_code;
tsk->thread.trap_no = 14;
- force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
+#ifdef CONFIG_MEMORY_FAILURE
+ if (fault & VM_FAULT_HWPOISON) {
+ printk(KERN_ERR
+ "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
+ tsk->comm, tsk->pid, address);
+ code = BUS_MCEERR_AR;
+ }
+#endif
+ force_sig_info_fault(SIGBUS, code, address, tsk);
}
static noinline void
@@ -819,8 +831,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
if (fault & VM_FAULT_OOM) {
out_of_memory(regs, error_code, address);
} else {
- if (fault & VM_FAULT_SIGBUS)
- do_sigbus(regs, error_code, address);
+ if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
+ do_sigbus(regs, error_code, address, fault);
else
BUG();
}
@@ -849,7 +861,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
* There are no security implications to leaving a stale TLB when
* increasing the permissions on a page.
*/
-static noinline int
+static noinline __kprobes int
spurious_fault(unsigned long error_code, unsigned long address)
{
pgd_t *pgd;
@@ -1017,7 +1029,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
if (unlikely(error_code & PF_RSVD))
pgtable_bad(regs, error_code, address);
- perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
/*
* If we're in an interrupt, have no user context or are running
@@ -1114,11 +1126,11 @@ good_area:
if (fault & VM_FAULT_MAJOR) {
tsk->maj_flt++;
- perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
regs, address);
} else {
tsk->min_flt++;
- perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
regs, address);
}
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 1617958a380..63a6ba66cbe 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -104,6 +104,7 @@ EXPORT_SYMBOL(kunmap);
EXPORT_SYMBOL(kmap_atomic);
EXPORT_SYMBOL(kunmap_atomic);
EXPORT_SYMBOL(kmap_atomic_prot);
+EXPORT_SYMBOL(kmap_atomic_to_page);
void __init set_highmem_pages_init(void)
{
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 0607119cef9..73ffd5536f6 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -28,69 +28,6 @@ int direct_gbpages
#endif
;
-int nx_enabled;
-
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
-static int disable_nx __cpuinitdata;
-
-/*
- * noexec = on|off
- *
- * Control non-executable mappings for processes.
- *
- * on Enable
- * off Disable
- */
-static int __init noexec_setup(char *str)
-{
- if (!str)
- return -EINVAL;
- if (!strncmp(str, "on", 2)) {
- __supported_pte_mask |= _PAGE_NX;
- disable_nx = 0;
- } else if (!strncmp(str, "off", 3)) {
- disable_nx = 1;
- __supported_pte_mask &= ~_PAGE_NX;
- }
- return 0;
-}
-early_param("noexec", noexec_setup);
-#endif
-
-#ifdef CONFIG_X86_PAE
-static void __init set_nx(void)
-{
- unsigned int v[4], l, h;
-
- if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
- cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
-
- if ((v[3] & (1 << 20)) && !disable_nx) {
- rdmsr(MSR_EFER, l, h);
- l |= EFER_NX;
- wrmsr(MSR_EFER, l, h);
- nx_enabled = 1;
- __supported_pte_mask |= _PAGE_NX;
- }
- }
-}
-#else
-static inline void set_nx(void)
-{
-}
-#endif
-
-#ifdef CONFIG_X86_64
-void __cpuinit check_efer(void)
-{
- unsigned long efer;
-
- rdmsrl(MSR_EFER, efer);
- if (!(efer & EFER_NX) || disable_nx)
- __supported_pte_mask &= ~_PAGE_NX;
-}
-#endif
-
static void __init find_early_table_space(unsigned long end, int use_pse,
int use_gbpages)
{
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 3cd7711bb94..30938c1d8d5 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -84,7 +84,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
#ifdef CONFIG_X86_PAE
if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
if (after_bootmem)
- pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+ pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE);
else
pmd_table = (pmd_t *)alloc_low_page();
paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
@@ -116,7 +116,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
#endif
if (!page_table)
page_table =
- (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+ (pte_t *)alloc_bootmem_pages(PAGE_SIZE);
} else
page_table = (pte_t *)alloc_low_page();
@@ -857,8 +857,6 @@ static void __init test_wp_bit(void)
}
}
-static struct kcore_list kcore_mem, kcore_vmalloc;
-
void __init mem_init(void)
{
int codesize, reservedpages, datasize, initsize;
@@ -886,13 +884,9 @@ void __init mem_init(void)
datasize = (unsigned long) &_edata - (unsigned long) &_etext;
initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
- kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
- kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
- VMALLOC_END-VMALLOC_START);
-
printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
"%dk reserved, %dk data, %dk init, %ldk highmem)\n",
- (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
+ nr_free_pages() << (PAGE_SHIFT-10),
num_physpages << (PAGE_SHIFT-10),
codesize >> 10,
reservedpages << (PAGE_SHIFT-10),
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ea56b8cbb6a..5a4398a6006 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -647,8 +647,7 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif /* CONFIG_MEMORY_HOTPLUG */
-static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
- kcore_modules, kcore_vsyscall;
+static struct kcore_list kcore_vsyscall;
void __init mem_init(void)
{
@@ -677,17 +676,12 @@ void __init mem_init(void)
initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
/* Register memory areas for /proc/kcore */
- kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
- kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
- VMALLOC_END-VMALLOC_START);
- kclist_add(&kcore_kernel, &_stext, _end - _stext);
- kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
- VSYSCALL_END - VSYSCALL_START);
+ VSYSCALL_END - VSYSCALL_START, KCORE_OTHER);
printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
"%ldk absent, %ldk reserved, %ldk data, %ldk init)\n",
- (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
+ nr_free_pages() << (PAGE_SHIFT-10),
max_pfn << (PAGE_SHIFT-10),
codesize >> 10,
absent_pages << (PAGE_SHIFT-10),
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index fe6f84ca121..84e236ce76b 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -21,7 +21,7 @@
#include <linux/module.h>
#include <linux/highmem.h>
-int is_io_mapping_possible(resource_size_t base, unsigned long size)
+static int is_io_mapping_possible(resource_size_t base, unsigned long size)
{
#if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT)
/* There is no way to map greater than 1 << 32 address without PAE */
@@ -30,7 +30,30 @@ int is_io_mapping_possible(resource_size_t base, unsigned long size)
#endif
return 1;
}
-EXPORT_SYMBOL_GPL(is_io_mapping_possible);
+
+int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot)
+{
+ unsigned long flag = _PAGE_CACHE_WC;
+ int ret;
+
+ if (!is_io_mapping_possible(base, size))
+ return -EINVAL;
+
+ ret = io_reserve_memtype(base, base + size, &flag);
+ if (ret)
+ return ret;
+
+ *prot = __pgprot(__PAGE_KERNEL | flag);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_create_wc);
+
+void
+iomap_free(resource_size_t base, unsigned long size)
+{
+ io_free_memtype(base, base + size);
+}
+EXPORT_SYMBOL_GPL(iomap_free);
void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
{
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 8a450930834..2feb9bdedaa 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -22,77 +22,7 @@
#include <asm/pgalloc.h>
#include <asm/pat.h>
-static inline int phys_addr_valid(resource_size_t addr)
-{
-#ifdef CONFIG_PHYS_ADDR_T_64BIT
- return !(addr >> boot_cpu_data.x86_phys_bits);
-#else
- return 1;
-#endif
-}
-
-#ifdef CONFIG_X86_64
-
-unsigned long __phys_addr(unsigned long x)
-{
- if (x >= __START_KERNEL_map) {
- x -= __START_KERNEL_map;
- VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE);
- x += phys_base;
- } else {
- VIRTUAL_BUG_ON(x < PAGE_OFFSET);
- x -= PAGE_OFFSET;
- VIRTUAL_BUG_ON(!phys_addr_valid(x));
- }
- return x;
-}
-EXPORT_SYMBOL(__phys_addr);
-
-bool __virt_addr_valid(unsigned long x)
-{
- if (x >= __START_KERNEL_map) {
- x -= __START_KERNEL_map;
- if (x >= KERNEL_IMAGE_SIZE)
- return false;
- x += phys_base;
- } else {
- if (x < PAGE_OFFSET)
- return false;
- x -= PAGE_OFFSET;
- if (!phys_addr_valid(x))
- return false;
- }
-
- return pfn_valid(x >> PAGE_SHIFT);
-}
-EXPORT_SYMBOL(__virt_addr_valid);
-
-#else
-
-#ifdef CONFIG_DEBUG_VIRTUAL
-unsigned long __phys_addr(unsigned long x)
-{
- /* VMALLOC_* aren't constants */
- VIRTUAL_BUG_ON(x < PAGE_OFFSET);
- VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
- return x - PAGE_OFFSET;
-}
-EXPORT_SYMBOL(__phys_addr);
-#endif
-
-bool __virt_addr_valid(unsigned long x)
-{
- if (x < PAGE_OFFSET)
- return false;
- if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
- return false;
- if (x >= FIXADDR_START)
- return false;
- return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
-}
-EXPORT_SYMBOL(__virt_addr_valid);
-
-#endif
+#include "physaddr.h"
int page_is_ram(unsigned long pagenr)
{
@@ -228,30 +158,19 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
prot_val, &new_prot_val);
if (retval) {
- pr_debug("Warning: reserve_memtype returned %d\n", retval);
+ printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval);
return NULL;
}
if (prot_val != new_prot_val) {
- /*
- * Do not fallback to certain memory types with certain
- * requested type:
- * - request is uc-, return cannot be write-back
- * - request is uc-, return cannot be write-combine
- * - request is write-combine, return cannot be write-back
- */
- if ((prot_val == _PAGE_CACHE_UC_MINUS &&
- (new_prot_val == _PAGE_CACHE_WB ||
- new_prot_val == _PAGE_CACHE_WC)) ||
- (prot_val == _PAGE_CACHE_WC &&
- new_prot_val == _PAGE_CACHE_WB)) {
- pr_debug(
+ if (!is_new_memtype_allowed(phys_addr, size,
+ prot_val, new_prot_val)) {
+ printk(KERN_ERR
"ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n",
(unsigned long long)phys_addr,
(unsigned long long)(phys_addr + size),
prot_val, new_prot_val);
- free_memtype(phys_addr, phys_addr + size);
- return NULL;
+ goto err_free_memtype;
}
prot_val = new_prot_val;
}
@@ -277,26 +196,25 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
*/
area = get_vm_area_caller(size, VM_IOREMAP, caller);
if (!area)
- return NULL;
+ goto err_free_memtype;
area->phys_addr = phys_addr;
vaddr = (unsigned long) area->addr;
- if (kernel_map_sync_memtype(phys_addr, size, prot_val)) {
- free_memtype(phys_addr, phys_addr + size);
- free_vm_area(area);
- return NULL;
- }
+ if (kernel_map_sync_memtype(phys_addr, size, prot_val))
+ goto err_free_area;
- if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) {
- free_memtype(phys_addr, phys_addr + size);
- free_vm_area(area);
- return NULL;
- }
+ if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot))
+ goto err_free_area;
ret_addr = (void __iomem *) (vaddr + offset);
mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
return ret_addr;
+err_free_area:
+ free_vm_area(area);
+err_free_memtype:
+ free_memtype(phys_addr, phys_addr + size);
+ return NULL;
}
/**
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
index 528bf954eb7..8cc18334414 100644
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -225,9 +225,6 @@ void kmemcheck_hide(struct pt_regs *regs)
BUG_ON(!irqs_disabled());
- if (data->balance == 0)
- return;
-
if (unlikely(data->balance != 1)) {
kmemcheck_show_all();
kmemcheck_error_save_bug(regs);
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
index e773b6bd007..3f66b82076a 100644
--- a/arch/x86/mm/kmemcheck/shadow.c
+++ b/arch/x86/mm/kmemcheck/shadow.c
@@ -1,7 +1,6 @@
#include <linux/kmemcheck.h>
#include <linux/module.h>
#include <linux/mm.h>
-#include <linux/module.h>
#include <asm/page.h>
#include <asm/pgtable.h>
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 16ccbd77917..11a4ad4d625 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -540,8 +540,14 @@ kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
struct die_args *arg = args;
if (val == DIE_DEBUG && (arg->err & DR_STEP))
- if (post_kmmio_handler(arg->err, arg->regs) == 1)
+ if (post_kmmio_handler(arg->err, arg->regs) == 1) {
+ /*
+ * Reset the BS bit in dr6 (pointed by args->err) to
+ * denote completion of processing
+ */
+ (*(unsigned long *)ERR_PTR(arg->err)) &= ~DR_STEP;
return NOTIFY_STOP;
+ }
return NOTIFY_DONE;
}
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 16582960056..c8191defc38 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -29,13 +29,26 @@
#include <linux/random.h>
#include <linux/limits.h>
#include <linux/sched.h>
+#include <asm/elf.h>
+
+static unsigned int stack_maxrandom_size(void)
+{
+ unsigned int max = 0;
+ if ((current->flags & PF_RANDOMIZE) &&
+ !(current->personality & ADDR_NO_RANDOMIZE)) {
+ max = ((-1U) & STACK_RND_MASK) << PAGE_SHIFT;
+ }
+
+ return max;
+}
+
/*
* Top of mmap area (just below the process stack).
*
- * Leave an at least ~128 MB hole.
+ * Leave an at least ~128 MB hole with possible stack randomization.
*/
-#define MIN_GAP (128*1024*1024)
+#define MIN_GAP (128*1024*1024UL + stack_maxrandom_size())
#define MAX_GAP (TASK_SIZE/6*5)
/*
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 7e600c1962d..dd38bfbefd1 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -12,6 +12,7 @@
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/pfn.h>
+#include <linux/percpu.h>
#include <asm/e820.h>
#include <asm/processor.h>
@@ -143,6 +144,7 @@ void clflush_cache_range(void *vaddr, unsigned int size)
mb();
}
+EXPORT_SYMBOL_GPL(clflush_cache_range);
static void __cpa_flush_all(void *arg)
{
@@ -686,7 +688,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
{
struct cpa_data alias_cpa;
unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
- unsigned long vaddr, remapped;
+ unsigned long vaddr;
int ret;
if (cpa->pfn >= max_pfn_mapped)
@@ -744,24 +746,6 @@ static int cpa_process_alias(struct cpa_data *cpa)
}
#endif
- /*
- * If the PMD page was partially used for per-cpu remapping,
- * the recycled area needs to be split and modified. Because
- * the area is always proper subset of a PMD page
- * cpa->numpages is guaranteed to be 1 for these areas, so
- * there's no need to loop over and check for further remaps.
- */
- remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr);
- if (remapped) {
- WARN_ON(cpa->numpages > 1);
- alias_cpa = *cpa;
- alias_cpa.vaddr = &remapped;
- alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
- ret = __change_page_attr_set_clr(&alias_cpa, 0);
- if (ret)
- return ret;
- }
-
return 0;
}
@@ -822,6 +806,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
{
struct cpa_data cpa;
int ret, cache, checkalias;
+ unsigned long baddr = 0;
/*
* Check, if we are requested to change a not supported
@@ -853,6 +838,11 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
*/
WARN_ON_ONCE(1);
}
+ /*
+ * Save address for cache flush. *addr is modified in the call
+ * to __change_page_attr_set_clr() below.
+ */
+ baddr = *addr;
}
/* Must avoid aliasing mappings in the highmem code */
@@ -900,7 +890,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
cpa_flush_array(addr, numpages, cache,
cpa.flags, pages);
} else
- cpa_flush_range(*addr, numpages, cache);
+ cpa_flush_range(baddr, numpages, cache);
} else
cpa_flush_all(cache);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index b2f7d3e59b8..e78cd0ec2bc 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -15,6 +15,7 @@
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/fs.h>
+#include <linux/rbtree.h>
#include <asm/cacheflush.h>
#include <asm/processor.h>
@@ -80,6 +81,7 @@ enum {
void pat_init(void)
{
u64 pat;
+ bool boot_cpu = !boot_pat_state;
if (!pat_enabled)
return;
@@ -121,8 +123,10 @@ void pat_init(void)
rdmsrl(MSR_IA32_CR_PAT, boot_pat_state);
wrmsrl(MSR_IA32_CR_PAT, pat);
- printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n",
- smp_processor_id(), boot_pat_state, pat);
+
+ if (boot_cpu)
+ printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n",
+ smp_processor_id(), boot_pat_state, pat);
}
#undef PAT
@@ -148,11 +152,10 @@ static char *cattr_name(unsigned long flags)
* areas). All the aliases have the same cache attributes of course.
* Zero attributes are represented as holes.
*
- * Currently the data structure is a list because the number of mappings
- * are expected to be relatively small. If this should be a problem
- * it could be changed to a rbtree or similar.
+ * The data structure is a list that is also organized as an rbtree
+ * sorted on the start address of memtype range.
*
- * memtype_lock protects the whole list.
+ * memtype_lock protects both the linear list and rbtree.
*/
struct memtype {
@@ -160,11 +163,53 @@ struct memtype {
u64 end;
unsigned long type;
struct list_head nd;
+ struct rb_node rb;
};
+static struct rb_root memtype_rbroot = RB_ROOT;
static LIST_HEAD(memtype_list);
static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
+static struct memtype *memtype_rb_search(struct rb_root *root, u64 start)
+{
+ struct rb_node *node = root->rb_node;
+ struct memtype *last_lower = NULL;
+
+ while (node) {
+ struct memtype *data = container_of(node, struct memtype, rb);
+
+ if (data->start < start) {
+ last_lower = data;
+ node = node->rb_right;
+ } else if (data->start > start) {
+ node = node->rb_left;
+ } else
+ return data;
+ }
+
+ /* Will return NULL if there is no entry with its start <= start */
+ return last_lower;
+}
+
+static void memtype_rb_insert(struct rb_root *root, struct memtype *data)
+{
+ struct rb_node **new = &(root->rb_node);
+ struct rb_node *parent = NULL;
+
+ while (*new) {
+ struct memtype *this = container_of(*new, struct memtype, rb);
+
+ parent = *new;
+ if (data->start <= this->start)
+ new = &((*new)->rb_left);
+ else if (data->start > this->start)
+ new = &((*new)->rb_right);
+ }
+
+ rb_link_node(&data->rb, parent, new);
+ rb_insert_color(&data->rb, root);
+}
+
/*
* Does intersection of PAT memory type and MTRR memory type and returns
* the resulting memory type as PAT understands it.
@@ -218,9 +263,6 @@ chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type)
return -EBUSY;
}
-static struct memtype *cached_entry;
-static u64 cached_start;
-
static int pat_pagerange_is_ram(unsigned long start, unsigned long end)
{
int ram_page = 0, not_rampage = 0;
@@ -249,63 +291,61 @@ static int pat_pagerange_is_ram(unsigned long start, unsigned long end)
}
/*
- * For RAM pages, mark the pages as non WB memory type using
- * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or
- * set_memory_wc() on a RAM page at a time before marking it as WB again.
- * This is ok, because only one driver will be owning the page and
- * doing set_memory_*() calls.
+ * For RAM pages, we use page flags to mark the pages with appropriate type.
+ * Here we do two pass:
+ * - Find the memtype of all the pages in the range, look for any conflicts
+ * - In case of no conflicts, set the new memtype for pages in the range
*
- * For now, we use PageNonWB to track that the RAM page is being mapped
- * as non WB. In future, we will have to use one more flag
- * (or some other mechanism in page_struct) to distinguish between
- * UC and WC mapping.
+ * Caller must hold memtype_lock for atomicity.
*/
static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
unsigned long *new_type)
{
struct page *page;
- u64 pfn, end_pfn;
+ u64 pfn;
+
+ if (req_type == _PAGE_CACHE_UC) {
+ /* We do not support strong UC */
+ WARN_ON_ONCE(1);
+ req_type = _PAGE_CACHE_UC_MINUS;
+ }
for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
- page = pfn_to_page(pfn);
- if (page_mapped(page) || PageNonWB(page))
- goto out;
+ unsigned long type;
- SetPageNonWB(page);
+ page = pfn_to_page(pfn);
+ type = get_page_memtype(page);
+ if (type != -1) {
+ printk(KERN_INFO "reserve_ram_pages_type failed "
+ "0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n",
+ start, end, type, req_type);
+ if (new_type)
+ *new_type = type;
+
+ return -EBUSY;
+ }
}
- return 0;
-out:
- end_pfn = pfn;
- for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
+ if (new_type)
+ *new_type = req_type;
+
+ for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
page = pfn_to_page(pfn);
- ClearPageNonWB(page);
+ set_page_memtype(page, req_type);
}
-
- return -EINVAL;
+ return 0;
}
static int free_ram_pages_type(u64 start, u64 end)
{
struct page *page;
- u64 pfn, end_pfn;
+ u64 pfn;
for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
page = pfn_to_page(pfn);
- if (page_mapped(page) || !PageNonWB(page))
- goto out;
-
- ClearPageNonWB(page);
+ set_page_memtype(page, -1);
}
return 0;
-
-out:
- end_pfn = pfn;
- for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
- page = pfn_to_page(pfn);
- SetPageNonWB(page);
- }
- return -EINVAL;
}
/*
@@ -339,6 +379,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
if (new_type) {
if (req_type == -1)
*new_type = _PAGE_CACHE_WB;
+ else if (req_type == _PAGE_CACHE_WC)
+ *new_type = _PAGE_CACHE_UC_MINUS;
else
*new_type = req_type & _PAGE_CACHE_MASK;
}
@@ -364,11 +406,16 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
*new_type = actual_type;
is_range_ram = pat_pagerange_is_ram(start, end);
- if (is_range_ram == 1)
- return reserve_ram_pages_type(start, end, req_type,
- new_type);
- else if (is_range_ram < 0)
+ if (is_range_ram == 1) {
+
+ spin_lock(&memtype_lock);
+ err = reserve_ram_pages_type(start, end, req_type, new_type);
+ spin_unlock(&memtype_lock);
+
+ return err;
+ } else if (is_range_ram < 0) {
return -EINVAL;
+ }
new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
if (!new)
@@ -380,17 +427,11 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
spin_lock(&memtype_lock);
- if (cached_entry && start >= cached_start)
- entry = cached_entry;
- else
- entry = list_entry(&memtype_list, struct memtype, nd);
-
/* Search for existing mapping that overlaps the current range */
where = NULL;
- list_for_each_entry_continue(entry, &memtype_list, nd) {
+ list_for_each_entry(entry, &memtype_list, nd) {
if (end <= entry->start) {
where = entry->nd.prev;
- cached_entry = list_entry(where, struct memtype, nd);
break;
} else if (start <= entry->start) { /* end > entry->start */
err = chk_conflict(new, entry, new_type);
@@ -398,8 +439,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
dprintk("Overlap at 0x%Lx-0x%Lx\n",
entry->start, entry->end);
where = entry->nd.prev;
- cached_entry = list_entry(where,
- struct memtype, nd);
}
break;
} else if (start < entry->end) { /* start > entry->start */
@@ -407,8 +446,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
if (!err) {
dprintk("Overlap at 0x%Lx-0x%Lx\n",
entry->start, entry->end);
- cached_entry = list_entry(entry->nd.prev,
- struct memtype, nd);
/*
* Move to right position in the linked
@@ -436,13 +473,13 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
return err;
}
- cached_start = start;
-
if (where)
list_add(&new->nd, where);
else
list_add_tail(&new->nd, &memtype_list);
+ memtype_rb_insert(&memtype_rbroot, new);
+
spin_unlock(&memtype_lock);
dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
@@ -454,7 +491,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
int free_memtype(u64 start, u64 end)
{
- struct memtype *entry;
+ struct memtype *entry, *saved_entry;
int err = -EINVAL;
int is_range_ram;
@@ -466,23 +503,58 @@ int free_memtype(u64 start, u64 end)
return 0;
is_range_ram = pat_pagerange_is_ram(start, end);
- if (is_range_ram == 1)
- return free_ram_pages_type(start, end);
- else if (is_range_ram < 0)
+ if (is_range_ram == 1) {
+
+ spin_lock(&memtype_lock);
+ err = free_ram_pages_type(start, end);
+ spin_unlock(&memtype_lock);
+
+ return err;
+ } else if (is_range_ram < 0) {
return -EINVAL;
+ }
spin_lock(&memtype_lock);
- list_for_each_entry(entry, &memtype_list, nd) {
+
+ entry = memtype_rb_search(&memtype_rbroot, start);
+ if (unlikely(entry == NULL))
+ goto unlock_ret;
+
+ /*
+ * Saved entry points to an entry with start same or less than what
+ * we searched for. Now go through the list in both directions to look
+ * for the entry that matches with both start and end, with list stored
+ * in sorted start address
+ */
+ saved_entry = entry;
+ list_for_each_entry_from(entry, &memtype_list, nd) {
if (entry->start == start && entry->end == end) {
- if (cached_entry == entry || cached_start == start)
- cached_entry = NULL;
+ rb_erase(&entry->rb, &memtype_rbroot);
+ list_del(&entry->nd);
+ kfree(entry);
+ err = 0;
+ break;
+ } else if (entry->start > start) {
+ break;
+ }
+ }
+ if (!err)
+ goto unlock_ret;
+
+ entry = saved_entry;
+ list_for_each_entry_reverse(entry, &memtype_list, nd) {
+ if (entry->start == start && entry->end == end) {
+ rb_erase(&entry->rb, &memtype_rbroot);
list_del(&entry->nd);
kfree(entry);
err = 0;
break;
+ } else if (entry->start < start) {
+ break;
}
}
+unlock_ret:
spin_unlock(&memtype_lock);
if (err) {
@@ -496,6 +568,101 @@ int free_memtype(u64 start, u64 end)
}
+/**
+ * lookup_memtype - Looksup the memory type for a physical address
+ * @paddr: physical address of which memory type needs to be looked up
+ *
+ * Only to be called when PAT is enabled
+ *
+ * Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or
+ * _PAGE_CACHE_UC
+ */
+static unsigned long lookup_memtype(u64 paddr)
+{
+ int rettype = _PAGE_CACHE_WB;
+ struct memtype *entry;
+
+ if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1))
+ return rettype;
+
+ if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
+ struct page *page;
+ spin_lock(&memtype_lock);
+ page = pfn_to_page(paddr >> PAGE_SHIFT);
+ rettype = get_page_memtype(page);
+ spin_unlock(&memtype_lock);
+ /*
+ * -1 from get_page_memtype() implies RAM page is in its
+ * default state and not reserved, and hence of type WB
+ */
+ if (rettype == -1)
+ rettype = _PAGE_CACHE_WB;
+
+ return rettype;
+ }
+
+ spin_lock(&memtype_lock);
+
+ entry = memtype_rb_search(&memtype_rbroot, paddr);
+ if (entry != NULL)
+ rettype = entry->type;
+ else
+ rettype = _PAGE_CACHE_UC_MINUS;
+
+ spin_unlock(&memtype_lock);
+ return rettype;
+}
+
+/**
+ * io_reserve_memtype - Request a memory type mapping for a region of memory
+ * @start: start (physical address) of the region
+ * @end: end (physical address) of the region
+ * @type: A pointer to memtype, with requested type. On success, requested
+ * or any other compatible type that was available for the region is returned
+ *
+ * On success, returns 0
+ * On failure, returns non-zero
+ */
+int io_reserve_memtype(resource_size_t start, resource_size_t end,
+ unsigned long *type)
+{
+ resource_size_t size = end - start;
+ unsigned long req_type = *type;
+ unsigned long new_type;
+ int ret;
+
+ WARN_ON_ONCE(iomem_map_sanity_check(start, size));
+
+ ret = reserve_memtype(start, end, req_type, &new_type);
+ if (ret)
+ goto out_err;
+
+ if (!is_new_memtype_allowed(start, size, req_type, new_type))
+ goto out_free;
+
+ if (kernel_map_sync_memtype(start, size, new_type) < 0)
+ goto out_free;
+
+ *type = new_type;
+ return 0;
+
+out_free:
+ free_memtype(start, end);
+ ret = -EBUSY;
+out_err:
+ return ret;
+}
+
+/**
+ * io_free_memtype - Release a memory type mapping for a region of memory
+ * @start: start (physical address) of the region
+ * @end: end (physical address) of the region
+ */
+void io_free_memtype(resource_size_t start, resource_size_t end)
+{
+ free_memtype(start, end);
+}
+
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t vma_prot)
{
@@ -577,7 +744,7 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
{
unsigned long id_sz;
- if (!pat_enabled || base >= __pa(high_memory))
+ if (base >= __pa(high_memory))
return 0;
id_sz = (__pa(high_memory) < base + size) ?
@@ -612,11 +779,29 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
is_ram = pat_pagerange_is_ram(paddr, paddr + size);
/*
- * reserve_pfn_range() doesn't support RAM pages. Maintain the current
- * behavior with RAM pages by returning success.
+ * reserve_pfn_range() for RAM pages. We do not refcount to keep
+ * track of number of mappings of RAM pages. We can assert that
+ * the type requested matches the type of first page in the range.
*/
- if (is_ram != 0)
+ if (is_ram) {
+ if (!pat_enabled)
+ return 0;
+
+ flags = lookup_memtype(paddr);
+ if (want_flags != flags) {
+ printk(KERN_WARNING
+ "%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n",
+ current->comm, current->pid,
+ cattr_name(want_flags),
+ (unsigned long long)paddr,
+ (unsigned long long)(paddr + size),
+ cattr_name(flags));
+ *vma_prot = __pgprot((pgprot_val(*vma_prot) &
+ (~_PAGE_CACHE_MASK)) |
+ flags);
+ }
return 0;
+ }
ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
if (ret)
@@ -678,14 +863,6 @@ int track_pfn_vma_copy(struct vm_area_struct *vma)
unsigned long vma_size = vma->vm_end - vma->vm_start;
pgprot_t pgprot;
- if (!pat_enabled)
- return 0;
-
- /*
- * For now, only handle remap_pfn_range() vmas where
- * is_linear_pfn_mapping() == TRUE. Handling of
- * vm_insert_pfn() is TBD.
- */
if (is_linear_pfn_mapping(vma)) {
/*
* reserve the whole chunk covered by vma. We need the
@@ -713,23 +890,24 @@ int track_pfn_vma_copy(struct vm_area_struct *vma)
int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,
unsigned long pfn, unsigned long size)
{
+ unsigned long flags;
resource_size_t paddr;
unsigned long vma_size = vma->vm_end - vma->vm_start;
- if (!pat_enabled)
- return 0;
-
- /*
- * For now, only handle remap_pfn_range() vmas where
- * is_linear_pfn_mapping() == TRUE. Handling of
- * vm_insert_pfn() is TBD.
- */
if (is_linear_pfn_mapping(vma)) {
/* reserve the whole chunk starting from vm_pgoff */
paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
return reserve_pfn_range(paddr, vma_size, prot, 0);
}
+ if (!pat_enabled)
+ return 0;
+
+ /* for vm_insert_pfn and friends, we set prot based on lookup */
+ flags = lookup_memtype(pfn << PAGE_SHIFT);
+ *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
+ flags);
+
return 0;
}
@@ -744,14 +922,6 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
resource_size_t paddr;
unsigned long vma_size = vma->vm_end - vma->vm_start;
- if (!pat_enabled)
- return;
-
- /*
- * For now, only handle remap_pfn_range() vmas where
- * is_linear_pfn_mapping() == TRUE. Handling of
- * vm_insert_pfn() is TBD.
- */
if (is_linear_pfn_mapping(vma)) {
/* free the whole chunk starting from vm_pgoff */
paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
new file mode 100644
index 00000000000..d2e2735327b
--- /dev/null
+++ b/arch/x86/mm/physaddr.c
@@ -0,0 +1,70 @@
+#include <linux/mmdebug.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+
+#include <asm/page.h>
+
+#include "physaddr.h"
+
+#ifdef CONFIG_X86_64
+
+unsigned long __phys_addr(unsigned long x)
+{
+ if (x >= __START_KERNEL_map) {
+ x -= __START_KERNEL_map;
+ VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE);
+ x += phys_base;
+ } else {
+ VIRTUAL_BUG_ON(x < PAGE_OFFSET);
+ x -= PAGE_OFFSET;
+ VIRTUAL_BUG_ON(!phys_addr_valid(x));
+ }
+ return x;
+}
+EXPORT_SYMBOL(__phys_addr);
+
+bool __virt_addr_valid(unsigned long x)
+{
+ if (x >= __START_KERNEL_map) {
+ x -= __START_KERNEL_map;
+ if (x >= KERNEL_IMAGE_SIZE)
+ return false;
+ x += phys_base;
+ } else {
+ if (x < PAGE_OFFSET)
+ return false;
+ x -= PAGE_OFFSET;
+ if (!phys_addr_valid(x))
+ return false;
+ }
+
+ return pfn_valid(x >> PAGE_SHIFT);
+}
+EXPORT_SYMBOL(__virt_addr_valid);
+
+#else
+
+#ifdef CONFIG_DEBUG_VIRTUAL
+unsigned long __phys_addr(unsigned long x)
+{
+ /* VMALLOC_* aren't constants */
+ VIRTUAL_BUG_ON(x < PAGE_OFFSET);
+ VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
+ return x - PAGE_OFFSET;
+}
+EXPORT_SYMBOL(__phys_addr);
+#endif
+
+bool __virt_addr_valid(unsigned long x)
+{
+ if (x < PAGE_OFFSET)
+ return false;
+ if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
+ return false;
+ if (x >= FIXADDR_START)
+ return false;
+ return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
+}
+EXPORT_SYMBOL(__virt_addr_valid);
+
+#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/mm/physaddr.h b/arch/x86/mm/physaddr.h
new file mode 100644
index 00000000000..a3cd5a0c97b
--- /dev/null
+++ b/arch/x86/mm/physaddr.h
@@ -0,0 +1,10 @@
+#include <asm/processor.h>
+
+static inline int phys_addr_valid(resource_size_t addr)
+{
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+ return !(addr >> boot_cpu_data.x86_phys_bits);
+#else
+ return 1;
+#endif
+}
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
new file mode 100644
index 00000000000..513d8ed5d2e
--- /dev/null
+++ b/arch/x86/mm/setup_nx.c
@@ -0,0 +1,69 @@
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+
+#include <asm/pgtable.h>
+
+int nx_enabled;
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+static int disable_nx __cpuinitdata;
+
+/*
+ * noexec = on|off
+ *
+ * Control non-executable mappings for processes.
+ *
+ * on Enable
+ * off Disable
+ */
+static int __init noexec_setup(char *str)
+{
+ if (!str)
+ return -EINVAL;
+ if (!strncmp(str, "on", 2)) {
+ __supported_pte_mask |= _PAGE_NX;
+ disable_nx = 0;
+ } else if (!strncmp(str, "off", 3)) {
+ disable_nx = 1;
+ __supported_pte_mask &= ~_PAGE_NX;
+ }
+ return 0;
+}
+early_param("noexec", noexec_setup);
+#endif
+
+#ifdef CONFIG_X86_PAE
+void __init set_nx(void)
+{
+ unsigned int v[4], l, h;
+
+ if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
+ cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
+
+ if ((v[3] & (1 << 20)) && !disable_nx) {
+ rdmsr(MSR_EFER, l, h);
+ l |= EFER_NX;
+ wrmsr(MSR_EFER, l, h);
+ nx_enabled = 1;
+ __supported_pte_mask |= _PAGE_NX;
+ }
+ }
+}
+#else
+void set_nx(void)
+{
+}
+#endif
+
+#ifdef CONFIG_X86_64
+void __cpuinit check_efer(void)
+{
+ unsigned long efer;
+
+ rdmsrl(MSR_EFER, efer);
+ if (!(efer & EFER_NX) || disable_nx)
+ __supported_pte_mask &= ~_PAGE_NX;
+}
+#endif
+
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index dbb5381f7b3..9d7ce96e5a5 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -136,7 +136,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
apicid_to_node[apic_id] = node;
node_set(node, cpu_nodes_parsed);
acpi_numa = 1;
- printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
+ printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
pxm, apic_id, node);
}
@@ -170,7 +170,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
apicid_to_node[apic_id] = node;
node_set(node, cpu_nodes_parsed);
acpi_numa = 1;
- printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
+ printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
pxm, apic_id, node);
}
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index 427fd1b56df..8565d944f7c 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -1,12 +1,13 @@
/*
* Written by Pekka Paalanen, 2008-2009 <pq@iki.fi>
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/io.h>
#include <linux/mmiotrace.h>
-#define MODULE_NAME "testmmiotrace"
-
static unsigned long mmio_address;
module_param(mmio_address, ulong, 0);
MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB "
@@ -30,7 +31,7 @@ static unsigned v32(unsigned i)
static void do_write_test(void __iomem *p)
{
unsigned int i;
- pr_info(MODULE_NAME ": write test.\n");
+ pr_info("write test.\n");
mmiotrace_printk("Write test.\n");
for (i = 0; i < 256; i++)
@@ -47,7 +48,7 @@ static void do_read_test(void __iomem *p)
{
unsigned int i;
unsigned errs[3] = { 0 };
- pr_info(MODULE_NAME ": read test.\n");
+ pr_info("read test.\n");
mmiotrace_printk("Read test.\n");
for (i = 0; i < 256; i++)
@@ -68,7 +69,7 @@ static void do_read_test(void __iomem *p)
static void do_read_far_test(void __iomem *p)
{
- pr_info(MODULE_NAME ": read far test.\n");
+ pr_info("read far test.\n");
mmiotrace_printk("Read far test.\n");
ioread32(p + read_far);
@@ -78,7 +79,7 @@ static void do_test(unsigned long size)
{
void __iomem *p = ioremap_nocache(mmio_address, size);
if (!p) {
- pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
+ pr_err("could not ioremap, aborting.\n");
return;
}
mmiotrace_printk("ioremap returned %p.\n", p);
@@ -94,24 +95,22 @@ static int __init init(void)
unsigned long size = (read_far) ? (8 << 20) : (16 << 10);
if (mmio_address == 0) {
- pr_err(MODULE_NAME ": you have to use the module argument "
- "mmio_address.\n");
- pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS"
- " YOU REALLY KNOW WHAT YOU ARE DOING!\n");
+ pr_err("you have to use the module argument mmio_address.\n");
+ pr_err("DO NOT LOAD THIS MODULE UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!\n");
return -ENXIO;
}
- pr_warning(MODULE_NAME ": WARNING: mapping %lu kB @ 0x%08lx in PCI "
- "address space, and writing 16 kB of rubbish in there.\n",
- size >> 10, mmio_address);
+ pr_warning("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, "
+ "and writing 16 kB of rubbish in there.\n",
+ size >> 10, mmio_address);
do_test(size);
- pr_info(MODULE_NAME ": All done.\n");
+ pr_info("All done.\n");
return 0;
}
static void __exit cleanup(void)
{
- pr_debug(MODULE_NAME ": unloaded.\n");
+ pr_debug("unloaded.\n");
}
module_init(init);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index c814e144a3f..36fe08eeb5c 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -59,7 +59,8 @@ void leave_mm(int cpu)
{
if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
BUG();
- cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask);
+ cpumask_clear_cpu(cpu,
+ mm_cpumask(percpu_read(cpu_tlbstate.active_mm)));
load_cr3(swapper_pg_dir);
}
EXPORT_SYMBOL_GPL(leave_mm);
@@ -234,8 +235,8 @@ void flush_tlb_current_task(void)
preempt_disable();
local_flush_tlb();
- if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
+ if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
+ flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
preempt_enable();
}
@@ -249,8 +250,8 @@ void flush_tlb_mm(struct mm_struct *mm)
else
leave_mm(smp_processor_id());
}
- if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
+ if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
+ flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
preempt_enable();
}
@@ -268,8 +269,8 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
leave_mm(smp_processor_id());
}
- if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(&mm->cpu_vm_mask, mm, va);
+ if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
+ flush_tlb_others(mm_cpumask(mm), mm, va);
preempt_enable();
}
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 4899215999d..8eb05878554 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -234,11 +234,11 @@ static void arch_perfmon_setup_counters(void)
if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 &&
current_cpu_data.x86_model == 15) {
eax.split.version_id = 2;
- eax.split.num_counters = 2;
+ eax.split.num_events = 2;
eax.split.bit_width = 40;
}
- num_counters = eax.split.num_counters;
+ num_counters = eax.split.num_events;
op_arch_perfmon_spec.num_counters = num_counters;
op_arch_perfmon_spec.num_controls = num_counters;
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index b83776180c7..7b8e75d1608 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -13,7 +13,7 @@
#define OP_X86_MODEL_H
#include <asm/types.h>
-#include <asm/perf_counter.h>
+#include <asm/perf_event.h>
struct op_msr {
unsigned long addr;
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 3ffa10df20b..572ee9782f2 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -15,63 +15,6 @@
* also get peer root bus resource for io,mmio
*/
-#ifdef CONFIG_NUMA
-
-#define BUS_NR 256
-
-#ifdef CONFIG_X86_64
-
-static int mp_bus_to_node[BUS_NR];
-
-void set_mp_bus_to_node(int busnum, int node)
-{
- if (busnum >= 0 && busnum < BUS_NR)
- mp_bus_to_node[busnum] = node;
-}
-
-int get_mp_bus_to_node(int busnum)
-{
- int node = -1;
-
- if (busnum < 0 || busnum > (BUS_NR - 1))
- return node;
-
- node = mp_bus_to_node[busnum];
-
- /*
- * let numa_node_id to decide it later in dma_alloc_pages
- * if there is no ram on that node
- */
- if (node != -1 && !node_online(node))
- node = -1;
-
- return node;
-}
-
-#else /* CONFIG_X86_32 */
-
-static unsigned char mp_bus_to_node[BUS_NR];
-
-void set_mp_bus_to_node(int busnum, int node)
-{
- if (busnum >= 0 && busnum < BUS_NR)
- mp_bus_to_node[busnum] = (unsigned char) node;
-}
-
-int get_mp_bus_to_node(int busnum)
-{
- int node;
-
- if (busnum < 0 || busnum > (BUS_NR - 1))
- return 0;
- node = mp_bus_to_node[busnum];
- return node;
-}
-
-#endif /* CONFIG_X86_32 */
-
-#endif /* CONFIG_NUMA */
-
#ifdef CONFIG_X86_64
/*
@@ -301,11 +244,6 @@ static int __init early_fill_mp_bus_info(void)
u64 val;
u32 address;
-#ifdef CONFIG_NUMA
- for (i = 0; i < BUS_NR; i++)
- mp_bus_to_node[i] = -1;
-#endif
-
if (!early_pci_allowed())
return -1;
@@ -346,7 +284,7 @@ static int __init early_fill_mp_bus_info(void)
node = (reg >> 4) & 0x07;
#ifdef CONFIG_NUMA
for (j = min_bus; j <= max_bus; j++)
- mp_bus_to_node[j] = (unsigned char) node;
+ set_mp_bus_to_node(j, node);
#endif
link = (reg >> 8) & 0x03;
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 2202b6257b8..1331fcf2614 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -600,3 +600,72 @@ struct pci_bus * __devinit pci_scan_bus_with_sysdata(int busno)
{
return pci_scan_bus_on_node(busno, &pci_root_ops, -1);
}
+
+/*
+ * NUMA info for PCI busses
+ *
+ * Early arch code is responsible for filling in reasonable values here.
+ * A node id of "-1" means "use current node". In other words, if a bus
+ * has a -1 node id, it's not tightly coupled to any particular chunk
+ * of memory (as is the case on some Nehalem systems).
+ */
+#ifdef CONFIG_NUMA
+
+#define BUS_NR 256
+
+#ifdef CONFIG_X86_64
+
+static int mp_bus_to_node[BUS_NR] = {
+ [0 ... BUS_NR - 1] = -1
+};
+
+void set_mp_bus_to_node(int busnum, int node)
+{
+ if (busnum >= 0 && busnum < BUS_NR)
+ mp_bus_to_node[busnum] = node;
+}
+
+int get_mp_bus_to_node(int busnum)
+{
+ int node = -1;
+
+ if (busnum < 0 || busnum > (BUS_NR - 1))
+ return node;
+
+ node = mp_bus_to_node[busnum];
+
+ /*
+ * let numa_node_id to decide it later in dma_alloc_pages
+ * if there is no ram on that node
+ */
+ if (node != -1 && !node_online(node))
+ node = -1;
+
+ return node;
+}
+
+#else /* CONFIG_X86_32 */
+
+static int mp_bus_to_node[BUS_NR] = {
+ [0 ... BUS_NR - 1] = -1
+};
+
+void set_mp_bus_to_node(int busnum, int node)
+{
+ if (busnum >= 0 && busnum < BUS_NR)
+ mp_bus_to_node[busnum] = (unsigned char) node;
+}
+
+int get_mp_bus_to_node(int busnum)
+{
+ int node;
+
+ if (busnum < 0 || busnum > (BUS_NR - 1))
+ return 0;
+ node = mp_bus_to_node[busnum];
+ return node;
+}
+
+#endif /* CONFIG_X86_32 */
+
+#endif /* CONFIG_NUMA */
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 52e62e57fed..b22d13b0c71 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -266,7 +266,7 @@ void pcibios_set_master(struct pci_dev *dev)
pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
}
-static struct vm_operations_struct pci_mmap_ops = {
+static const struct vm_operations_struct pci_mmap_ops = {
.access = generic_access_phys,
};
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 712443ec6d4..602c172d3bd 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -13,10 +13,14 @@
#include <linux/pci.h>
#include <linux/init.h>
#include <linux/acpi.h>
+#include <linux/sfi_acpi.h>
#include <linux/bitmap.h>
#include <linux/sort.h>
#include <asm/e820.h>
#include <asm/pci_x86.h>
+#include <asm/acpi.h>
+
+#define PREFIX "PCI: "
/* aperture is up to 256MB but BIOS may reserve less */
#define MMCONFIG_APER_MIN (2 * 1024*1024)
@@ -491,7 +495,7 @@ static void __init pci_mmcfg_reject_broken(int early)
(unsigned int)cfg->start_bus_number,
(unsigned int)cfg->end_bus_number);
- if (!early)
+ if (!early && !acpi_disabled)
valid = is_mmconf_reserved(is_acpi_reserved, addr, size, i, cfg, 0);
if (valid)
@@ -606,7 +610,7 @@ static void __init __pci_mmcfg_init(int early)
}
if (!known_bridge)
- acpi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
+ acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
pci_mmcfg_reject_broken(early);
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
index 8b2d561046a..f10a7e94a84 100644
--- a/arch/x86/pci/mmconfig_32.c
+++ b/arch/x86/pci/mmconfig_32.c
@@ -11,9 +11,9 @@
#include <linux/pci.h>
#include <linux/init.h>
-#include <linux/acpi.h>
#include <asm/e820.h>
#include <asm/pci_x86.h>
+#include <acpi/acpi.h>
/* Assume systems with more busses have correct MCFG */
#define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG))
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index b3d20b9cac6..0a979f3e5b8 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -18,6 +18,7 @@
#include <asm/mce.h>
#include <asm/xcr.h>
#include <asm/suspend.h>
+#include <asm/debugreg.h>
#ifdef CONFIG_X86_32
static struct saved_context saved_context;
@@ -142,31 +143,6 @@ static void fix_processor_context(void)
#endif
load_TR_desc(); /* This does ltr */
load_LDT(&current->active_mm->context); /* This does lldt */
-
- /*
- * Now maybe reload the debug registers
- */
- if (current->thread.debugreg7) {
-#ifdef CONFIG_X86_32
- set_debugreg(current->thread.debugreg0, 0);
- set_debugreg(current->thread.debugreg1, 1);
- set_debugreg(current->thread.debugreg2, 2);
- set_debugreg(current->thread.debugreg3, 3);
- /* no 4 and 5 */
- set_debugreg(current->thread.debugreg6, 6);
- set_debugreg(current->thread.debugreg7, 7);
-#else
- /* CONFIG_X86_64 */
- loaddebug(&current->thread, 0);
- loaddebug(&current->thread, 1);
- loaddebug(&current->thread, 2);
- loaddebug(&current->thread, 3);
- /* no 4 and 5 */
- loaddebug(&current->thread, 6);
- loaddebug(&current->thread, 7);
-#endif
- }
-
}
/**
@@ -242,11 +218,7 @@ static void __restore_processor_state(struct saved_context *ctxt)
fix_processor_context();
do_fpu_end();
- mtrr_ap_init();
-
-#ifdef CONFIG_X86_OLD_MCE
- mcheck_init(&boot_cpu_data);
-#endif
+ mtrr_bp_restore();
}
/* Needed by apm.c */
diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile
new file mode 100644
index 00000000000..f8208267733
--- /dev/null
+++ b/arch/x86/tools/Makefile
@@ -0,0 +1,31 @@
+PHONY += posttest
+
+ifeq ($(KBUILD_VERBOSE),1)
+ posttest_verbose = -v
+else
+ posttest_verbose =
+endif
+
+ifeq ($(CONFIG_64BIT),y)
+ posttest_64bit = -y
+else
+ posttest_64bit = -n
+endif
+
+distill_awk = $(srctree)/arch/x86/tools/distill.awk
+chkobjdump = $(srctree)/arch/x86/tools/chkobjdump.awk
+
+quiet_cmd_posttest = TEST $@
+ cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(distill_awk) | $(obj)/test_get_len $(posttest_64bit) $(posttest_verbose)
+
+posttest: $(obj)/test_get_len vmlinux
+ $(call cmd,posttest)
+
+hostprogs-y := test_get_len
+
+# -I needed for generated C source and C source which in the kernel tree.
+HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/
+
+# Dependencies are also needed.
+$(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
+
diff --git a/arch/x86/tools/chkobjdump.awk b/arch/x86/tools/chkobjdump.awk
new file mode 100644
index 00000000000..0d13cd9fdcf
--- /dev/null
+++ b/arch/x86/tools/chkobjdump.awk
@@ -0,0 +1,23 @@
+# GNU objdump version checker
+#
+# Usage:
+# objdump -v | awk -f chkobjdump.awk
+BEGIN {
+ # objdump version 2.19 or later is OK for the test.
+ od_ver = 2;
+ od_sver = 19;
+}
+
+/^GNU/ {
+ split($4, ver, ".");
+ if (ver[1] > od_ver ||
+ (ver[1] == od_ver && ver[2] >= od_sver)) {
+ exit 1;
+ } else {
+ printf("Warning: objdump version %s is older than %d.%d\n",
+ $4, od_ver, od_sver);
+ print("Warning: Skipping posttest.");
+ # Logic is inverted, because we just skip test without error.
+ exit 0;
+ }
+}
diff --git a/arch/x86/tools/distill.awk b/arch/x86/tools/distill.awk
new file mode 100644
index 00000000000..c13c0ee48ab
--- /dev/null
+++ b/arch/x86/tools/distill.awk
@@ -0,0 +1,47 @@
+#!/bin/awk -f
+# Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len
+# Distills the disassembly as follows:
+# - Removes all lines except the disassembled instructions.
+# - For instructions that exceed 1 line (7 bytes), crams all the hex bytes
+# into a single line.
+# - Remove bad(or prefix only) instructions
+
+BEGIN {
+ prev_addr = ""
+ prev_hex = ""
+ prev_mnemonic = ""
+ bad_expr = "(\\(bad\\)|^rex|^.byte|^rep(z|nz)$|^lock$|^es$|^cs$|^ss$|^ds$|^fs$|^gs$|^data(16|32)$|^addr(16|32|64))"
+ fwait_expr = "^9b "
+ fwait_str="9b\tfwait"
+}
+
+/^ *[0-9a-f]+ <[^>]*>:/ {
+ # Symbol entry
+ printf("%s%s\n", $2, $1)
+}
+
+/^ *[0-9a-f]+:/ {
+ if (split($0, field, "\t") < 3) {
+ # This is a continuation of the same insn.
+ prev_hex = prev_hex field[2]
+ } else {
+ # Skip bad instructions
+ if (match(prev_mnemonic, bad_expr))
+ prev_addr = ""
+ # Split fwait from other f* instructions
+ if (match(prev_hex, fwait_expr) && prev_mnemonic != "fwait") {
+ printf "%s\t%s\n", prev_addr, fwait_str
+ sub(fwait_expr, "", prev_hex)
+ }
+ if (prev_addr != "")
+ printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic
+ prev_addr = field[1]
+ prev_hex = field[2]
+ prev_mnemonic = field[3]
+ }
+}
+
+END {
+ if (prev_addr != "")
+ printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic
+}
diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk
new file mode 100644
index 00000000000..e34e92a28eb
--- /dev/null
+++ b/arch/x86/tools/gen-insn-attr-x86.awk
@@ -0,0 +1,380 @@
+#!/bin/awk -f
+# gen-insn-attr-x86.awk: Instruction attribute table generator
+# Written by Masami Hiramatsu <mhiramat@redhat.com>
+#
+# Usage: awk -f gen-insn-attr-x86.awk x86-opcode-map.txt > inat-tables.c
+
+# Awk implementation sanity check
+function check_awk_implement() {
+ if (!match("abc", "[[:lower:]]+"))
+ return "Your awk doesn't support charactor-class."
+ if (sprintf("%x", 0) != "0")
+ return "Your awk has a printf-format problem."
+ return ""
+}
+
+# Clear working vars
+function clear_vars() {
+ delete table
+ delete lptable2
+ delete lptable1
+ delete lptable3
+ eid = -1 # escape id
+ gid = -1 # group id
+ aid = -1 # AVX id
+ tname = ""
+}
+
+BEGIN {
+ # Implementation error checking
+ awkchecked = check_awk_implement()
+ if (awkchecked != "") {
+ print "Error: " awkchecked > "/dev/stderr"
+ print "Please try to use gawk." > "/dev/stderr"
+ exit 1
+ }
+
+ # Setup generating tables
+ print "/* x86 opcode map generated from x86-opcode-map.txt */"
+ print "/* Do not change this code. */\n"
+ ggid = 1
+ geid = 1
+ gaid = 0
+ delete etable
+ delete gtable
+ delete atable
+
+ opnd_expr = "^[[:alpha:]/]"
+ ext_expr = "^\\("
+ sep_expr = "^\\|$"
+ group_expr = "^Grp[[:alnum:]]+"
+
+ imm_expr = "^[IJAO][[:lower:]]"
+ imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
+ imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
+ imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)"
+ imm_flag["Id"] = "INAT_MAKE_IMM(INAT_IMM_DWORD)"
+ imm_flag["Iq"] = "INAT_MAKE_IMM(INAT_IMM_QWORD)"
+ imm_flag["Ap"] = "INAT_MAKE_IMM(INAT_IMM_PTR)"
+ imm_flag["Iz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)"
+ imm_flag["Jz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)"
+ imm_flag["Iv"] = "INAT_MAKE_IMM(INAT_IMM_VWORD)"
+ imm_flag["Ob"] = "INAT_MOFFSET"
+ imm_flag["Ov"] = "INAT_MOFFSET"
+
+ modrm_expr = "^([CDEGMNPQRSUVW/][[:lower:]]+|NTA|T[012])"
+ force64_expr = "\\([df]64\\)"
+ rex_expr = "^REX(\\.[XRWB]+)*"
+ fpu_expr = "^ESC" # TODO
+
+ lprefix1_expr = "\\(66\\)"
+ lprefix2_expr = "\\(F3\\)"
+ lprefix3_expr = "\\(F2\\)"
+ max_lprefix = 4
+
+ vexok_expr = "\\(VEX\\)"
+ vexonly_expr = "\\(oVEX\\)"
+
+ prefix_expr = "\\(Prefix\\)"
+ prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ"
+ prefix_num["REPNE"] = "INAT_PFX_REPNE"
+ prefix_num["REP/REPE"] = "INAT_PFX_REPE"
+ prefix_num["LOCK"] = "INAT_PFX_LOCK"
+ prefix_num["SEG=CS"] = "INAT_PFX_CS"
+ prefix_num["SEG=DS"] = "INAT_PFX_DS"
+ prefix_num["SEG=ES"] = "INAT_PFX_ES"
+ prefix_num["SEG=FS"] = "INAT_PFX_FS"
+ prefix_num["SEG=GS"] = "INAT_PFX_GS"
+ prefix_num["SEG=SS"] = "INAT_PFX_SS"
+ prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ"
+ prefix_num["2bytes-VEX"] = "INAT_PFX_VEX2"
+ prefix_num["3bytes-VEX"] = "INAT_PFX_VEX3"
+
+ clear_vars()
+}
+
+function semantic_error(msg) {
+ print "Semantic error at " NR ": " msg > "/dev/stderr"
+ exit 1
+}
+
+function debug(msg) {
+ print "DEBUG: " msg
+}
+
+function array_size(arr, i,c) {
+ c = 0
+ for (i in arr)
+ c++
+ return c
+}
+
+/^Table:/ {
+ print "/* " $0 " */"
+ if (tname != "")
+ semantic_error("Hit Table: before EndTable:.");
+}
+
+/^Referrer:/ {
+ if (NF != 1) {
+ # escape opcode table
+ ref = ""
+ for (i = 2; i <= NF; i++)
+ ref = ref $i
+ eid = escape[ref]
+ tname = sprintf("inat_escape_table_%d", eid)
+ }
+}
+
+/^AVXcode:/ {
+ if (NF != 1) {
+ # AVX/escape opcode table
+ aid = $2
+ if (gaid <= aid)
+ gaid = aid + 1
+ if (tname == "") # AVX only opcode table
+ tname = sprintf("inat_avx_table_%d", $2)
+ }
+ if (aid == -1 && eid == -1) # primary opcode table
+ tname = "inat_primary_table"
+}
+
+/^GrpTable:/ {
+ print "/* " $0 " */"
+ if (!($2 in group))
+ semantic_error("No group: " $2 )
+ gid = group[$2]
+ tname = "inat_group_table_" gid
+}
+
+function print_table(tbl,name,fmt,n)
+{
+ print "const insn_attr_t " name " = {"
+ for (i = 0; i < n; i++) {
+ id = sprintf(fmt, i)
+ if (tbl[id])
+ print " [" id "] = " tbl[id] ","
+ }
+ print "};"
+}
+
+/^EndTable/ {
+ if (gid != -1) {
+ # print group tables
+ if (array_size(table) != 0) {
+ print_table(table, tname "[INAT_GROUP_TABLE_SIZE]",
+ "0x%x", 8)
+ gtable[gid,0] = tname
+ }
+ if (array_size(lptable1) != 0) {
+ print_table(lptable1, tname "_1[INAT_GROUP_TABLE_SIZE]",
+ "0x%x", 8)
+ gtable[gid,1] = tname "_1"
+ }
+ if (array_size(lptable2) != 0) {
+ print_table(lptable2, tname "_2[INAT_GROUP_TABLE_SIZE]",
+ "0x%x", 8)
+ gtable[gid,2] = tname "_2"
+ }
+ if (array_size(lptable3) != 0) {
+ print_table(lptable3, tname "_3[INAT_GROUP_TABLE_SIZE]",
+ "0x%x", 8)
+ gtable[gid,3] = tname "_3"
+ }
+ } else {
+ # print primary/escaped tables
+ if (array_size(table) != 0) {
+ print_table(table, tname "[INAT_OPCODE_TABLE_SIZE]",
+ "0x%02x", 256)
+ etable[eid,0] = tname
+ if (aid >= 0)
+ atable[aid,0] = tname
+ }
+ if (array_size(lptable1) != 0) {
+ print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]",
+ "0x%02x", 256)
+ etable[eid,1] = tname "_1"
+ if (aid >= 0)
+ atable[aid,1] = tname "_1"
+ }
+ if (array_size(lptable2) != 0) {
+ print_table(lptable2,tname "_2[INAT_OPCODE_TABLE_SIZE]",
+ "0x%02x", 256)
+ etable[eid,2] = tname "_2"
+ if (aid >= 0)
+ atable[aid,2] = tname "_2"
+ }
+ if (array_size(lptable3) != 0) {
+ print_table(lptable3,tname "_3[INAT_OPCODE_TABLE_SIZE]",
+ "0x%02x", 256)
+ etable[eid,3] = tname "_3"
+ if (aid >= 0)
+ atable[aid,3] = tname "_3"
+ }
+ }
+ print ""
+ clear_vars()
+}
+
+function add_flags(old,new) {
+ if (old && new)
+ return old " | " new
+ else if (old)
+ return old
+ else
+ return new
+}
+
+# convert operands to flags.
+function convert_operands(opnd, i,imm,mod)
+{
+ imm = null
+ mod = null
+ for (i in opnd) {
+ i = opnd[i]
+ if (match(i, imm_expr) == 1) {
+ if (!imm_flag[i])
+ semantic_error("Unknown imm opnd: " i)
+ if (imm) {
+ if (i != "Ib")
+ semantic_error("Second IMM error")
+ imm = add_flags(imm, "INAT_SCNDIMM")
+ } else
+ imm = imm_flag[i]
+ } else if (match(i, modrm_expr))
+ mod = "INAT_MODRM"
+ }
+ return add_flags(imm, mod)
+}
+
+/^[0-9a-f]+\:/ {
+ if (NR == 1)
+ next
+ # get index
+ idx = "0x" substr($1, 1, index($1,":") - 1)
+ if (idx in table)
+ semantic_error("Redefine " idx " in " tname)
+
+ # check if escaped opcode
+ if ("escape" == $2) {
+ if ($3 != "#")
+ semantic_error("No escaped name")
+ ref = ""
+ for (i = 4; i <= NF; i++)
+ ref = ref $i
+ if (ref in escape)
+ semantic_error("Redefine escape (" ref ")")
+ escape[ref] = geid
+ geid++
+ table[idx] = "INAT_MAKE_ESCAPE(" escape[ref] ")"
+ next
+ }
+
+ variant = null
+ # converts
+ i = 2
+ while (i <= NF) {
+ opcode = $(i++)
+ delete opnds
+ ext = null
+ flags = null
+ opnd = null
+ # parse one opcode
+ if (match($i, opnd_expr)) {
+ opnd = $i
+ split($(i++), opnds, ",")
+ flags = convert_operands(opnds)
+ }
+ if (match($i, ext_expr))
+ ext = $(i++)
+ if (match($i, sep_expr))
+ i++
+ else if (i < NF)
+ semantic_error($i " is not a separator")
+
+ # check if group opcode
+ if (match(opcode, group_expr)) {
+ if (!(opcode in group)) {
+ group[opcode] = ggid
+ ggid++
+ }
+ flags = add_flags(flags, "INAT_MAKE_GROUP(" group[opcode] ")")
+ }
+ # check force(or default) 64bit
+ if (match(ext, force64_expr))
+ flags = add_flags(flags, "INAT_FORCE64")
+
+ # check REX prefix
+ if (match(opcode, rex_expr))
+ flags = add_flags(flags, "INAT_MAKE_PREFIX(INAT_PFX_REX)")
+
+ # check coprocessor escape : TODO
+ if (match(opcode, fpu_expr))
+ flags = add_flags(flags, "INAT_MODRM")
+
+ # check VEX only code
+ if (match(ext, vexonly_expr))
+ flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY")
+
+ # check VEX only code
+ if (match(ext, vexok_expr))
+ flags = add_flags(flags, "INAT_VEXOK")
+
+ # check prefixes
+ if (match(ext, prefix_expr)) {
+ if (!prefix_num[opcode])
+ semantic_error("Unknown prefix: " opcode)
+ flags = add_flags(flags, "INAT_MAKE_PREFIX(" prefix_num[opcode] ")")
+ }
+ if (length(flags) == 0)
+ continue
+ # check if last prefix
+ if (match(ext, lprefix1_expr)) {
+ lptable1[idx] = add_flags(lptable1[idx],flags)
+ variant = "INAT_VARIANT"
+ } else if (match(ext, lprefix2_expr)) {
+ lptable2[idx] = add_flags(lptable2[idx],flags)
+ variant = "INAT_VARIANT"
+ } else if (match(ext, lprefix3_expr)) {
+ lptable3[idx] = add_flags(lptable3[idx],flags)
+ variant = "INAT_VARIANT"
+ } else {
+ table[idx] = add_flags(table[idx],flags)
+ }
+ }
+ if (variant)
+ table[idx] = add_flags(table[idx],variant)
+}
+
+END {
+ if (awkchecked != "")
+ exit 1
+ # print escape opcode map's array
+ print "/* Escape opcode map array */"
+ print "const insn_attr_t const *inat_escape_tables[INAT_ESC_MAX + 1]" \
+ "[INAT_LSTPFX_MAX + 1] = {"
+ for (i = 0; i < geid; i++)
+ for (j = 0; j < max_lprefix; j++)
+ if (etable[i,j])
+ print " ["i"]["j"] = "etable[i,j]","
+ print "};\n"
+ # print group opcode map's array
+ print "/* Group opcode map array */"
+ print "const insn_attr_t const *inat_group_tables[INAT_GRP_MAX + 1]"\
+ "[INAT_LSTPFX_MAX + 1] = {"
+ for (i = 0; i < ggid; i++)
+ for (j = 0; j < max_lprefix; j++)
+ if (gtable[i,j])
+ print " ["i"]["j"] = "gtable[i,j]","
+ print "};\n"
+ # print AVX opcode map's array
+ print "/* AVX opcode map array */"
+ print "const insn_attr_t const *inat_avx_tables[X86_VEX_M_MAX + 1]"\
+ "[INAT_LSTPFX_MAX + 1] = {"
+ for (i = 0; i < gaid; i++)
+ for (j = 0; j < max_lprefix; j++)
+ if (atable[i,j])
+ print " ["i"]["j"] = "atable[i,j]","
+ print "};"
+}
+
diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c
new file mode 100644
index 00000000000..d8214dc03fa
--- /dev/null
+++ b/arch/x86/tools/test_get_len.c
@@ -0,0 +1,173 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+
+#define unlikely(cond) (cond)
+
+#include <asm/insn.h>
+#include <inat.c>
+#include <insn.c>
+
+/*
+ * Test of instruction analysis in general and insn_get_length() in
+ * particular. See if insn_get_length() and the disassembler agree
+ * on the length of each instruction in an elf disassembly.
+ *
+ * Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len
+ */
+
+const char *prog;
+static int verbose;
+static int x86_64;
+
+static void usage(void)
+{
+ fprintf(stderr, "Usage: objdump -d a.out | awk -f distill.awk |"
+ " %s [-y|-n] [-v] \n", prog);
+ fprintf(stderr, "\t-y 64bit mode\n");
+ fprintf(stderr, "\t-n 32bit mode\n");
+ fprintf(stderr, "\t-v verbose mode\n");
+ exit(1);
+}
+
+static void malformed_line(const char *line, int line_nr)
+{
+ fprintf(stderr, "%s: malformed line %d:\n%s", prog, line_nr, line);
+ exit(3);
+}
+
+static void dump_field(FILE *fp, const char *name, const char *indent,
+ struct insn_field *field)
+{
+ fprintf(fp, "%s.%s = {\n", indent, name);
+ fprintf(fp, "%s\t.value = %d, bytes[] = {%x, %x, %x, %x},\n",
+ indent, field->value, field->bytes[0], field->bytes[1],
+ field->bytes[2], field->bytes[3]);
+ fprintf(fp, "%s\t.got = %d, .nbytes = %d},\n", indent,
+ field->got, field->nbytes);
+}
+
+static void dump_insn(FILE *fp, struct insn *insn)
+{
+ fprintf(fp, "Instruction = { \n");
+ dump_field(fp, "prefixes", "\t", &insn->prefixes);
+ dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix);
+ dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix);
+ dump_field(fp, "opcode", "\t", &insn->opcode);
+ dump_field(fp, "modrm", "\t", &insn->modrm);
+ dump_field(fp, "sib", "\t", &insn->sib);
+ dump_field(fp, "displacement", "\t", &insn->displacement);
+ dump_field(fp, "immediate1", "\t", &insn->immediate1);
+ dump_field(fp, "immediate2", "\t", &insn->immediate2);
+ fprintf(fp, "\t.attr = %x, .opnd_bytes = %d, .addr_bytes = %d,\n",
+ insn->attr, insn->opnd_bytes, insn->addr_bytes);
+ fprintf(fp, "\t.length = %d, .x86_64 = %d, .kaddr = %p}\n",
+ insn->length, insn->x86_64, insn->kaddr);
+}
+
+static void parse_args(int argc, char **argv)
+{
+ int c;
+ prog = argv[0];
+ while ((c = getopt(argc, argv, "ynv")) != -1) {
+ switch (c) {
+ case 'y':
+ x86_64 = 1;
+ break;
+ case 'n':
+ x86_64 = 0;
+ break;
+ case 'v':
+ verbose = 1;
+ break;
+ default:
+ usage();
+ }
+ }
+}
+
+#define BUFSIZE 256
+
+int main(int argc, char **argv)
+{
+ char line[BUFSIZE], sym[BUFSIZE] = "<unknown>";
+ unsigned char insn_buf[16];
+ struct insn insn;
+ int insns = 0, c;
+ int warnings = 0;
+
+ parse_args(argc, argv);
+
+ while (fgets(line, BUFSIZE, stdin)) {
+ char copy[BUFSIZE], *s, *tab1, *tab2;
+ int nb = 0;
+ unsigned int b;
+
+ if (line[0] == '<') {
+ /* Symbol line */
+ strcpy(sym, line);
+ continue;
+ }
+
+ insns++;
+ memset(insn_buf, 0, 16);
+ strcpy(copy, line);
+ tab1 = strchr(copy, '\t');
+ if (!tab1)
+ malformed_line(line, insns);
+ s = tab1 + 1;
+ s += strspn(s, " ");
+ tab2 = strchr(s, '\t');
+ if (!tab2)
+ malformed_line(line, insns);
+ *tab2 = '\0'; /* Characters beyond tab2 aren't examined */
+ while (s < tab2) {
+ if (sscanf(s, "%x", &b) == 1) {
+ insn_buf[nb++] = (unsigned char) b;
+ s += 3;
+ } else
+ break;
+ }
+ /* Decode an instruction */
+ insn_init(&insn, insn_buf, x86_64);
+ insn_get_length(&insn);
+ if (insn.length != nb) {
+ warnings++;
+ fprintf(stderr, "Warning: %s found difference at %s\n",
+ prog, sym);
+ fprintf(stderr, "Warning: %s", line);
+ fprintf(stderr, "Warning: objdump says %d bytes, but "
+ "insn_get_length() says %d\n", nb,
+ insn.length);
+ if (verbose)
+ dump_insn(stderr, &insn);
+ }
+ }
+ if (warnings)
+ fprintf(stderr, "Warning: decoded and checked %d"
+ " instructions with %d warnings\n", insns, warnings);
+ else
+ fprintf(stderr, "Succeed: decoded and checked %d"
+ " instructions\n", insns);
+ return 0;
+}
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 88112b49f02..6b4ffedb93c 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -122,7 +122,7 @@ quiet_cmd_vdso = VDSO $@
$(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
-Wl,-T,$(filter %.lds,$^) $(filter %.o,$^)
-VDSO_LDFLAGS = -fPIC -shared $(call ld-option, -Wl$(comma)--hash-style=sysv)
+VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
GCOV_PROFILE := n
#
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 6a40b78b46a..ee55754cc3c 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -86,14 +86,47 @@ notrace static noinline int do_monotonic(struct timespec *ts)
return 0;
}
+notrace static noinline int do_realtime_coarse(struct timespec *ts)
+{
+ unsigned long seq;
+ do {
+ seq = read_seqbegin(&gtod->lock);
+ ts->tv_sec = gtod->wall_time_coarse.tv_sec;
+ ts->tv_nsec = gtod->wall_time_coarse.tv_nsec;
+ } while (unlikely(read_seqretry(&gtod->lock, seq)));
+ return 0;
+}
+
+notrace static noinline int do_monotonic_coarse(struct timespec *ts)
+{
+ unsigned long seq, ns, secs;
+ do {
+ seq = read_seqbegin(&gtod->lock);
+ secs = gtod->wall_time_coarse.tv_sec;
+ ns = gtod->wall_time_coarse.tv_nsec;
+ secs += gtod->wall_to_monotonic.tv_sec;
+ ns += gtod->wall_to_monotonic.tv_nsec;
+ } while (unlikely(read_seqretry(&gtod->lock, seq)));
+ vset_normalized_timespec(ts, secs, ns);
+ return 0;
+}
+
notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
{
- if (likely(gtod->sysctl_enabled && gtod->clock.vread))
+ if (likely(gtod->sysctl_enabled))
switch (clock) {
case CLOCK_REALTIME:
- return do_realtime(ts);
+ if (likely(gtod->clock.vread))
+ return do_realtime(ts);
+ break;
case CLOCK_MONOTONIC:
- return do_monotonic(ts);
+ if (likely(gtod->clock.vread))
+ return do_monotonic(ts);
+ break;
+ case CLOCK_REALTIME_COARSE:
+ return do_realtime_coarse(ts);
+ case CLOCK_MONOTONIC_COARSE:
+ return do_monotonic_coarse(ts);
}
return vdso_fallback_gettime(clock, ts);
}
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 7410640db17..3bb4fc21f4f 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -8,6 +8,7 @@ endif
# Make sure early boot has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_enlighten.o := $(nostackp)
+CFLAGS_mmu.o := $(nostackp)
obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm.o xen-asm_$(BITS).o \
@@ -16,3 +17,4 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
+
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
index b53225d2cac..e133ce25e29 100644
--- a/arch/x86/xen/debugfs.c
+++ b/arch/x86/xen/debugfs.c
@@ -100,7 +100,7 @@ static int xen_array_release(struct inode *inode, struct file *file)
return 0;
}
-static struct file_operations u32_array_fops = {
+static const struct file_operations u32_array_fops = {
.owner = THIS_MODULE,
.open = u32_array_open,
.release= xen_array_release,
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index b62ccb840cf..dfbf70e6586 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -51,6 +51,7 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/reboot.h>
+#include <asm/stackprotector.h>
#include "xen-ops.h"
#include "mmu.h"
@@ -177,6 +178,7 @@ static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
static void xen_cpuid(unsigned int *ax, unsigned int *bx,
unsigned int *cx, unsigned int *dx)
{
+ unsigned maskebx = ~0;
unsigned maskecx = ~0;
unsigned maskedx = ~0;
@@ -184,9 +186,16 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
* Mask out inconvenient features, to try and disable as many
* unsupported kernel subsystems as possible.
*/
- if (*ax == 1) {
+ switch (*ax) {
+ case 1:
maskecx = cpuid_leaf1_ecx_mask;
maskedx = cpuid_leaf1_edx_mask;
+ break;
+
+ case 0xb:
+ /* Suppress extended topology stuff */
+ maskebx = 0;
+ break;
}
asm(XEN_EMULATE_PREFIX "cpuid"
@@ -196,6 +205,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
"=d" (*dx)
: "0" (*ax), "2" (*cx));
+ *bx &= maskebx;
*cx &= maskecx;
*dx &= maskedx;
}
@@ -330,18 +340,28 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
unsigned long frames[pages];
int f;
- /* A GDT can be up to 64k in size, which corresponds to 8192
- 8-byte entries, or 16 4k pages.. */
+ /*
+ * A GDT can be up to 64k in size, which corresponds to 8192
+ * 8-byte entries, or 16 4k pages..
+ */
BUG_ON(size > 65536);
BUG_ON(va & ~PAGE_MASK);
for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
int level;
- pte_t *ptep = lookup_address(va, &level);
+ pte_t *ptep;
unsigned long pfn, mfn;
void *virt;
+ /*
+ * The GDT is per-cpu and is in the percpu data area.
+ * That can be virtually mapped, so we need to do a
+ * page-walk to get the underlying MFN for the
+ * hypercall. The page can also be in the kernel's
+ * linear range, so we need to RO that mapping too.
+ */
+ ptep = lookup_address(va, &level);
BUG_ON(ptep == NULL);
pfn = pte_pfn(*ptep);
@@ -358,6 +378,44 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
BUG();
}
+/*
+ * load_gdt for early boot, when the gdt is only mapped once
+ */
+static __init void xen_load_gdt_boot(const struct desc_ptr *dtr)
+{
+ unsigned long va = dtr->address;
+ unsigned int size = dtr->size + 1;
+ unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+ unsigned long frames[pages];
+ int f;
+
+ /*
+ * A GDT can be up to 64k in size, which corresponds to 8192
+ * 8-byte entries, or 16 4k pages..
+ */
+
+ BUG_ON(size > 65536);
+ BUG_ON(va & ~PAGE_MASK);
+
+ for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
+ pte_t pte;
+ unsigned long pfn, mfn;
+
+ pfn = virt_to_pfn(va);
+ mfn = pfn_to_mfn(pfn);
+
+ pte = pfn_pte(pfn, PAGE_KERNEL_RO);
+
+ if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
+ BUG();
+
+ frames[f] = mfn;
+ }
+
+ if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
+ BUG();
+}
+
static void load_TLS_descriptor(struct thread_struct *t,
unsigned int cpu, unsigned int i)
{
@@ -581,6 +639,29 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
preempt_enable();
}
+/*
+ * Version of write_gdt_entry for use at early boot-time needed to
+ * update an entry as simply as possible.
+ */
+static __init void xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
+ const void *desc, int type)
+{
+ switch (type) {
+ case DESC_LDT:
+ case DESC_TSS:
+ /* ignore */
+ break;
+
+ default: {
+ xmaddr_t maddr = virt_to_machine(&dt[entry]);
+
+ if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
+ dt[entry] = *(struct desc_struct *)desc;
+ }
+
+ }
+}
+
static void xen_load_sp0(struct tss_struct *tss,
struct thread_struct *thread)
{
@@ -840,19 +921,9 @@ static const struct pv_info xen_info __initdata = {
static const struct pv_init_ops xen_init_ops __initdata = {
.patch = xen_patch,
-
- .banner = xen_banner,
- .memory_setup = xen_memory_setup,
- .arch_setup = xen_arch_setup,
- .post_allocator_init = xen_post_allocator_init,
};
static const struct pv_time_ops xen_time_ops __initdata = {
- .time_init = xen_time_init,
-
- .set_wallclock = xen_set_wallclock,
- .get_wallclock = xen_get_wallclock,
- .get_tsc_khz = xen_tsc_khz,
.sched_clock = xen_sched_clock,
};
@@ -918,8 +989,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
static const struct pv_apic_ops xen_apic_ops __initdata = {
#ifdef CONFIG_X86_LOCAL_APIC
- .setup_boot_clock = paravirt_nop,
- .setup_secondary_clock = paravirt_nop,
.startup_ipi_hook = paravirt_nop,
#endif
};
@@ -965,6 +1034,23 @@ static const struct machine_ops __initdata xen_machine_ops = {
.emergency_restart = xen_emergency_restart,
};
+/*
+ * Set up the GDT and segment registers for -fstack-protector. Until
+ * we do this, we have to be careful not to call any stack-protected
+ * function, which is most of the kernel.
+ */
+static void __init xen_setup_stackprotector(void)
+{
+ pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;
+ pv_cpu_ops.load_gdt = xen_load_gdt_boot;
+
+ setup_stack_canary_segment(0);
+ switch_to_new_gdt(0);
+
+ pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry;
+ pv_cpu_ops.load_gdt = xen_load_gdt;
+}
+
/* First C function to be called on Xen boot */
asmlinkage void __init xen_start_kernel(void)
{
@@ -981,16 +1067,49 @@ asmlinkage void __init xen_start_kernel(void)
pv_time_ops = xen_time_ops;
pv_cpu_ops = xen_cpu_ops;
pv_apic_ops = xen_apic_ops;
- pv_mmu_ops = xen_mmu_ops;
-#ifdef CONFIG_X86_64
+ x86_init.resources.memory_setup = xen_memory_setup;
+ x86_init.oem.arch_setup = xen_arch_setup;
+ x86_init.oem.banner = xen_banner;
+
+ x86_init.timers.timer_init = xen_time_init;
+ x86_init.timers.setup_percpu_clockev = x86_init_noop;
+ x86_cpuinit.setup_percpu_clockev = x86_init_noop;
+
+ x86_platform.calibrate_tsc = xen_tsc_khz;
+ x86_platform.get_wallclock = xen_get_wallclock;
+ x86_platform.set_wallclock = xen_set_wallclock;
+
/*
- * Setup percpu state. We only need to do this for 64-bit
- * because 32-bit already has %fs set properly.
+ * Set up some pagetable state before starting to set any ptes.
*/
- load_percpu_segment(0);
+
+ xen_init_mmu_ops();
+
+ /* Prevent unwanted bits from being set in PTEs. */
+ __supported_pte_mask &= ~_PAGE_GLOBAL;
+ if (!xen_initial_domain())
+ __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
+
+ __supported_pte_mask |= _PAGE_IOMAP;
+
+#ifdef CONFIG_X86_64
+ /* Work out if we support NX */
+ check_efer();
#endif
+ xen_setup_features();
+
+ /* Get mfn list */
+ if (!xen_feature(XENFEAT_auto_translated_physmap))
+ xen_build_dynamic_phys_to_machine();
+
+ /*
+ * Set up kernel GDT and segment registers, mainly so that
+ * -fstack-protector code can be executed.
+ */
+ xen_setup_stackprotector();
+
xen_init_irq_ops();
xen_init_cpuid_mask();
@@ -1001,8 +1120,6 @@ asmlinkage void __init xen_start_kernel(void)
set_xen_basic_apic_ops();
#endif
- xen_setup_features();
-
if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
@@ -1019,22 +1136,8 @@ asmlinkage void __init xen_start_kernel(void)
xen_smp_init();
- /* Get mfn list */
- if (!xen_feature(XENFEAT_auto_translated_physmap))
- xen_build_dynamic_phys_to_machine();
-
pgd = (pgd_t *)xen_start_info->pt_base;
- /* Prevent unwanted bits from being set in PTEs. */
- __supported_pte_mask &= ~_PAGE_GLOBAL;
- if (!xen_initial_domain())
- __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
-
-#ifdef CONFIG_X86_64
- /* Work out if we support NX */
- check_efer();
-#endif
-
/* Don't do the full vcpu_info placement stuff until we have a
possible map and a non-dummy shared_info. */
per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index cfd17799bd6..9d30105a0c4 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -1,5 +1,7 @@
#include <linux/hardirq.h>
+#include <asm/x86_init.h>
+
#include <xen/interface/xen.h>
#include <xen/interface/sched.h>
#include <xen/interface/vcpu.h>
@@ -112,8 +114,6 @@ static void xen_halt(void)
}
static const struct pv_irq_ops xen_irq_ops __initdata = {
- .init_IRQ = xen_init_IRQ,
-
.save_fl = PV_CALLEE_SAVE(xen_save_fl),
.restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
.irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
@@ -129,4 +129,5 @@ static const struct pv_irq_ops xen_irq_ops __initdata = {
void __init xen_init_irq_ops()
{
pv_irq_ops = xen_irq_ops;
+ x86_init.irqs.intr_init = xen_init_IRQ;
}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 4ceb2858165..3bf7b1d250c 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1165,14 +1165,14 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
/* Get the "official" set of cpus referring to our pagetable. */
if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
for_each_online_cpu(cpu) {
- if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask)
+ if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
&& per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
continue;
smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
}
return;
}
- cpumask_copy(mask, &mm->cpu_vm_mask);
+ cpumask_copy(mask, mm_cpumask(mm));
/* It's possible that a vcpu may have a stale reference to our
cr3, because its in lazy mode, and it hasn't yet flushed
@@ -1229,9 +1229,12 @@ static __init void xen_pagetable_setup_start(pgd_t *base)
{
}
+static void xen_post_allocator_init(void);
+
static __init void xen_pagetable_setup_done(pgd_t *base)
{
xen_setup_shared_info();
+ xen_post_allocator_init();
}
static void xen_write_cr2(unsigned long cr2)
@@ -1841,7 +1844,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#endif
}
-__init void xen_post_allocator_init(void)
+static __init void xen_post_allocator_init(void)
{
pv_mmu_ops.set_pte = xen_set_pte;
pv_mmu_ops.set_pmd = xen_set_pmd;
@@ -1875,10 +1878,7 @@ static void xen_leave_lazy_mmu(void)
preempt_enable();
}
-const struct pv_mmu_ops xen_mmu_ops __initdata = {
- .pagetable_setup_start = xen_pagetable_setup_start,
- .pagetable_setup_done = xen_pagetable_setup_done,
-
+static const struct pv_mmu_ops xen_mmu_ops __initdata = {
.read_cr2 = xen_read_cr2,
.write_cr2 = xen_write_cr2,
@@ -1954,6 +1954,12 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
.set_fixmap = xen_set_fixmap,
};
+void __init xen_init_mmu_ops(void)
+{
+ x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
+ x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
+ pv_mmu_ops = xen_mmu_ops;
+}
#ifdef CONFIG_XEN_DEBUG_FS
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index da730262489..5fe6bc7f5ec 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -59,5 +59,5 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
unsigned long xen_read_cr2_direct(void);
-extern const struct pv_mmu_ops xen_mmu_ops;
+extern void xen_init_mmu_ops(void);
#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 429834ec168..fe03eeed7b4 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -236,6 +236,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
ctxt->user_regs.ss = __KERNEL_DS;
#ifdef CONFIG_X86_32
ctxt->user_regs.fs = __KERNEL_PERCPU;
+ ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
#else
ctxt->gs_base_kernel = per_cpu_offset(cpu);
#endif
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 5601506f2dd..36a5141108d 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -187,7 +187,6 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl
struct xen_spinlock *prev;
int irq = __get_cpu_var(lock_kicker_irq);
int ret;
- unsigned long flags;
u64 start;
/* If kicker interrupts not initialized yet, just spin */
@@ -199,16 +198,12 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl
/* announce we're spinning */
prev = spinning_lock(xl);
- flags = __raw_local_save_flags();
- if (irq_enable) {
- ADD_STATS(taken_slow_irqenable, 1);
- raw_local_irq_enable();
- }
-
ADD_STATS(taken_slow, 1);
ADD_STATS(taken_slow_nested, prev != NULL);
do {
+ unsigned long flags;
+
/* clear pending */
xen_clear_irq_pending(irq);
@@ -228,6 +223,12 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl
goto out;
}
+ flags = __raw_local_save_flags();
+ if (irq_enable) {
+ ADD_STATS(taken_slow_irqenable, 1);
+ raw_local_irq_enable();
+ }
+
/*
* Block until irq becomes pending. If we're
* interrupted at this point (after the trylock but
@@ -238,13 +239,15 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl
* pending.
*/
xen_poll_irq(irq);
+
+ raw_local_irq_restore(flags);
+
ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
} while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
out:
- raw_local_irq_restore(flags);
unspinning_lock(xl, prev);
spin_time_accum_blocked(start);
@@ -323,8 +326,13 @@ static void xen_spin_unlock(struct raw_spinlock *lock)
smp_wmb(); /* make sure no writes get moved after unlock */
xl->lock = 0; /* release lock */
- /* make sure unlock happens before kick */
- barrier();
+ /*
+ * Make sure unlock happens before checking for waiting
+ * spinners. We need a strong barrier to enforce the
+ * write-read ordering to different memory locations, as the
+ * CPU makes no implied guarantees about their ordering.
+ */
+ mb();
if (unlikely(xl->spinners))
xen_spin_unlock_slow(xl);
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 22494fd4c9b..355fa6b99c9 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -30,8 +30,6 @@ pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
void xen_ident_map_ISA(void);
void xen_reserve_top(void);
-void xen_post_allocator_init(void);
-
char * __init xen_memory_setup(void);
void __init xen_arch_setup(void);
void __init xen_init_IRQ(void);