diff options
Diffstat (limited to 'arch')
34 files changed, 1251 insertions, 356 deletions
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index ce0bce472455..7b4201294187 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -1155,7 +1155,8 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) */ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) { - struct kvm_memory_slot *memslot = id_to_memslot(kvm->memslots, slot); + struct kvm_memslots *slots = kvm_memslots(kvm); + struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); phys_addr_t start = memslot->base_gfn << PAGE_SHIFT; phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; @@ -1718,8 +1719,9 @@ out: } void kvm_arch_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, + const struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, + const struct kvm_memory_slot *new, enum kvm_mr_change change) { /* @@ -1733,7 +1735,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_userspace_memory_region *mem, + const struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { hva_t hva = mem->userspace_addr; @@ -1838,7 +1840,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, return 0; } -void kvm_arch_memslots_updated(struct kvm *kvm) +void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) { } diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 4c25823563fe..e8c8d9d0c45f 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -839,7 +839,7 @@ static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {} -static inline void kvm_arch_memslots_updated(struct kvm *kvm) {} +static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {} static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) {} diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index a8e660a44474..cd4c129ce743 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -198,15 +198,16 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_userspace_memory_region *mem, + const struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { return 0; } void kvm_arch_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, + const struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, + const struct kvm_memory_slot *new, enum kvm_mr_change change) { unsigned long npages = 0; @@ -968,6 +969,7 @@ out: /* Get (and clear) the dirty memory log for a memory slot. */ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { + struct kvm_memslots *slots; struct kvm_memory_slot *memslot; unsigned long ga, ga_end; int is_dirty = 0; @@ -982,7 +984,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) /* If nothing is dirty, don't bother messing with page tables. */ if (is_dirty) { - memslot = id_to_memslot(kvm->memslots, log->slot); + slots = kvm_memslots(kvm); + memslot = id_to_memslot(slots, log->slot); ga = memslot->base_gfn << PAGE_SHIFT; ga_end = ga + (memslot->npages << PAGE_SHIFT); diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 3536d12eb798..2aa79c864e91 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -430,7 +430,7 @@ static inline void note_hpte_modification(struct kvm *kvm, */ static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm) { - return rcu_dereference_raw_notrace(kvm->memslots); + return rcu_dereference_raw_notrace(kvm->memslots[0]); } extern void kvmppc_mmu_debugfs_init(struct kvm *kvm); diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index a193a13cf08b..d91f65b28e32 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -698,7 +698,7 @@ struct kvm_vcpu_arch { static inline void kvm_arch_hardware_disable(void) {} static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} -static inline void kvm_arch_memslots_updated(struct kvm *kvm) {} +static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {} static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_exit(void) {} diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index b8475daad884..c6ef05bd0765 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -182,10 +182,11 @@ extern int kvmppc_core_create_memslot(struct kvm *kvm, unsigned long npages); extern int kvmppc_core_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_userspace_memory_region *mem); + const struct kvm_userspace_memory_region *mem); extern void kvmppc_core_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old); + const struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old, + const struct kvm_memory_slot *new); extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info); extern void kvmppc_core_flush_memslot(struct kvm *kvm, @@ -243,10 +244,11 @@ struct kvmppc_ops { void (*flush_memslot)(struct kvm *kvm, struct kvm_memory_slot *memslot); int (*prepare_memory_region)(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_userspace_memory_region *mem); + const struct kvm_userspace_memory_region *mem); void (*commit_memory_region)(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old); + const struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old, + const struct kvm_memory_slot *new); int (*unmap_hva)(struct kvm *kvm, unsigned long hva); int (*unmap_hva_range)(struct kvm *kvm, unsigned long start, unsigned long end); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 453a8a47a467..05ea8fc7f829 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -757,16 +757,17 @@ void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) int kvmppc_core_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_userspace_memory_region *mem) + const struct kvm_userspace_memory_region *mem) { return kvm->arch.kvm_ops->prepare_memory_region(kvm, memslot, mem); } void kvmppc_core_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old) + const struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old, + const struct kvm_memory_slot *new) { - kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old); + kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new); } int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 1a4acf8bf4f4..dab68b7af3f2 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -650,7 +650,7 @@ static void kvmppc_rmap_reset(struct kvm *kvm) int srcu_idx; srcu_idx = srcu_read_lock(&kvm->srcu); - slots = kvm->memslots; + slots = kvm_memslots(kvm); kvm_for_each_memslot(memslot, slots) { /* * This assumes it is acceptable to lose reference and diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 48d3c5d2ecc9..68d067ad4222 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1952,7 +1952,7 @@ static void post_guest_process(struct kvmppc_vcore *vc) */ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) { - struct kvm_vcpu *vcpu; + struct kvm_vcpu *vcpu, *vnext; int i; int srcu_idx; @@ -1982,7 +1982,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) */ if ((threads_per_core > 1) && ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) { - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { + list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, + arch.run_list) { vcpu->arch.ret = -EBUSY; kvmppc_remove_runnable(vc, vcpu); wake_up(&vcpu->arch.cpu_run); @@ -2320,6 +2321,7 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm, static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, struct kvm_dirty_log *log) { + struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int r; unsigned long n; @@ -2330,7 +2332,8 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, if (log->slot >= KVM_USER_MEM_SLOTS) goto out; - memslot = id_to_memslot(kvm->memslots, log->slot); + slots = kvm_memslots(kvm); + memslot = id_to_memslot(slots, log->slot); r = -ENOENT; if (!memslot->dirty_bitmap) goto out; @@ -2373,16 +2376,18 @@ static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot, static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_userspace_memory_region *mem) + const struct kvm_userspace_memory_region *mem) { return 0; } static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old) + const struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old, + const struct kvm_memory_slot *new) { unsigned long npages = mem->memory_size >> PAGE_SHIFT; + struct kvm_memslots *slots; struct kvm_memory_slot *memslot; if (npages && old->npages) { @@ -2392,7 +2397,8 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, * since the rmap array starts out as all zeroes, * i.e. no pages are dirty. */ - memslot = id_to_memslot(kvm->memslots, mem->slot); + slots = kvm_memslots(kvm); + memslot = id_to_memslot(slots, mem->slot); kvmppc_hv_get_dirty_log(kvm, memslot, NULL); } } diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index f57383941d03..64891b081ad5 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -1530,6 +1530,7 @@ out: static int kvm_vm_ioctl_get_dirty_log_pr(struct kvm *kvm, struct kvm_dirty_log *log) { + struct kvm_memslots *slots; struct kvm_memory_slot *memslot; struct kvm_vcpu *vcpu; ulong ga, ga_end; @@ -1545,7 +1546,8 @@ static int kvm_vm_ioctl_get_dirty_log_pr(struct kvm *kvm, /* If nothing is dirty, don't bother messing with page tables. */ if (is_dirty) { - memslot = id_to_memslot(kvm->memslots, log->slot); + slots = kvm_memslots(kvm); + memslot = id_to_memslot(slots, log->slot); ga = memslot->base_gfn << PAGE_SHIFT; ga_end = ga + (memslot->npages << PAGE_SHIFT); @@ -1571,14 +1573,15 @@ static void kvmppc_core_flush_memslot_pr(struct kvm *kvm, static int kvmppc_core_prepare_memory_region_pr(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_userspace_memory_region *mem) + const struct kvm_userspace_memory_region *mem) { return 0; } static void kvmppc_core_commit_memory_region_pr(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old) + const struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old, + const struct kvm_memory_slot *new) { return; } diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 3872ab31c80a..cc5842657161 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1784,14 +1784,15 @@ int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, int kvmppc_core_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_userspace_memory_region *mem) + const struct kvm_userspace_memory_region *mem) { return 0; } void kvmppc_core_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old) + const struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old, + const struct kvm_memory_slot *new) { } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 8cd1f80fdc70..e5dde32fe71f 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -595,18 +595,19 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_userspace_memory_region *mem, + const struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { return kvmppc_core_prepare_memory_region(kvm, memslot, mem); } void kvm_arch_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, + const struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, + const struct kvm_memory_slot *new, enum kvm_mr_change change) { - kvmppc_core_commit_memory_region(kvm, mem, old); + kvmppc_core_commit_memory_region(kvm, mem, old, new); } void kvm_arch_flush_shadow_memslot(struct kvm *kvm, diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 444c412307cb..3024acbe1f9d 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -636,7 +636,7 @@ static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {} -static inline void kvm_arch_memslots_updated(struct kvm *kvm) {} +static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {} static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) {} diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d461f8a15c07..71530a43a728 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -36,6 +36,10 @@ #include "kvm-s390.h" #include "gaccess.h" +#define KMSG_COMPONENT "kvm-s390" +#undef pr_fmt +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #define CREATE_TRACE_POINTS #include "trace.h" #include "trace-s390.h" @@ -236,6 +240,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, { int r; unsigned long n; + struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int is_dirty = 0; @@ -245,7 +250,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, if (log->slot >= KVM_USER_MEM_SLOTS) goto out; - memslot = id_to_memslot(kvm->memslots, log->slot); + slots = kvm_memslots(kvm); + memslot = id_to_memslot(slots, log->slot); r = -ENOENT; if (!memslot->dirty_bitmap) goto out; @@ -1417,6 +1423,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu) { atomic_set_mask(PROG_BLOCK_SIE, &vcpu->arch.sie_block->prog20); + exit_sie(vcpu); } void kvm_s390_vcpu_unblock(struct kvm_vcpu *vcpu) @@ -1427,6 +1434,7 @@ void kvm_s390_vcpu_unblock(struct kvm_vcpu *vcpu) static void kvm_s390_vcpu_request(struct kvm_vcpu *vcpu) { atomic_set_mask(PROG_REQUEST, &vcpu->arch.sie_block->prog20); + exit_sie(vcpu); } static void kvm_s390_vcpu_request_handled(struct kvm_vcpu *vcpu) @@ -1450,7 +1458,6 @@ void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu) { kvm_make_request(req, vcpu); kvm_s390_vcpu_request(vcpu); - exit_sie(vcpu); } static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address) @@ -2087,7 +2094,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) { kvm_s390_vcpu_start(vcpu); } else if (is_vcpu_stopped(vcpu)) { - pr_err_ratelimited("kvm-s390: can't run stopped vcpu %d\n", + pr_err_ratelimited("can't run stopped vcpu %d\n", vcpu->vcpu_id); return -EINVAL; } @@ -2580,7 +2587,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, /* Section: memory related */ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_userspace_memory_region *mem, + const struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { /* A few sanity checks. We can have memory slots which have to be @@ -2598,8 +2605,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, } void kvm_arch_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, + const struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, + const struct kvm_memory_slot *new, enum kvm_mr_change change) { int rc; @@ -2618,7 +2626,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, rc = gmap_map_segment(kvm->arch.gmap, mem->userspace_addr, mem->guest_phys_addr, mem->memory_size); if (rc) - printk(KERN_WARNING "kvm-s390: failed to commit memory region\n"); + pr_warn("failed to commit memory region\n"); return; } diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 57a9d94fe160..e16466ec473c 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -193,6 +193,8 @@ struct x86_emulate_ops { int (*cpl)(struct x86_emulate_ctxt *ctxt); int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); + u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt); + void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase); int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc); @@ -262,6 +264,11 @@ enum x86emul_mode { X86EMUL_MODE_PROT64, /* 64-bit (long) mode. */ }; +/* These match some of the HF_* flags defined in kvm_host.h */ +#define X86EMUL_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ +#define X86EMUL_SMM_MASK (1 << 6) +#define X86EMUL_SMM_INSIDE_NMI_MASK (1 << 7) + struct x86_emulate_ctxt { const struct x86_emulate_ops *ops; @@ -273,8 +280,8 @@ struct x86_emulate_ctxt { /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; + int emul_flags; - bool guest_mode; /* guest running a nested guest */ bool perm_ok; /* do not check permissions if true */ bool ud; /* inject an #UD if host doesn't support insn */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index bbb8f4e7738a..8ca32cfbcbd8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -184,23 +184,12 @@ struct kvm_mmu_memory_cache { void *objects[KVM_NR_MEM_OBJS]; }; -/* - * kvm_mmu_page_role, below, is defined as: - * - * bits 0:3 - total guest paging levels (2-4, or zero for real mode) - * bits 4:7 - page table level for this shadow (1-4) - * bits 8:9 - page table quadrant for 2-level guests - * bit 16 - direct mapping of virtual to physical mapping at gfn - * used for real mode and two-dimensional paging - * bits 17:19 - common access permissions for all ptes in this shadow page - */ union kvm_mmu_page_role { unsigned word; struct { unsigned level:4; unsigned cr4_pae:1; unsigned quadrant:2; - unsigned pad_for_nice_hex_output:6; unsigned direct:1; unsigned access:3; unsigned invalid:1; @@ -208,6 +197,15 @@ union kvm_mmu_page_role { unsigned cr0_wp:1; unsigned smep_andnot_wp:1; unsigned smap_andnot_wp:1; + unsigned :8; + + /* + * This is left at the top of the word so that + * kvm_memslots_for_spte_role can extract it with a + * simple shift. While there is room, give it a whole + * byte so it is also faster to load it from memory. + */ + unsigned smm:8; }; }; @@ -368,6 +366,7 @@ struct kvm_vcpu_arch { int32_t apic_arb_prio; int mp_state; u64 ia32_misc_enable_msr; + u64 smbase; bool tpr_access_reporting; u64 ia32_xss; @@ -401,6 +400,7 @@ struct kvm_vcpu_arch { struct kvm_mmu_memory_cache mmu_page_header_cache; struct fpu guest_fpu; + bool eager_fpu; u64 xcr0; u64 guest_supported_xcr0; u32 guest_xstate_size; @@ -470,6 +470,7 @@ struct kvm_vcpu_arch { atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ unsigned nmi_pending; /* NMI queued after currently running handler */ bool nmi_injected; /* Trying to inject an NMI this entry */ + bool smi_pending; /* SMI queued after currently running handler */ struct mtrr_state_type mtrr_state; u64 pat; @@ -708,6 +709,7 @@ struct kvm_x86_ops { int (*hardware_setup)(void); /* __init */ void (*hardware_unsetup)(void); /* __exit */ bool (*cpu_has_accelerated_tpr)(void); + bool (*cpu_has_high_real_mode_segbase)(void); void (*cpuid_update)(struct kvm_vcpu *vcpu); /* Create, but do not attach this VCPU */ @@ -720,7 +722,7 @@ struct kvm_x86_ops { void (*vcpu_put)(struct kvm_vcpu *vcpu); void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu); - int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); + int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); void (*get_segment)(struct kvm_vcpu *vcpu, @@ -747,6 +749,7 @@ struct kvm_x86_ops { void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); + void (*fpu_activate)(struct kvm_vcpu *vcpu); void (*fpu_deactivate)(struct kvm_vcpu *vcpu); void (*tlb_flush)(struct kvm_vcpu *vcpu); @@ -872,7 +875,7 @@ void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, struct kvm_memory_slot *memslot); void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - struct kvm_memory_slot *memslot); + const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot); void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, @@ -883,7 +886,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); -void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm); +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); @@ -939,7 +942,7 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu, void kvm_enable_efer_bits(u64); bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); -int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); +int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); struct x86_emulate_ctxt; @@ -968,7 +971,7 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr); -int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); +int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); @@ -1113,6 +1116,14 @@ enum { #define HF_NMI_MASK (1 << 3) #define HF_IRET_MASK (1 << 4) #define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ +#define HF_SMM_MASK (1 << 6) +#define HF_SMM_INSIDE_NMI_MASK (1 << 7) + +#define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE +#define KVM_ADDRESS_SPACE_NUM 2 + +#define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) +#define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) /* * Hardware virtualization extension instructions may fault if a @@ -1183,4 +1194,9 @@ int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); void kvm_handle_pmu_event(struct kvm_vcpu *vcpu); void kvm_deliver_pmi(struct kvm_vcpu *vcpu); +int __x86_set_memory_region(struct kvm *kvm, + const struct kvm_userspace_memory_region *mem); +int x86_set_memory_region(struct kvm *kvm, + const struct kvm_userspace_memory_region *mem); + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h index 6167fd798188..655e07a48f6c 100644 --- a/arch/x86/include/asm/pvclock-abi.h +++ b/arch/x86/include/asm/pvclock-abi.h @@ -41,5 +41,6 @@ struct pvclock_wall_clock { #define PVCLOCK_TSC_STABLE_BIT (1 << 0) #define PVCLOCK_GUEST_STOPPED (1 << 1) +#define PVCLOCK_COUNTS_FROM_ZERO (1 << 2) #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PVCLOCK_ABI_H */ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 2fec75e4b1e1..a4ae82eb82aa 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -106,6 +106,8 @@ struct kvm_ioapic_state { #define KVM_IRQCHIP_IOAPIC 2 #define KVM_NR_IRQCHIPS 3 +#define KVM_RUN_X86_SMM (1 << 0) + /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ @@ -281,6 +283,7 @@ struct kvm_reinject_control { #define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001 #define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002 #define KVM_VCPUEVENT_VALID_SHADOW 0x00000004 +#define KVM_VCPUEVENT_VALID_SMM 0x00000008 /* Interrupt shadow states */ #define KVM_X86_SHADOW_INT_MOV_SS 0x01 @@ -309,7 +312,13 @@ struct kvm_vcpu_events { } nmi; __u32 sipi_vector; __u32 flags; - __u32 reserved[10]; + struct { + __u8 smm; + __u8 pending; + __u8 smm_inside_nmi; + __u8 latched_init; + } smi; + __u32 reserved[9]; }; /* for KVM_GET/SET_DEBUGREGS */ diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 9435620062df..cc34cece853e 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -331,7 +331,7 @@ static void kvm_guest_apic_eoi_write(u32 reg, u32 val) apic_write(APIC_EOI, APIC_EOI_ACK); } -void kvm_guest_cpu_init(void) +static void kvm_guest_cpu_init(void) { if (!kvm_para_available()) return; @@ -655,7 +655,7 @@ static inline void spin_time_accum_blocked(u64 start) static struct dentry *d_spin_debug; static struct dentry *d_kvm_debug; -struct dentry *kvm_init_debugfs(void) +static struct dentry *kvm_init_debugfs(void) { d_kvm_debug = debugfs_create_dir("kvm-guest", NULL); if (!d_kvm_debug) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 42caaef897c8..49487b488061 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -24,6 +24,7 @@ #include <linux/percpu.h> #include <linux/hardirq.h> #include <linux/memblock.h> +#include <linux/sched.h> #include <asm/x86_init.h> #include <asm/reboot.h> @@ -217,8 +218,10 @@ static void kvm_shutdown(void) void __init kvmclock_init(void) { + struct pvclock_vcpu_time_info *vcpu_time; unsigned long mem; - int size; + int size, cpu; + u8 flags; size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); @@ -264,7 +267,14 @@ void __init kvmclock_init(void) pv_info.name = "KVM"; if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) - pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); + pvclock_set_flags(~0); + + cpu = get_cpu(); + vcpu_time = &hv_clock[cpu].pvti; + flags = pvclock_read_flags(vcpu_time); + if (flags & PVCLOCK_COUNTS_FROM_ZERO) + set_sched_clock_stable(); + put_cpu(); } int __init kvm_setup_vsyscall_timeinfo(void) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 413a7bf9efbb..d8a1d56276e1 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -86,15 +86,16 @@ config KVM_MMU_AUDIT auditing of KVM MMU events at runtime. config KVM_DEVICE_ASSIGNMENT - bool "KVM legacy PCI device assignment support" + bool "KVM legacy PCI device assignment support (DEPRECATED)" depends on KVM && PCI && IOMMU_API - default y + default n ---help--- Provide support for legacy PCI device assignment through KVM. The kernel now also supports a full featured userspace device driver - framework through VFIO, which supersedes much of this support. + framework through VFIO, which supersedes this support and provides + better security. - If unsure, say Y. + If unsure, say N. # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 59b69f6a2844..9dadf8d67873 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -16,6 +16,8 @@ #include <linux/module.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> +#include <asm/i387.h> /* For use_eager_fpu. Ugh! */ +#include <asm/fpu-internal.h> /* For use_eager_fpu. Ugh! */ #include <asm/user.h> #include <asm/xsave.h> #include "cpuid.h" @@ -95,6 +97,8 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) if (best && (best->eax & (F(XSAVES) | F(XSAVEC)))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); + vcpu->arch.eager_fpu = use_eager_fpu() || guest_cpuid_has_mpx(vcpu); + /* * The existing code assumes virtual address is 48-bit in the canonical * address checks; exit if it is ever changed. @@ -411,6 +415,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } + case 6: /* Thermal management */ + entry->eax = 0x4; /* allow ARAT */ + entry->ebx = 0; + entry->ecx = 0; + entry->edx = 0; + break; case 7: { entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* Mask ebx against host capability word 9 */ @@ -587,7 +597,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, break; case 3: /* Processor serial number */ case 5: /* MONITOR/MWAIT */ - case 6: /* Thermal management */ case 0xC0000002: case 0xC0000003: case 0xC0000004: diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index c3b1ad9fca81..dd05b9cef6ae 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -70,6 +70,14 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); } +static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + return best && (best->edx & bit(X86_FEATURE_LM)); +} + static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -117,4 +125,12 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry(vcpu, 7, 0); return best && (best->ebx & bit(X86_FEATURE_RTM)); } + +static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_MPX)); +} #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 9b655d113fc6..e7a4fde5d631 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2259,6 +2259,260 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt) return rc; } +static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) +{ + u32 eax, ebx, ecx, edx; + + eax = 0x80000001; + ecx = 0; + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + return edx & bit(X86_FEATURE_LM); +} + +#define GET_SMSTATE(type, smbase, offset) \ + ({ \ + type __val; \ + int r = ctxt->ops->read_std(ctxt, smbase + offset, &__val, \ + sizeof(__val), NULL); \ + if (r != X86EMUL_CONTINUE) \ + return X86EMUL_UNHANDLEABLE; \ + __val; \ + }) + +static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags) +{ + desc->g = (flags >> 23) & 1; + desc->d = (flags >> 22) & 1; + desc->l = (flags >> 21) & 1; + desc->avl = (flags >> 20) & 1; + desc->p = (flags >> 15) & 1; + desc->dpl = (flags >> 13) & 3; + desc->s = (flags >> 12) & 1; + desc->type = (flags >> 8) & 15; +} + +static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) +{ + struct desc_struct desc; + int offset; + u16 selector; + + selector = GET_SMSTATE(u32, smbase, 0x7fa8 + n * 4); + + if (n < 3) + offset = 0x7f84 + n * 12; + else + offset = 0x7f2c + (n - 3) * 12; + + set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8)); + set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4)); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, offset)); + ctxt->ops->set_segment(ctxt, selector, &desc, 0, n); + return X86EMUL_CONTINUE; +} + +static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) +{ + struct desc_struct desc; + int offset; + u16 selector; + u32 base3; + + offset = 0x7e00 + n * 16; + + selector = GET_SMSTATE(u16, smbase, offset); + rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smbase, offset + 2) << 8); + set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4)); + set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8)); + base3 = GET_SMSTATE(u32, smbase, offset + 12); + + ctxt->ops->set_segment(ctxt, selector, &desc, base3, n); + return X86EMUL_CONTINUE; +} + +static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, + u64 cr0, u64 cr4) +{ + int bad; + + /* + * First enable PAE, long mode needs it before CR0.PG = 1 is set. + * Then enable protected mode. However, PCID cannot be enabled + * if EFER.LMA=0, so set it separately. + */ + bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); + if (bad) + return X86EMUL_UNHANDLEABLE; + + bad = ctxt->ops->set_cr(ctxt, 0, cr0); + if (bad) + return X86EMUL_UNHANDLEABLE; + + if (cr4 & X86_CR4_PCIDE) { + bad = ctxt->ops->set_cr(ctxt, 4, cr4); + if (bad) + return X86EMUL_UNHANDLEABLE; + } + + return X86EMUL_CONTINUE; +} + +static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) +{ + struct desc_struct desc; + struct desc_ptr dt; + u16 selector; + u32 val, cr0, cr4; + int i; + + cr0 = GET_SMSTATE(u32, smbase, 0x7ffc); + ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8)); + ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED; + ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0); + + for (i = 0; i < 8; i++) + *reg_write(ctxt, i) = GET_SMSTATE(u32, smbase, 0x7fd0 + i * 4); + + val = GET_SMSTATE(u32, smbase, 0x7fcc); + ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1); + val = GET_SMSTATE(u32, smbase, 0x7fc8); + ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); + + selector = GET_SMSTATE(u32, smbase, 0x7fc4); + set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f64)); + set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f60)); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f5c)); + ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR); + + selector = GET_SMSTATE(u32, smbase, 0x7fc0); + set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f80)); + set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f7c)); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f78)); + ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR); + + dt.address = GET_SMSTATE(u32, smbase, 0x7f74); + dt.size = GET_SMSTATE(u32, smbase, 0x7f70); + ctxt->ops->set_gdt(ctxt, &dt); + + dt.address = GET_SMSTATE(u32, smbase, 0x7f58); + dt.size = GET_SMSTATE(u32, smbase, 0x7f54); + ctxt->ops->set_idt(ctxt, &dt); + + for (i = 0; i < 6; i++) { + int r = rsm_load_seg_32(ctxt, smbase, i); + if (r != X86EMUL_CONTINUE) + return r; + } + + cr4 = GET_SMSTATE(u32, smbase, 0x7f14); + + ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8)); + + return rsm_enter_protected_mode(ctxt, cr0, cr4); +} + +static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) +{ + struct desc_struct desc; + struct desc_ptr dt; + u64 val, cr0, cr4; + u32 base3; + u16 selector; + int i; + + for (i = 0; i < 16; i++) + *reg_write(ctxt, i) = GET_SMSTATE(u64, smbase, 0x7ff8 - i * 8); + + ctxt->_eip = GET_SMSTATE(u64, smbase, 0x7f78); + ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7f70) | X86_EFLAGS_FIXED; + + val = GET_SMSTATE(u32, smbase, 0x7f68); + ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1); + val = GET_SMSTATE(u32, smbase, 0x7f60); + ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); + + cr0 = GET_SMSTATE(u64, smbase, 0x7f58); + ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u64, smbase, 0x7f50)); + cr4 = GET_SMSTATE(u64, smbase, 0x7f48); + ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00)); + val = GET_SMSTATE(u64, smbase, 0x7ed0); + ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA); + + selector = GET_SMSTATE(u32, smbase, 0x7e90); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e92) << 8); + set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e94)); + set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e98)); + base3 = GET_SMSTATE(u32, smbase, 0x7e9c); + ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR); + + dt.size = GET_SMSTATE(u32, smbase, 0x7e84); + dt.address = GET_SMSTATE(u64, smbase, 0x7e88); + ctxt->ops->set_idt(ctxt, &dt); + + selector = GET_SMSTATE(u32, smbase, 0x7e70); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e72) << 8); + set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e74)); + set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e78)); + base3 = GET_SMSTATE(u32, smbase, 0x7e7c); + ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR); + + dt.size = GET_SMSTATE(u32, smbase, 0x7e64); + dt.address = GET_SMSTATE(u64, smbase, 0x7e68); + ctxt->ops->set_gdt(ctxt, &dt); + + for (i = 0; i < 6; i++) { + int r = rsm_load_seg_64(ctxt, smbase, i); + if (r != X86EMUL_CONTINUE) + return r; + } + + return rsm_enter_protected_mode(ctxt, cr0, cr4); +} + +static int em_rsm(struct x86_emulate_ctxt *ctxt) +{ + unsigned long cr0, cr4, efer; + u64 smbase; + int ret; + + if ((ctxt->emul_flags & X86EMUL_SMM_MASK) == 0) + return emulate_ud(ctxt); + + /* + * Get back to real mode, to prepare a safe state in which to load + * CR0/CR3/CR4/EFER. Also this will ensure that addresses passed + * to read_std/write_std are not virtual. + * + * CR4.PCIDE must be zero, because it is a 64-bit mode only feature. + */ + cr0 = ctxt->ops->get_cr(ctxt, 0); + if (cr0 & X86_CR0_PE) + ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); + cr4 = ctxt->ops->get_cr(ctxt, 4); + if (cr4 & X86_CR4_PAE) + ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); + efer = 0; + ctxt->ops->set_msr(ctxt, MSR_EFER, efer); + + smbase = ctxt->ops->get_smbase(ctxt); + if (emulator_has_longmode(ctxt)) + ret = rsm_load_state_64(ctxt, smbase + 0x8000); + else + ret = rsm_load_state_32(ctxt, smbase + 0x8000); + + if (ret != X86EMUL_CONTINUE) { + /* FIXME: should triple fault */ + return X86EMUL_UNHANDLEABLE; + } + + if ((ctxt->emul_flags & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) + ctxt->ops->set_nmi_mask(ctxt, false); + + ctxt->emul_flags &= ~X86EMUL_SMM_INSIDE_NMI_MASK; + ctxt->emul_flags &= ~X86EMUL_SMM_MASK; + return X86EMUL_CONTINUE; +} + static void setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, struct desc_struct *cs, struct desc_struct *ss) @@ -4197,7 +4451,7 @@ static const struct opcode twobyte_table[256] = { F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N, /* 0xA8 - 0xAF */ I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), - DI(ImplicitOps, rsm), + II(No64 | EmulateOnUD | ImplicitOps, em_rsm, rsm), F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd), F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), @@ -4895,7 +5149,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) fetch_possible_mmx_operand(ctxt, &ctxt->dst); } - if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) { + if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_PRE_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -4924,7 +5178,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) { + if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -4978,7 +5232,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: - if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) { + if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_MEMACCESS); if (rc != X86EMUL_CONTINUE) diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 544076c4f44b..e1e89ee4af75 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -99,4 +99,9 @@ static inline bool is_guest_mode(struct kvm_vcpu *vcpu) return vcpu->arch.hflags & HF_GUEST_MASK; } +static inline bool is_smm(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.hflags & HF_SMM_MASK; +} + #endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index dc5b57bb1f76..beeef05bb4d9 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -240,6 +240,15 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) recalculate_apic_map(apic->vcpu->kvm); } +static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u8 id) +{ + u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); + + apic_set_reg(apic, APIC_ID, id << 24); + apic_set_reg(apic, APIC_LDR, ldr); + recalculate_apic_map(apic->vcpu->kvm); +} + static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) { return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); @@ -799,7 +808,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; case APIC_DM_SMI: - apic_debug("Ignoring guest SMI\n"); + result = 1; + kvm_make_request(KVM_REQ_SMI, vcpu); + kvm_vcpu_kick(vcpu); break; case APIC_DM_NMI: @@ -1538,9 +1549,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if ((old_value ^ value) & X2APIC_ENABLE) { if (value & X2APIC_ENABLE) { - u32 id = kvm_apic_id(apic); - u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); - kvm_apic_set_ldr(apic, ldr); + kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id); kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true); } else kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false); @@ -1587,7 +1596,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) apic_set_reg(apic, APIC_DFR, 0xffffffffU); apic_set_spiv(apic, 0xff); apic_set_reg(apic, APIC_TASKPRI, 0); - kvm_apic_set_ldr(apic, 0); + if (!apic_x2apic_mode(apic)) + kvm_apic_set_ldr(apic, 0); apic_set_reg(apic, APIC_ESR, 0); apic_set_reg(apic, APIC_ICR, 0); apic_set_reg(apic, APIC_ICR2, 0); @@ -2047,8 +2057,19 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) if (!kvm_vcpu_has_lapic(vcpu) || !apic->pending_events) return; - pe = xchg(&apic->pending_events, 0); + /* + * INITs are latched while in SMM. Because an SMM CPU cannot + * be in KVM_MP_STATE_INIT_RECEIVED state, just eat SIPIs + * and delay processing of INIT until the next RSM. + */ + if (is_smm(vcpu)) { + WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED); + if (test_bit(KVM_APIC_SIPI, &apic->pending_events)) + clear_bit(KVM_APIC_SIPI, &apic->pending_events); + return; + } + pe = xchg(&apic->pending_events, 0); if (test_bit(KVM_APIC_INIT, &pe)) { kvm_lapic_reset(vcpu, true); kvm_vcpu_reset(vcpu, true); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 71b150cae5f9..f2f4e10ab772 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -150,7 +150,7 @@ static inline bool kvm_apic_vid_enabled(struct kvm *kvm) static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) { - return vcpu->arch.apic->pending_events; + return kvm_vcpu_has_lapic(vcpu) && vcpu->arch.apic->pending_events; } static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq) @@ -159,6 +159,11 @@ static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq) irq->msi_redir_hint); } +static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) +{ + return kvm_vcpu_has_lapic(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); +} + bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); void wait_lapic_expire(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 49c34e632b91..c88f0e443669 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -223,15 +223,15 @@ static unsigned int get_mmio_spte_generation(u64 spte) return gen; } -static unsigned int kvm_current_mmio_generation(struct kvm *kvm) +static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu) { - return kvm_memslots(kvm)->generation & MMIO_GEN_MASK; + return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK; } -static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn, +static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned access) { - unsigned int gen = kvm_current_mmio_generation(kvm); + unsigned int gen = kvm_current_mmio_generation(vcpu); u64 mask = generation_mmio_spte_mask(gen); access &= ACC_WRITE_MASK | ACC_USER_MASK; @@ -258,22 +258,22 @@ static unsigned get_mmio_spte_access(u64 spte) return (spte & ~mask) & ~PAGE_MASK; } -static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, +static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access) { if (unlikely(is_noslot_pfn(pfn))) { - mark_mmio_spte(kvm, sptep, gfn, access); + mark_mmio_spte(vcpu, sptep, gfn, access); return true; } return false; } -static bool check_mmio_spte(struct kvm *kvm, u64 spte) +static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) { unsigned int kvm_gen, spte_gen; - kvm_gen = kvm_current_mmio_generation(kvm); + kvm_gen = kvm_current_mmio_generation(vcpu); spte_gen = get_mmio_spte_generation(spte); trace_check_mmio_spte(spte, kvm_gen, spte_gen); @@ -804,13 +804,17 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, return &slot->arch.lpage_info[level - 2][idx]; } -static void account_shadowed(struct kvm *kvm, gfn_t gfn) +static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { + struct kvm_memslots *slots; struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; + gfn_t gfn; int i; - slot = gfn_to_memslot(kvm, gfn); + gfn = sp->gfn; + slots = kvm_memslots_for_spte_role(kvm, sp->role); + slot = __gfn_to_memslot(slots, gfn); for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); linfo->write_count += 1; @@ -818,13 +822,17 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn) kvm->arch.indirect_shadow_pages++; } -static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) +static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { + struct kvm_memslots *slots; struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; + gfn_t gfn; int i; - slot = gfn_to_memslot(kvm, gfn); + gfn = sp->gfn; + slots = kvm_memslots_for_spte_role(kvm, sp->role); + slot = __gfn_to_memslot(slots, gfn); for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); linfo->write_count -= 1; @@ -833,14 +841,14 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) kvm->arch.indirect_shadow_pages--; } -static int has_wrprotected_page(struct kvm *kvm, +static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level) { struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; - slot = gfn_to_memslot(kvm, gfn); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (slot) { linfo = lpage_info_slot(gfn, slot, level); return linfo->write_count; @@ -872,7 +880,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, { struct kvm_memory_slot *slot; - slot = gfn_to_memslot(vcpu->kvm, gfn); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (!slot || slot->flags & KVM_MEMSLOT_INVALID || (no_dirty_log && slot->dirty_bitmap)) slot = NULL; @@ -897,7 +905,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) max_level = min(kvm_x86_ops->get_lpage_level(), host_level); for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) - if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) + if (has_wrprotected_page(vcpu, large_gfn, level)) break; return level - 1; @@ -1039,12 +1047,14 @@ static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, /* * Take gfn and return the reverse mapping to it. */ -static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) +static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, struct kvm_mmu_page *sp) { + struct kvm_memslots *slots; struct kvm_memory_slot *slot; - slot = gfn_to_memslot(kvm, gfn); - return __gfn_to_rmap(gfn, level, slot); + slots = kvm_memslots_for_spte_role(kvm, sp->role); + slot = __gfn_to_memslot(slots, gfn); + return __gfn_to_rmap(gfn, sp->role.level, slot); } static bool rmap_can_add(struct kvm_vcpu *vcpu) @@ -1062,7 +1072,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) sp = page_header(__pa(spte)); kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); - rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); + rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp); return pte_list_add(vcpu, spte, rmapp); } @@ -1074,7 +1084,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) sp = page_header(__pa(spte)); gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); - rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); + rmapp = gfn_to_rmap(kvm, gfn, sp); pte_list_remove(spte, rmapp); } @@ -1332,18 +1342,18 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -static bool rmap_write_protect(struct kvm *kvm, u64 gfn) +static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) { struct kvm_memory_slot *slot; unsigned long *rmapp; int i; bool write_protected = false; - slot = gfn_to_memslot(kvm, gfn); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { rmapp = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(kvm, rmapp, true); + write_protected |= __rmap_write_protect(vcpu->kvm, rmapp, true); } return write_protected; @@ -1499,30 +1509,33 @@ static int kvm_handle_hva_range(struct kvm *kvm, struct kvm_memory_slot *memslot; struct slot_rmap_walk_iterator iterator; int ret = 0; + int i; - slots = kvm_memslots(kvm); - - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gfn_start, gfn_end; + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(memslot, slots) { + unsigned long hva_start, hva_end; + gfn_t gfn_start, gfn_end; - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - /* - * {gfn(page) | page intersects with [hva_start, hva_end)} = - * {gfn_start, gfn_start+1, ..., gfn_end-1}. - */ - gfn_start = hva_to_gfn_memslot(hva_start, memslot); - gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - - for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL, - PT_MAX_HUGEPAGE_LEVEL, gfn_start, gfn_end - 1, - &iterator) - ret |= handler(kvm, iterator.rmap, memslot, - iterator.gfn, iterator.level, data); + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + /* + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn_start, gfn_start+1, ..., gfn_end-1}. + */ + gfn_start = hva_to_gfn_memslot(hva_start, memslot); + gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); + + for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL, + PT_MAX_HUGEPAGE_LEVEL, + gfn_start, gfn_end - 1, + &iterator) + ret |= handler(kvm, iterator.rmap, memslot, + iterator.gfn, iterator.level, data); + } } return ret; @@ -1608,7 +1621,7 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) sp = page_header(__pa(spte)); - rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); + rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp); kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, gfn, sp->role.level, 0); kvm_flush_remote_tlbs(vcpu->kvm); @@ -2028,7 +2041,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, bool protected = false; for_each_sp(pages, sp, parents, i) - protected |= rmap_write_protect(vcpu->kvm, sp->gfn); + protected |= rmap_write_protect(vcpu, sp->gfn); if (protected) kvm_flush_remote_tlbs(vcpu->kvm); @@ -2126,12 +2139,12 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, hlist_add_head(&sp->hash_link, &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); if (!direct) { - if (rmap_write_protect(vcpu->kvm, gfn)) + if (rmap_write_protect(vcpu, gfn)) kvm_flush_remote_tlbs(vcpu->kvm); if (level > PT_PAGE_TABLE_LEVEL && need_sync) kvm_sync_pages(vcpu, gfn); - account_shadowed(vcpu->kvm, gfn); + account_shadowed(vcpu->kvm, sp); } sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; init_shadow_page_table(sp); @@ -2312,7 +2325,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, kvm_mmu_unlink_parents(kvm, sp); if (!sp->role.invalid && !sp->role.direct) - unaccount_shadowed(kvm, sp->gfn); + unaccount_shadowed(kvm, sp); if (sp->unsync) kvm_unlink_unsync_page(kvm, sp); @@ -2577,7 +2590,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte; int ret = 0; - if (set_mmio_spte(vcpu->kvm, sptep, gfn, pfn, pte_access)) + if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) return 0; spte = PT_PRESENT_MASK; @@ -2614,7 +2627,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * be fixed if guest refault. */ if (level > PT_PAGE_TABLE_LEVEL && - has_wrprotected_page(vcpu->kvm, gfn, level)) + has_wrprotected_page(vcpu, gfn, level)) goto done; spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; @@ -2638,7 +2651,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } if (pte_access & ACC_WRITE_MASK) { - mark_page_dirty(vcpu->kvm, gfn); + kvm_vcpu_mark_page_dirty(vcpu, gfn); spte |= shadow_dirty_mask; } @@ -2728,15 +2741,17 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, u64 *start, u64 *end) { struct page *pages[PTE_PREFETCH_NUM]; + struct kvm_memory_slot *slot; unsigned access = sp->role.access; int i, ret; gfn_t gfn; gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); - if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK)) + slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK); + if (!slot) return -1; - ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start); + ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start); if (ret <= 0) return -1; @@ -2854,7 +2869,7 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) return 1; if (pfn == KVM_PFN_ERR_HWPOISON) { - kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current); + kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current); return 0; } @@ -2877,7 +2892,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompound(pfn_to_page(pfn)) && - !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { + !has_wrprotected_page(vcpu, gfn, PT_DIRECTORY_LEVEL)) { unsigned long mask; /* * mmu_notifier_retry was successful and we hold the @@ -2969,7 +2984,7 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * Compare with set_spte where instead shadow_dirty_mask is set. */ if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) - mark_page_dirty(vcpu->kvm, gfn); + kvm_vcpu_mark_page_dirty(vcpu, gfn); return true; } @@ -3424,7 +3439,7 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) gfn_t gfn = get_mmio_spte_gfn(spte); unsigned access = get_mmio_spte_access(spte); - if (!check_mmio_spte(vcpu->kvm, spte)) + if (!check_mmio_spte(vcpu, spte)) return RET_MMIO_PF_INVALID; if (direct) @@ -3496,7 +3511,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) arch.direct_map = vcpu->arch.mmu.direct_map; arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); - return kvm_setup_async_pf(vcpu, gva, gfn_to_hva(vcpu->kvm, gfn), &arch); + return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } static bool can_do_async_pf(struct kvm_vcpu *vcpu) @@ -3514,7 +3529,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, struct kvm_memory_slot *slot; bool async; - slot = gfn_to_memslot(vcpu->kvm, gfn); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); async = false; *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable); if (!async) @@ -3627,7 +3642,7 @@ static void inject_page_fault(struct kvm_vcpu *vcpu, vcpu->arch.mmu.inject_page_fault(vcpu, fault); } -static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, +static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, unsigned access, int *nr_present) { if (unlikely(is_mmio_spte(*sptep))) { @@ -3637,7 +3652,7 @@ static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, } (*nr_present)++; - mark_mmio_spte(kvm, sptep, gfn, access); + mark_mmio_spte(vcpu, sptep, gfn, access); return true; } @@ -3915,6 +3930,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) struct kvm_mmu *context = &vcpu->arch.mmu; context->base_role.word = 0; + context->base_role.smm = is_smm(vcpu); context->page_fault = tdp_page_fault; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; @@ -3976,6 +3992,7 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) = smep && !is_write_protection(vcpu); context->base_role.smap_andnot_wp = smap && !is_write_protection(vcpu); + context->base_role.smm = is_smm(vcpu); } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); @@ -4147,7 +4164,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ *gpa &= ~(gpa_t)7; *bytes = 8; - r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, 8); + r = kvm_vcpu_read_guest(vcpu, *gpa, &gentry, 8); if (r) gentry = 0; new = (const u8 *)&gentry; @@ -4252,13 +4269,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, u64 entry, gentry, *spte; int npte; bool remote_flush, local_flush, zap_page; - union kvm_mmu_page_role mask = (union kvm_mmu_page_role) { - .cr0_wp = 1, - .cr4_pae = 1, - .nxe = 1, - .smep_andnot_wp = 1, - .smap_andnot_wp = 1, - }; + union kvm_mmu_page_role mask = { }; + + mask.cr0_wp = 1; + mask.cr4_pae = 1; + mask.nxe = 1; + mask.smep_andnot_wp = 1; + mask.smap_andnot_wp = 1; + mask.smm = 1; /* * If we don't have indirect shadow pages, it means no page is @@ -4530,21 +4548,23 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; - - slots = kvm_memslots(kvm); + int i; spin_lock(&kvm->mmu_lock); - kvm_for_each_memslot(memslot, slots) { - gfn_t start, end; - - start = max(gfn_start, memslot->base_gfn); - end = min(gfn_end, memslot->base_gfn + memslot->npages); - if (start >= end) - continue; + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(memslot, slots) { + gfn_t start, end; + + start = max(gfn_start, memslot->base_gfn); + end = min(gfn_end, memslot->base_gfn + memslot->npages); + if (start >= end) + continue; - slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, - PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL, - start, end - 1, true); + slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, + PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL, + start, end - 1, true); + } } spin_unlock(&kvm->mmu_lock); @@ -4621,10 +4641,12 @@ restart: } void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - struct kvm_memory_slot *memslot) + const struct kvm_memory_slot *memslot) { + /* FIXME: const-ify all uses of struct kvm_memory_slot. */ spin_lock(&kvm->mmu_lock); - slot_handle_leaf(kvm, memslot, kvm_mmu_zap_collapsible_spte, true); + slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot, + kvm_mmu_zap_collapsible_spte, true); spin_unlock(&kvm->mmu_lock); } @@ -4771,13 +4793,13 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); } -void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm) +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots) { /* * The very rare case: if the generation-number is round, * zap all shadow pages. */ - if (unlikely(kvm_current_mmio_generation(kvm) == 0)) { + if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) { printk_ratelimited(KERN_DEBUG "kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_invalidate_zap_all_pages(kvm); } @@ -4899,15 +4921,18 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) unsigned int nr_pages = 0; struct kvm_memslots *slots; struct kvm_memory_slot *memslot; + int i; - slots = kvm_memslots(kvm); + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); - kvm_for_each_memslot(memslot, slots) - nr_pages += memslot->npages; + kvm_for_each_memslot(memslot, slots) + nr_pages += memslot->npages; + } nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; nr_mmu_pages = max(nr_mmu_pages, - (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); + (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); return nr_mmu_pages; } diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 368d53497314..a4f62e6f2db2 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -114,7 +114,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) return; gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); - pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); + pfn = kvm_vcpu_gfn_to_pfn_atomic(vcpu, gfn); if (is_error_pfn(pfn)) return; @@ -131,12 +131,16 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); unsigned long *rmapp; struct kvm_mmu_page *rev_sp; + struct kvm_memslots *slots; + struct kvm_memory_slot *slot; gfn_t gfn; rev_sp = page_header(__pa(sptep)); gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); - if (!gfn_to_memslot(kvm, gfn)) { + slots = kvm_memslots_for_spte_role(kvm, rev_sp->role); + slot = __gfn_to_memslot(slots, gfn); + if (!slot) { if (!__ratelimit(&ratelimit_state)) return; audit_printk(kvm, "no memslot for gfn %llx\n", gfn); @@ -146,7 +150,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) return; } - rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); + rmapp = __gfn_to_rmap(gfn, rev_sp->role.level, slot); if (!*rmapp) { if (!__ratelimit(&ratelimit_state)) return; @@ -191,11 +195,15 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) unsigned long *rmapp; u64 *sptep; struct rmap_iterator iter; + struct kvm_memslots *slots; + struct kvm_memory_slot *slot; if (sp->role.direct || sp->unsync || sp->role.invalid) return; - rmapp = gfn_to_rmap(kvm, sp->gfn, PT_PAGE_TABLE_LEVEL); + slots = kvm_memslots_for_spte_role(kvm, sp->role); + slot = __gfn_to_memslot(slots, sp->gfn); + rmapp = __gfn_to_rmap(sp->gfn, PT_PAGE_TABLE_LEVEL, slot); for_each_rmap_spte(rmapp, &iter, sptep) if (is_writable_pte(*sptep)) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6e6d115fe9b5..0f67d7e24800 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -256,7 +256,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, if (ret) return ret; - mark_page_dirty(vcpu->kvm, table_gfn); + kvm_vcpu_mark_page_dirty(vcpu, table_gfn); walker->ptes[level] = pte; } return 0; @@ -338,7 +338,7 @@ retry_walk: real_gfn = gpa_to_gfn(real_gfn); - host_addr = gfn_to_hva_prot(vcpu->kvm, real_gfn, + host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, real_gfn, &walker->pte_writable[walker->level - 1]); if (unlikely(kvm_is_error_hva(host_addr))) goto error; @@ -511,11 +511,11 @@ static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, base_gpa = pte_gpa & ~mask; index = (pte_gpa - base_gpa) / sizeof(pt_element_t); - r = kvm_read_guest_atomic(vcpu->kvm, base_gpa, + r = kvm_vcpu_read_guest_atomic(vcpu, base_gpa, gw->prefetch_ptes, sizeof(gw->prefetch_ptes)); curr_pte = gw->prefetch_ptes[index]; } else - r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, + r = kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &curr_pte, sizeof(curr_pte)); return r || curr_pte != gw->ptes[level - 1]; @@ -869,8 +869,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) if (!rmap_can_add(vcpu)) break; - if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, - sizeof(pt_element_t))) + if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, + sizeof(pt_element_t))) break; FNAME(update_pte)(vcpu, sp, sptep, &gpte); @@ -956,8 +956,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); - if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, - sizeof(pt_element_t))) + if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, + sizeof(pt_element_t))) return -EINVAL; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { @@ -970,7 +970,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) pte_access &= FNAME(gpte_access)(vcpu, gpte); FNAME(protect_clean_gpte)(&pte_access, gpte); - if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access, + if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access, &nr_present)) continue; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8954d7a07703..680753186489 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1955,8 +1955,8 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) u64 pdpte; int ret; - ret = kvm_read_guest_page(vcpu->kvm, gpa_to_gfn(cr3), &pdpte, - offset_in_page(cr3) + index * 8, 8); + ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte, + offset_in_page(cr3) + index * 8, 8); if (ret) return 0; return pdpte; @@ -2114,7 +2114,7 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page) might_sleep(); - page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); + page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT); if (is_error_page(page)) goto error; @@ -2153,7 +2153,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm) mask = (0xf >> (4 - size)) << start_bit; val = 0; - if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, iopm_len)) + if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len)) return NESTED_EXIT_DONE; return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; @@ -2178,7 +2178,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) /* Offset is in 32 bit units but need in 8 bit units */ offset *= 4; - if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4)) + if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4)) return NESTED_EXIT_DONE; return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; @@ -2449,7 +2449,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) p = msrpm_offsets[i]; offset = svm->nested.vmcb_msrpm + (p * 4); - if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4)) + if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4)) return false; svm->nested.msrpm[p] = svm->msrpm[p] | value; @@ -3069,42 +3069,42 @@ static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) svm_scale_tsc(vcpu, host_tsc); } -static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) +static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_svm *svm = to_svm(vcpu); - switch (ecx) { + switch (msr_info->index) { case MSR_IA32_TSC: { - *data = svm->vmcb->control.tsc_offset + + msr_info->data = svm->vmcb->control.tsc_offset + svm_scale_tsc(vcpu, native_read_tsc()); break; } case MSR_STAR: - *data = svm->vmcb->save.star; + msr_info->data = svm->vmcb->save.star; break; #ifdef CONFIG_X86_64 case MSR_LSTAR: - *data = svm->vmcb->save.lstar; + msr_info->data = svm->vmcb->save.lstar; break; case MSR_CSTAR: - *data = svm->vmcb->save.cstar; + msr_info->data = svm->vmcb->save.cstar; break; case MSR_KERNEL_GS_BASE: - *data = svm->vmcb->save.kernel_gs_base; + msr_info->data = svm->vmcb->save.kernel_gs_base; break; case MSR_SYSCALL_MASK: - *data = svm->vmcb->save.sfmask; + msr_info->data = svm->vmcb->save.sfmask; break; #endif case MSR_IA32_SYSENTER_CS: - *data = svm->vmcb->save.sysenter_cs; + msr_info->data = svm->vmcb->save.sysenter_cs; break; case MSR_IA32_SYSENTER_EIP: - *data = svm->sysenter_eip; + msr_info->data = svm->sysenter_eip; break; case MSR_IA32_SYSENTER_ESP: - *data = svm->sysenter_esp; + msr_info->data = svm->sysenter_esp; break; /* * Nobody will change the following 5 values in the VMCB so we can @@ -3112,31 +3112,31 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) * implemented. */ case MSR_IA32_DEBUGCTLMSR: - *data = svm->vmcb->save.dbgctl; + msr_info->data = svm->vmcb->save.dbgctl; break; case MSR_IA32_LASTBRANCHFROMIP: - *data = svm->vmcb->save.br_from; + msr_info->data = svm->vmcb->save.br_from; break; case MSR_IA32_LASTBRANCHTOIP: - *data = svm->vmcb->save.br_to; + msr_info->data = svm->vmcb->save.br_to; break; case MSR_IA32_LASTINTFROMIP: - *data = svm->vmcb->save.last_excp_from; + msr_info->data = svm->vmcb->save.last_excp_from; break; case MSR_IA32_LASTINTTOIP: - *data = svm->vmcb->save.last_excp_to; + msr_info->data = svm->vmcb->save.last_excp_to; break; case MSR_VM_HSAVE_PA: - *data = svm->nested.hsave_msr; + msr_info->data = svm->nested.hsave_msr; break; case MSR_VM_CR: - *data = svm->nested.vm_cr_msr; + msr_info->data = svm->nested.vm_cr_msr; break; case MSR_IA32_UCODE_REV: - *data = 0x01000065; + msr_info->data = 0x01000065; break; default: - return kvm_get_msr_common(vcpu, ecx, data); + return kvm_get_msr_common(vcpu, msr_info); } return 0; } @@ -3144,16 +3144,20 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) static int rdmsr_interception(struct vcpu_svm *svm) { u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); - u64 data; + struct msr_data msr_info; - if (svm_get_msr(&svm->vcpu, ecx, &data)) { + msr_info.index = ecx; + msr_info.host_initiated = false; + if (svm_get_msr(&svm->vcpu, &msr_info)) { trace_kvm_msr_read_ex(ecx); kvm_inject_gp(&svm->vcpu, 0); } else { - trace_kvm_msr_read(ecx, data); + trace_kvm_msr_read(ecx, msr_info.data); - kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, data & 0xffffffff); - kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, data >> 32); + kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, + msr_info.data & 0xffffffff); + kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, + msr_info.data >> 32); svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; skip_emulated_instruction(&svm->vcpu); } @@ -3390,6 +3394,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_MWAIT] = mwait_interception, [SVM_EXIT_XSETBV] = xsetbv_interception, [SVM_EXIT_NPF] = pf_interception, + [SVM_EXIT_RSM] = emulate_on_interception, }; static void dump_vmcb(struct kvm_vcpu *vcpu) @@ -4075,6 +4080,11 @@ static bool svm_cpu_has_accelerated_tpr(void) return false; } +static bool svm_has_high_real_mode_segbase(void) +{ + return true; +} + static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { return 0; @@ -4348,6 +4358,7 @@ static struct kvm_x86_ops svm_x86_ops = { .hardware_enable = svm_hardware_enable, .hardware_disable = svm_hardware_disable, .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr, + .cpu_has_high_real_mode_segbase = svm_has_high_real_mode_segbase, .vcpu_create = svm_create_vcpu, .vcpu_free = svm_free_vcpu, @@ -4383,6 +4394,7 @@ static struct kvm_x86_ops svm_x86_ops = { .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, + .fpu_activate = svm_fpu_activate, .fpu_deactivate = svm_fpu_deactivate, .tlb_flush = svm_flush_tlb, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 7c7bc8bef21f..4eae7c35ddf5 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -952,6 +952,28 @@ TRACE_EVENT(kvm_wait_lapic_expire, __entry->delta < 0 ? "early" : "late") ); +TRACE_EVENT(kvm_enter_smm, + TP_PROTO(unsigned int vcpu_id, u64 smbase, bool entering), + TP_ARGS(vcpu_id, smbase, entering), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( u64, smbase ) + __field( bool, entering ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->smbase = smbase; + __entry->entering = entering; + ), + + TP_printk("vcpu %u: %s SMM, smbase 0x%llx", + __entry->vcpu_id, + __entry->entering ? "entering" : "leaving", + __entry->smbase) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bcb61b024386..06a186b07631 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -786,7 +786,7 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) { - struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT); + struct page *page = kvm_vcpu_gfn_to_page(vcpu, addr >> PAGE_SHIFT); if (is_error_page(page)) return NULL; @@ -2622,76 +2622,69 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - u64 data; struct shared_msr_entry *msr; - if (!pdata) { - printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); - return -EINVAL; - } - - switch (msr_index) { + switch (msr_info->index) { #ifdef CONFIG_X86_64 case MSR_FS_BASE: - data = vmcs_readl(GUEST_FS_BASE); + msr_info->data = vmcs_readl(GUEST_FS_BASE); break; case MSR_GS_BASE: - data = vmcs_readl(GUEST_GS_BASE); + msr_info->data = vmcs_readl(GUEST_GS_BASE); break; case MSR_KERNEL_GS_BASE: vmx_load_host_state(to_vmx(vcpu)); - data = to_vmx(vcpu)->msr_guest_kernel_gs_base; + msr_info->data = to_vmx(vcpu)->msr_guest_kernel_gs_base; break; #endif case MSR_EFER: - return kvm_get_msr_common(vcpu, msr_index, pdata); + return kvm_get_msr_common(vcpu, msr_info); case MSR_IA32_TSC: - data = guest_read_tsc(); + msr_info->data = guest_read_tsc(); break; case MSR_IA32_SYSENTER_CS: - data = vmcs_read32(GUEST_SYSENTER_CS); + msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); break; case MSR_IA32_SYSENTER_EIP: - data = vmcs_readl(GUEST_SYSENTER_EIP); + msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); break; case MSR_IA32_SYSENTER_ESP: - data = vmcs_readl(GUEST_SYSENTER_ESP); + msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); break; case MSR_IA32_BNDCFGS: if (!vmx_mpx_supported()) return 1; - data = vmcs_read64(GUEST_BNDCFGS); + msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; case MSR_IA32_FEATURE_CONTROL: if (!nested_vmx_allowed(vcpu)) return 1; - data = to_vmx(vcpu)->nested.msr_ia32_feature_control; + msr_info->data = to_vmx(vcpu)->nested.msr_ia32_feature_control; break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: if (!nested_vmx_allowed(vcpu)) return 1; - return vmx_get_vmx_msr(vcpu, msr_index, pdata); + return vmx_get_vmx_msr(vcpu, msr_info->index, &msr_info->data); case MSR_IA32_XSS: if (!vmx_xsaves_supported()) return 1; - data = vcpu->arch.ia32_xss; + msr_info->data = vcpu->arch.ia32_xss; break; case MSR_TSC_AUX: if (!to_vmx(vcpu)->rdtscp_enabled) return 1; /* Otherwise falls through */ default: - msr = find_msr_entry(to_vmx(vcpu), msr_index); + msr = find_msr_entry(to_vmx(vcpu), msr_info->index); if (msr) { - data = msr->data; + msr_info->data = msr->data; break; } - return kvm_get_msr_common(vcpu, msr_index, pdata); + return kvm_get_msr_common(vcpu, msr_info); } - *pdata = data; return 0; } @@ -4122,7 +4115,7 @@ static int alloc_apic_access_page(struct kvm *kvm) kvm_userspace_mem.flags = 0; kvm_userspace_mem.guest_phys_addr = APIC_DEFAULT_PHYS_BASE; kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); + r = __x86_set_memory_region(kvm, &kvm_userspace_mem); if (r) goto out; @@ -4157,7 +4150,7 @@ static int alloc_identity_pagetable(struct kvm *kvm) kvm_userspace_mem.guest_phys_addr = kvm->arch.ept_identity_map_addr; kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); + r = __x86_set_memory_region(kvm, &kvm_userspace_mem); return r; } @@ -4963,7 +4956,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) .flags = 0, }; - ret = kvm_set_memory_region(kvm, &tss_mem); + ret = x86_set_memory_region(kvm, &tss_mem); if (ret) return ret; kvm->arch.tss_addr = addr; @@ -5473,19 +5466,21 @@ static int handle_cpuid(struct kvm_vcpu *vcpu) static int handle_rdmsr(struct kvm_vcpu *vcpu) { u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; - u64 data; + struct msr_data msr_info; - if (vmx_get_msr(vcpu, ecx, &data)) { + msr_info.index = ecx; + msr_info.host_initiated = false; + if (vmx_get_msr(vcpu, &msr_info)) { trace_kvm_msr_read_ex(ecx); kvm_inject_gp(vcpu, 0); return 1; } - trace_kvm_msr_read(ecx, data); + trace_kvm_msr_read(ecx, msr_info.data); /* FIXME: handling of bits 32:63 of rax, rdx */ - vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; - vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u; + vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u; + vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u; skip_emulated_instruction(vcpu); return 1; } @@ -7328,7 +7323,7 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, bitmap += (port & 0x7fff) / 8; if (last_bitmap != bitmap) - if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1)) + if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) return true; if (b & (1 << (port & 7))) return true; @@ -7372,7 +7367,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, /* Then read the msr_index'th bit from this bitmap: */ if (msr_index < 1024*8) { unsigned char b; - if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1)) + if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) return true; return 1 & (b >> (msr_index & 7)); } else @@ -7637,9 +7632,9 @@ static void vmx_disable_pml(struct vcpu_vmx *vmx) vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } -static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx) +static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) { - struct kvm *kvm = vmx->vcpu.kvm; + struct vcpu_vmx *vmx = to_vmx(vcpu); u64 *pml_buf; u16 pml_idx; @@ -7661,7 +7656,7 @@ static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx) gpa = pml_buf[pml_idx]; WARN_ON(gpa & (PAGE_SIZE - 1)); - mark_page_dirty(kvm, gpa >> PAGE_SHIFT); + kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); } /* reset PML index */ @@ -7856,7 +7851,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) * flushed already. */ if (enable_pml) - vmx_flush_pml_buffer(vmx); + vmx_flush_pml_buffer(vcpu); /* If guest state is invalid, start emulating */ if (vmx->emulation_required) @@ -8144,6 +8139,11 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) local_irq_enable(); } +static bool vmx_has_high_real_mode_segbase(void) +{ + return enable_unrestricted_guest || emulate_invalid_guest_state; +} + static bool vmx_mpx_supported(void) { return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && @@ -9114,8 +9114,8 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) msr.host_initiated = false; for (i = 0; i < count; i++) { - if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e), - &e, sizeof(e))) { + if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), + &e, sizeof(e))) { pr_warn_ratelimited( "%s cannot read MSR entry (%u, 0x%08llx)\n", __func__, i, gpa + i * sizeof(e)); @@ -9147,9 +9147,10 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) struct vmx_msr_entry e; for (i = 0; i < count; i++) { - if (kvm_read_guest(vcpu->kvm, - gpa + i * sizeof(e), - &e, 2 * sizeof(u32))) { + struct msr_data msr_info; + if (kvm_vcpu_read_guest(vcpu, + gpa + i * sizeof(e), + &e, 2 * sizeof(u32))) { pr_warn_ratelimited( "%s cannot read MSR entry (%u, 0x%08llx)\n", __func__, i, gpa + i * sizeof(e)); @@ -9161,19 +9162,21 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) __func__, i, e.index, e.reserved); return -EINVAL; } - if (kvm_get_msr(vcpu, e.index, &e.value)) { + msr_info.host_initiated = false; + msr_info.index = e.index; + if (kvm_get_msr(vcpu, &msr_info)) { pr_warn_ratelimited( "%s cannot read MSR (%u, 0x%x)\n", __func__, i, e.index); return -EINVAL; } - if (kvm_write_guest(vcpu->kvm, - gpa + i * sizeof(e) + - offsetof(struct vmx_msr_entry, value), - &e.value, sizeof(e.value))) { + if (kvm_vcpu_write_guest(vcpu, + gpa + i * sizeof(e) + + offsetof(struct vmx_msr_entry, value), + &msr_info.data, sizeof(msr_info.data))) { pr_warn_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", - __func__, i, e.index, e.value); + __func__, i, e.index, msr_info.data); return -EINVAL; } } @@ -10298,6 +10301,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .hardware_enable = hardware_enable, .hardware_disable = hardware_disable, .cpu_has_accelerated_tpr = report_flexpriority, + .cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase, .vcpu_create = vmx_create_vcpu, .vcpu_free = vmx_free_vcpu, @@ -10333,6 +10337,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, + .fpu_activate = vmx_fpu_activate, .fpu_deactivate = vmx_fpu_deactivate, .tlb_flush = vmx_flush_tlb, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 457b908244f2..43f0df7ddc9c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -99,6 +99,9 @@ module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); unsigned int min_timer_period_us = 500; module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); +static bool __read_mostly kvmclock_periodic_sync = true; +module_param(kvmclock_periodic_sync, bool, S_IRUGO); + bool kvm_has_tsc_control; EXPORT_SYMBOL_GPL(kvm_has_tsc_control); u32 kvm_max_guest_tsc_khz; @@ -475,7 +478,7 @@ EXPORT_SYMBOL_GPL(kvm_require_dr); /* * This function will be used to read from the physical memory of the currently - * running guest. The difference to kvm_read_guest_page is that this function + * running guest. The difference to kvm_vcpu_read_guest_page is that this function * can read from guest physical or from the guest's guest physical memory. */ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, @@ -493,7 +496,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, real_gfn = gpa_to_gfn(real_gfn); - return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len); + return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len); } EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); @@ -922,17 +925,11 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc); * * This list is modified at module load time to reflect the * capabilities of the host cpu. This capabilities test skips MSRs that are - * kvm-specific. Those are put in the beginning of the list. + * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs + * may depend on host virtualization features rather than host cpu features. */ -#define KVM_SAVE_MSRS_BEGIN 12 static u32 msrs_to_save[] = { - MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, - MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, - HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, - HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, - HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, - MSR_KVM_PV_EOI_EN, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 @@ -944,14 +941,24 @@ static u32 msrs_to_save[] = { static unsigned num_msrs_to_save; -static const u32 emulated_msrs[] = { +static u32 emulated_msrs[] = { + MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, + MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, + HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, + HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, + HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, + MSR_KVM_PV_EOI_EN, + MSR_IA32_TSC_ADJUST, MSR_IA32_TSCDEADLINE, MSR_IA32_MISC_ENABLE, MSR_IA32_MCG_STATUS, MSR_IA32_MCG_CTL, + MSR_IA32_SMBASE, }; +static unsigned num_emulated_msrs; + bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) { if (efer & efer_reserved_bits) @@ -1045,6 +1052,21 @@ EXPORT_SYMBOL_GPL(kvm_set_msr); /* * Adapt set_msr() to msr_io()'s calling convention */ +static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +{ + struct msr_data msr; + int r; + + msr.index = index; + msr.host_initiated = true; + r = kvm_get_msr(vcpu, &msr); + if (r) + return r; + + *data = msr.data; + return 0; +} + static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { struct msr_data msr; @@ -1697,6 +1719,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->pvclock_set_guest_stopped_request = false; } + pvclock_flags |= PVCLOCK_COUNTS_FROM_ZERO; + /* If the host uses TSC clocksource, then it is stable */ if (use_master_clock) pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; @@ -1767,6 +1791,9 @@ static void kvmclock_sync_fn(struct work_struct *work) kvmclock_sync_work); struct kvm *kvm = container_of(ka, struct kvm, arch); + if (!kvmclock_periodic_sync) + return; + schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); schedule_delayed_work(&kvm->arch.kvmclock_sync_work, KVMCLOCK_SYNC_PERIOD); @@ -2003,7 +2030,7 @@ static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) r = PTR_ERR(page); goto out; } - if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE)) + if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) goto out_free; r = 0; out_free: @@ -2103,13 +2130,13 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) break; } gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT; - addr = gfn_to_hva(vcpu->kvm, gfn); + addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); if (kvm_is_error_hva(addr)) return 1; if (__clear_user((void __user *)addr, PAGE_SIZE)) return 1; vcpu->arch.hv_vapic = data; - mark_page_dirty(vcpu->kvm, gfn); + kvm_vcpu_mark_page_dirty(vcpu, gfn); if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED)) return 1; break; @@ -2256,6 +2283,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MISC_ENABLE: vcpu->arch.ia32_misc_enable_msr = data; break; + case MSR_IA32_SMBASE: + if (!msr_info->host_initiated) + return 1; + vcpu->arch.smbase = data; + break; case MSR_KVM_WALL_CLOCK_NEW: case MSR_KVM_WALL_CLOCK: vcpu->kvm->arch.wall_clock = data; @@ -2276,6 +2308,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) &vcpu->requests); ka->boot_vcpu_runs_old_kvmclock = tmp; + + ka->kvmclock_offset = -get_kernel_ns(); } vcpu->arch.time = data; @@ -2435,9 +2469,9 @@ EXPORT_SYMBOL_GPL(kvm_set_msr_common); * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { - return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); + return kvm_x86_ops->get_msr(vcpu, msr); } EXPORT_SYMBOL_GPL(kvm_get_msr); @@ -2574,11 +2608,11 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 0; } -int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u64 data; - switch (msr) { + switch (msr_info->index) { case MSR_IA32_PLATFORM_ID: case MSR_IA32_EBL_CR_POWERON: case MSR_IA32_DEBUGCTLMSR: @@ -2601,26 +2635,26 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_AMD64_NB_CFG: case MSR_FAM10H_MMIO_CONF_BASE: case MSR_AMD64_BU_CFG2: - data = 0; + msr_info->data = 0; break; case MSR_P6_PERFCTR0: case MSR_P6_PERFCTR1: case MSR_P6_EVNTSEL0: case MSR_P6_EVNTSEL1: - if (kvm_pmu_msr(vcpu, msr)) - return kvm_pmu_get_msr(vcpu, msr, pdata); - data = 0; + if (kvm_pmu_msr(vcpu, msr_info->index)) + return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); + msr_info->data = 0; break; case MSR_IA32_UCODE_REV: - data = 0x100000000ULL; + msr_info->data = 0x100000000ULL; break; case MSR_MTRRcap: - data = 0x500 | KVM_NR_VAR_MTRR; + msr_info->data = 0x500 | KVM_NR_VAR_MTRR; break; case 0x200 ... 0x2ff: - return get_msr_mtrr(vcpu, msr, pdata); + return get_msr_mtrr(vcpu, msr_info->index, &msr_info->data); case 0xcd: /* fsb frequency */ - data = 3; + msr_info->data = 3; break; /* * MSR_EBC_FREQUENCY_ID @@ -2634,48 +2668,53 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) * multiplying by zero otherwise. */ case MSR_EBC_FREQUENCY_ID: - data = 1 << 24; + msr_info->data = 1 << 24; break; case MSR_IA32_APICBASE: - data = kvm_get_apic_base(vcpu); + msr_info->data = kvm_get_apic_base(vcpu); break; case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: - return kvm_x2apic_msr_read(vcpu, msr, pdata); + return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); break; case MSR_IA32_TSCDEADLINE: - data = kvm_get_lapic_tscdeadline_msr(vcpu); + msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu); break; case MSR_IA32_TSC_ADJUST: - data = (u64)vcpu->arch.ia32_tsc_adjust_msr; + msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr; break; case MSR_IA32_MISC_ENABLE: - data = vcpu->arch.ia32_misc_enable_msr; + msr_info->data = vcpu->arch.ia32_misc_enable_msr; + break; + case MSR_IA32_SMBASE: + if (!msr_info->host_initiated) + return 1; + msr_info->data = vcpu->arch.smbase; break; case MSR_IA32_PERF_STATUS: /* TSC increment by tick */ - data = 1000ULL; + msr_info->data = 1000ULL; /* CPU multiplier */ data |= (((uint64_t)4ULL) << 40); break; case MSR_EFER: - data = vcpu->arch.efer; + msr_info->data = vcpu->arch.efer; break; case MSR_KVM_WALL_CLOCK: case MSR_KVM_WALL_CLOCK_NEW: - data = vcpu->kvm->arch.wall_clock; + msr_info->data = vcpu->kvm->arch.wall_clock; break; case MSR_KVM_SYSTEM_TIME: case MSR_KVM_SYSTEM_TIME_NEW: - data = vcpu->arch.time; + msr_info->data = vcpu->arch.time; break; case MSR_KVM_ASYNC_PF_EN: - data = vcpu->arch.apf.msr_val; + msr_info->data = vcpu->arch.apf.msr_val; break; case MSR_KVM_STEAL_TIME: - data = vcpu->arch.st.msr_val; + msr_info->data = vcpu->arch.st.msr_val; break; case MSR_KVM_PV_EOI_EN: - data = vcpu->arch.pv_eoi.msr_val; + msr_info->data = vcpu->arch.pv_eoi.msr_val; break; case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: @@ -2683,7 +2722,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: - return get_msr_mce(vcpu, msr, pdata); + return get_msr_mce(vcpu, msr_info->index, &msr_info->data); case MSR_K7_CLK_CTL: /* * Provide expected ramp-up count for K7. All other @@ -2694,17 +2733,17 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) * type 6, model 8 and higher from exploding due to * the rdmsr failing. */ - data = 0x20000000; + msr_info->data = 0x20000000; break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: - if (kvm_hv_msr_partition_wide(msr)) { + if (kvm_hv_msr_partition_wide(msr_info->index)) { int r; mutex_lock(&vcpu->kvm->lock); - r = get_msr_hyperv_pw(vcpu, msr, pdata); + r = get_msr_hyperv_pw(vcpu, msr_info->index, &msr_info->data); mutex_unlock(&vcpu->kvm->lock); return r; } else - return get_msr_hyperv(vcpu, msr, pdata); + return get_msr_hyperv(vcpu, msr_info->index, &msr_info->data); break; case MSR_IA32_BBL_CR_CTL3: /* This legacy MSR exists but isn't fully documented in current @@ -2717,31 +2756,30 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) * L2 cache control register 3: 64GB range, 256KB size, * enabled, latency 0x1, configured */ - data = 0xbe702111; + msr_info->data = 0xbe702111; break; case MSR_AMD64_OSVW_ID_LENGTH: if (!guest_cpuid_has_osvw(vcpu)) return 1; - data = vcpu->arch.osvw.length; + msr_info->data = vcpu->arch.osvw.length; break; case MSR_AMD64_OSVW_STATUS: if (!guest_cpuid_has_osvw(vcpu)) return 1; - data = vcpu->arch.osvw.status; + msr_info->data = vcpu->arch.osvw.status; break; default: - if (kvm_pmu_msr(vcpu, msr)) - return kvm_pmu_get_msr(vcpu, msr, pdata); + if (kvm_pmu_msr(vcpu, msr_info->index)) + return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); if (!ignore_msrs) { - vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); + vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr_info->index); return 1; } else { - vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); - data = 0; + vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr_info->index); + msr_info->data = 0; } break; } - *pdata = data; return 0; } EXPORT_SYMBOL_GPL(kvm_get_msr_common); @@ -2862,6 +2900,17 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) #endif r = 1; break; + case KVM_CAP_X86_SMM: + /* SMBASE is usually relocated above 1M on modern chipsets, + * and SMM handlers might indeed rely on 4G segment limits, + * so do not report SMM to be available if real mode is + * emulated via vm86 mode. Still, do not go to great lengths + * to avoid userspace's usage of the feature, because it is a + * fringe case that is not enabled except via specific settings + * of the module parameters. + */ + r = kvm_x86_ops->cpu_has_high_real_mode_segbase(); + break; case KVM_CAP_COALESCED_MMIO: r = KVM_COALESCED_MMIO_PAGE_OFFSET; break; @@ -2918,7 +2967,7 @@ long kvm_arch_dev_ioctl(struct file *filp, if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) goto out; n = msr_list.nmsrs; - msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); + msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs; if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) goto out; r = -E2BIG; @@ -2930,7 +2979,7 @@ long kvm_arch_dev_ioctl(struct file *filp, goto out; if (copy_to_user(user_msr_list->indices + num_msrs_to_save, &emulated_msrs, - ARRAY_SIZE(emulated_msrs) * sizeof(u32))) + num_emulated_msrs * sizeof(u32))) goto out; r = 0; break; @@ -3074,6 +3123,13 @@ static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) return 0; } +static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu) +{ + kvm_make_request(KVM_REQ_SMI, vcpu); + + return 0; +} + static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, struct kvm_tpr_access_ctl *tac) { @@ -3179,8 +3235,15 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->sipi_vector = 0; /* never valid when reporting to user space */ + events->smi.smm = is_smm(vcpu); + events->smi.pending = vcpu->arch.smi_pending; + events->smi.smm_inside_nmi = + !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK); + events->smi.latched_init = kvm_lapic_latched_init(vcpu); + events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING - | KVM_VCPUEVENT_VALID_SHADOW); + | KVM_VCPUEVENT_VALID_SHADOW + | KVM_VCPUEVENT_VALID_SMM); memset(&events->reserved, 0, sizeof(events->reserved)); } @@ -3189,7 +3252,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, { if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR - | KVM_VCPUEVENT_VALID_SHADOW)) + | KVM_VCPUEVENT_VALID_SHADOW + | KVM_VCPUEVENT_VALID_SMM)) return -EINVAL; process_nmi(vcpu); @@ -3214,6 +3278,24 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, kvm_vcpu_has_lapic(vcpu)) vcpu->arch.apic->sipi_vector = events->sipi_vector; + if (events->flags & KVM_VCPUEVENT_VALID_SMM) { + if (events->smi.smm) + vcpu->arch.hflags |= HF_SMM_MASK; + else + vcpu->arch.hflags &= ~HF_SMM_MASK; + vcpu->arch.smi_pending = events->smi.pending; + if (events->smi.smm_inside_nmi) + vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; + else + vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK; + if (kvm_vcpu_has_lapic(vcpu)) { + if (events->smi.latched_init) + set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + else + clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + } + } + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; @@ -3473,6 +3555,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_nmi(vcpu); break; } + case KVM_SMI: { + r = kvm_vcpu_ioctl_smi(vcpu); + break; + } case KVM_SET_CPUID: { struct kvm_cpuid __user *cpuid_arg = argp; struct kvm_cpuid cpuid; @@ -3512,7 +3598,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_MSRS: - r = msr_io(vcpu, argp, kvm_get_msr, 1); + r = msr_io(vcpu, argp, do_get_msr, 1); break; case KVM_SET_MSRS: r = msr_io(vcpu, argp, do_set_msr, 0); @@ -4196,8 +4282,7 @@ static void kvm_init_msr_list(void) u32 dummy[2]; unsigned i, j; - /* skip the first msrs in the list. KVM-specific */ - for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { + for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) continue; @@ -4222,6 +4307,22 @@ static void kvm_init_msr_list(void) j++; } num_msrs_to_save = j; + + for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) { + switch (emulated_msrs[i]) { + case MSR_IA32_SMBASE: + if (!kvm_x86_ops->cpu_has_high_real_mode_segbase()) + continue; + break; + default: + break; + } + + if (j < i) + emulated_msrs[j] = emulated_msrs[i]; + j++; + } + num_emulated_msrs = j; } static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, @@ -4339,8 +4440,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; - ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, data, - offset, toread); + ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data, + offset, toread); if (ret < 0) { r = X86EMUL_IO_NEEDED; goto out; @@ -4373,8 +4474,8 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, offset = addr & (PAGE_SIZE-1); if (WARN_ON(offset + bytes > PAGE_SIZE)) bytes = (unsigned)PAGE_SIZE - offset; - ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, val, - offset, bytes); + ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val, + offset, bytes); if (unlikely(ret < 0)) return X86EMUL_IO_NEEDED; @@ -4420,7 +4521,7 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; - ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); + ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite); if (ret < 0) { r = X86EMUL_IO_NEEDED; goto out; @@ -4473,7 +4574,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, { int ret; - ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); + ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes); if (ret < 0) return 0; kvm_mmu_pte_write(vcpu, gpa, val, bytes); @@ -4507,7 +4608,7 @@ static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, void *val, int bytes) { - return !kvm_read_guest(vcpu->kvm, gpa, val, bytes); + return !kvm_vcpu_read_guest(vcpu, gpa, val, bytes); } static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, @@ -4705,7 +4806,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) goto emul_write; - page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT); if (is_error_page(page)) goto emul_write; @@ -4733,7 +4834,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, if (!exchanged) return X86EMUL_CMPXCHG_FAILED; - mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); + kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); kvm_mmu_pte_write(vcpu, gpa, new, bytes); return X86EMUL_CONTINUE; @@ -5032,7 +5133,17 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata) { - return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); + struct msr_data msr; + int r; + + msr.index = msr_index; + msr.host_initiated = false; + r = kvm_get_msr(emul_to_vcpu(ctxt), &msr); + if (r) + return r; + + *pdata = msr.data; + return 0; } static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, @@ -5046,6 +5157,20 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, return kvm_set_msr(emul_to_vcpu(ctxt), &msr); } +static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt) +{ + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + + return vcpu->arch.smbase; +} + +static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase) +{ + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + + vcpu->arch.smbase = smbase; +} + static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc) { @@ -5131,6 +5256,8 @@ static const struct x86_emulate_ops emulate_ops = { .cpl = emulator_get_cpl, .get_dr = emulator_get_dr, .set_dr = emulator_set_dr, + .get_smbase = emulator_get_smbase, + .set_smbase = emulator_set_smbase, .set_msr = emulator_set_msr, .get_msr = emulator_get_msr, .check_pmc = emulator_check_pmc, @@ -5192,7 +5319,10 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - ctxt->guest_mode = is_guest_mode(vcpu); + BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK); + BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); + BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK); + ctxt->emul_flags = vcpu->arch.hflags; init_decode_cache(ctxt); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; @@ -5361,6 +5491,34 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, static int complete_emulated_mmio(struct kvm_vcpu *vcpu); static int complete_emulated_pio(struct kvm_vcpu *vcpu); +static void kvm_smm_changed(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->arch.hflags & HF_SMM_MASK)) { + /* This is a good place to trace that we are exiting SMM. */ + trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false); + + if (unlikely(vcpu->arch.smi_pending)) { + kvm_make_request(KVM_REQ_SMI, vcpu); + vcpu->arch.smi_pending = 0; + } else { + /* Process a latched INIT, if any. */ + kvm_make_request(KVM_REQ_EVENT, vcpu); + } + } + + kvm_mmu_reset_context(vcpu); +} + +static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags) +{ + unsigned changed = vcpu->arch.hflags ^ emul_flags; + + vcpu->arch.hflags = emul_flags; + + if (changed & HF_SMM_MASK) + kvm_smm_changed(vcpu); +} + static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, unsigned long *db) { @@ -5560,6 +5718,8 @@ restart: unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; + if (vcpu->arch.hflags != ctxt->emul_flags) + kvm_set_hflags(vcpu, ctxt->emul_flags); kvm_rip_write(vcpu, ctxt->eip); if (r == EMULATE_DONE) kvm_vcpu_check_singlestep(vcpu, rflags, &r); @@ -6126,6 +6286,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) struct kvm_run *kvm_run = vcpu->run; kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; + kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0; kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); if (irqchip_in_kernel(vcpu->kvm)) @@ -6249,6 +6410,233 @@ static void process_nmi(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); } +#define put_smstate(type, buf, offset, val) \ + *(type *)((buf) + (offset) - 0x7e00) = val + +static u32 process_smi_get_segment_flags(struct kvm_segment *seg) +{ + u32 flags = 0; + flags |= seg->g << 23; + flags |= seg->db << 22; + flags |= seg->l << 21; + flags |= seg->avl << 20; + flags |= seg->present << 15; + flags |= seg->dpl << 13; + flags |= seg->s << 12; + flags |= seg->type << 8; + return flags; +} + +static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n) +{ + struct kvm_segment seg; + int offset; + + kvm_get_segment(vcpu, &seg, n); + put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector); + + if (n < 3) + offset = 0x7f84 + n * 12; + else + offset = 0x7f2c + (n - 3) * 12; + + put_smstate(u32, buf, offset + 8, seg.base); + put_smstate(u32, buf, offset + 4, seg.limit); + put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg)); +} + +static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) +{ + struct kvm_segment seg; + int offset; + u16 flags; + + kvm_get_segment(vcpu, &seg, n); + offset = 0x7e00 + n * 16; + + flags = process_smi_get_segment_flags(&seg) >> 8; + put_smstate(u16, buf, offset, seg.selector); + put_smstate(u16, buf, offset + 2, flags); + put_smstate(u32, buf, offset + 4, seg.limit); + put_smstate(u64, buf, offset + 8, seg.base); +} + +static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf) +{ + struct desc_ptr dt; + struct kvm_segment seg; + unsigned long val; + int i; + + put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu)); + put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu)); + put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu)); + put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu)); + + for (i = 0; i < 8; i++) + put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read(vcpu, i)); + + kvm_get_dr(vcpu, 6, &val); + put_smstate(u32, buf, 0x7fcc, (u32)val); + kvm_get_dr(vcpu, 7, &val); + put_smstate(u32, buf, 0x7fc8, (u32)val); + + kvm_get_segment(vcpu, &seg, VCPU_SREG_TR); + put_smstate(u32, buf, 0x7fc4, seg.selector); + put_smstate(u32, buf, 0x7f64, seg.base); + put_smstate(u32, buf, 0x7f60, seg.limit); + put_smstate(u32, buf, 0x7f5c, process_smi_get_segment_flags(&seg)); + + kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); + put_smstate(u32, buf, 0x7fc0, seg.selector); + put_smstate(u32, buf, 0x7f80, seg.base); + put_smstate(u32, buf, 0x7f7c, seg.limit); + put_smstate(u32, buf, 0x7f78, process_smi_get_segment_flags(&seg)); + + kvm_x86_ops->get_gdt(vcpu, &dt); + put_smstate(u32, buf, 0x7f74, dt.address); + put_smstate(u32, buf, 0x7f70, dt.size); + + kvm_x86_ops->get_idt(vcpu, &dt); + put_smstate(u32, buf, 0x7f58, dt.address); + put_smstate(u32, buf, 0x7f54, dt.size); + + for (i = 0; i < 6; i++) + process_smi_save_seg_32(vcpu, buf, i); + + put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu)); + + /* revision id */ + put_smstate(u32, buf, 0x7efc, 0x00020000); + put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase); +} + +static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf) +{ +#ifdef CONFIG_X86_64 + struct desc_ptr dt; + struct kvm_segment seg; + unsigned long val; + int i; + + for (i = 0; i < 16; i++) + put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read(vcpu, i)); + + put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu)); + put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu)); + + kvm_get_dr(vcpu, 6, &val); + put_smstate(u64, buf, 0x7f68, val); + kvm_get_dr(vcpu, 7, &val); + put_smstate(u64, buf, 0x7f60, val); + + put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu)); + put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu)); + put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu)); + + put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase); + + /* revision id */ + put_smstate(u32, buf, 0x7efc, 0x00020064); + + put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer); + + kvm_get_segment(vcpu, &seg, VCPU_SREG_TR); + put_smstate(u16, buf, 0x7e90, seg.selector); + put_smstate(u16, buf, 0x7e92, process_smi_get_segment_flags(&seg) >> 8); + put_smstate(u32, buf, 0x7e94, seg.limit); + put_smstate(u64, buf, 0x7e98, seg.base); + + kvm_x86_ops->get_idt(vcpu, &dt); + put_smstate(u32, buf, 0x7e84, dt.size); + put_smstate(u64, buf, 0x7e88, dt.address); + + kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); + put_smstate(u16, buf, 0x7e70, seg.selector); + put_smstate(u16, buf, 0x7e72, process_smi_get_segment_flags(&seg) >> 8); + put_smstate(u32, buf, 0x7e74, seg.limit); + put_smstate(u64, buf, 0x7e78, seg.base); + + kvm_x86_ops->get_gdt(vcpu, &dt); + put_smstate(u32, buf, 0x7e64, dt.size); + put_smstate(u64, buf, 0x7e68, dt.address); + + for (i = 0; i < 6; i++) + process_smi_save_seg_64(vcpu, buf, i); +#else + WARN_ON_ONCE(1); +#endif +} + +static void process_smi(struct kvm_vcpu *vcpu) +{ + struct kvm_segment cs, ds; + char buf[512]; + u32 cr0; + + if (is_smm(vcpu)) { + vcpu->arch.smi_pending = true; + return; + } + + trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true); + vcpu->arch.hflags |= HF_SMM_MASK; + memset(buf, 0, 512); + if (guest_cpuid_has_longmode(vcpu)) + process_smi_save_state_64(vcpu, buf); + else + process_smi_save_state_32(vcpu, buf); + + kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); + + if (kvm_x86_ops->get_nmi_mask(vcpu)) + vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; + else + kvm_x86_ops->set_nmi_mask(vcpu, true); + + kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); + kvm_rip_write(vcpu, 0x8000); + + cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG); + kvm_x86_ops->set_cr0(vcpu, cr0); + vcpu->arch.cr0 = cr0; + + kvm_x86_ops->set_cr4(vcpu, 0); + + __kvm_set_dr(vcpu, 7, DR7_FIXED_1); + + cs.selector = (vcpu->arch.smbase >> 4) & 0xffff; + cs.base = vcpu->arch.smbase; + + ds.selector = 0; + ds.base = 0; + + cs.limit = ds.limit = 0xffffffff; + cs.type = ds.type = 0x3; + cs.dpl = ds.dpl = 0; + cs.db = ds.db = 0; + cs.s = ds.s = 1; + cs.l = ds.l = 0; + cs.g = ds.g = 1; + cs.avl = ds.avl = 0; + cs.present = ds.present = 1; + cs.unusable = ds.unusable = 0; + cs.padding = ds.padding = 0; + + kvm_set_segment(vcpu, &cs, VCPU_SREG_CS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_DS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_ES); + kvm_set_segment(vcpu, &ds, VCPU_SREG_FS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_GS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_SS); + + if (guest_cpuid_has_longmode(vcpu)) + kvm_x86_ops->set_efer(vcpu, 0); + + kvm_update_cpuid(vcpu); + kvm_mmu_reset_context(vcpu); +} + static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { u64 eoi_exit_bitmap[4]; @@ -6282,6 +6670,8 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) return; page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); + if (is_error_page(page)) + return; kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page)); /* @@ -6355,6 +6745,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) record_steal_time(vcpu); + if (kvm_check_request(KVM_REQ_SMI, vcpu)) + process_smi(vcpu); if (kvm_check_request(KVM_REQ_NMI, vcpu)) process_nmi(vcpu); if (kvm_check_request(KVM_REQ_PMU, vcpu)) @@ -7155,7 +7547,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) * Every 255 times fpu_counter rolls over to 0; a guest that uses * the FPU in bursts will revert to loading it on demand. */ - if (!use_eager_fpu()) { + if (!vcpu->arch.eager_fpu) { if (++vcpu->fpu_counter < 5) kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); } @@ -7174,11 +7566,21 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { + struct kvm_vcpu *vcpu; + if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) printk_once(KERN_WARNING "kvm: SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); - return kvm_x86_ops->vcpu_create(kvm, id); + + vcpu = kvm_x86_ops->vcpu_create(kvm, id); + + /* + * Activate fpu unconditionally in case the guest needs eager FPU. It will be + * deactivated soon if it doesn't. + */ + kvm_x86_ops->fpu_activate(vcpu); + return vcpu; } int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) @@ -7209,6 +7611,9 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) kvm_write_tsc(vcpu, &msr); vcpu_put(vcpu); + if (!kvmclock_periodic_sync) + return; + schedule_delayed_work(&kvm->arch.kvmclock_sync_work, KVMCLOCK_SYNC_PERIOD); } @@ -7229,6 +7634,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { + vcpu->arch.hflags = 0; + atomic_set(&vcpu->arch.nmi_queued, 0); vcpu->arch.nmi_pending = 0; vcpu->arch.nmi_injected = false; @@ -7254,8 +7661,10 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; - if (!init_event) + if (!init_event) { kvm_pmu_reset(vcpu); + vcpu->arch.smbase = 0x30000; + } memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); vcpu->arch.regs_avail = ~0; @@ -7571,6 +7980,40 @@ void kvm_arch_sync_events(struct kvm *kvm) kvm_free_pit(kvm); } +int __x86_set_memory_region(struct kvm *kvm, + const struct kvm_userspace_memory_region *mem) +{ + int i, r; + + /* Called with kvm->slots_lock held. */ + BUG_ON(mem->slot >= KVM_MEM_SLOTS_NUM); + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + struct kvm_userspace_memory_region m = *mem; + + m.slot |= i << 16; + r = __kvm_set_memory_region(kvm, &m); + if (r < 0) + return r; + } + + return 0; +} +EXPORT_SYMBOL_GPL(__x86_set_memory_region); + +int x86_set_memory_region(struct kvm *kvm, + const struct kvm_userspace_memory_region *mem) +{ + int r; + + mutex_lock(&kvm->slots_lock); + r = __x86_set_memory_region(kvm, mem); + mutex_unlock(&kvm->slots_lock); + + return r; +} +EXPORT_SYMBOL_GPL(x86_set_memory_region); + void kvm_arch_destroy_vm(struct kvm *kvm) { if (current->mm == kvm->mm) { @@ -7582,13 +8025,13 @@ void kvm_arch_destroy_vm(struct kvm *kvm) struct kvm_userspace_memory_region mem; memset(&mem, 0, sizeof(mem)); mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; - kvm_set_memory_region(kvm, &mem); + x86_set_memory_region(kvm, &mem); mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; - kvm_set_memory_region(kvm, &mem); + x86_set_memory_region(kvm, &mem); mem.slot = TSS_PRIVATE_MEMSLOT; - kvm_set_memory_region(kvm, &mem); + x86_set_memory_region(kvm, &mem); } kvm_iommu_unmap_guest(kvm); kfree(kvm->arch.vpic); @@ -7677,18 +8120,18 @@ out_free: return -ENOMEM; } -void kvm_arch_memslots_updated(struct kvm *kvm) +void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) { /* * memslots->generation has been incremented. * mmio generation may have reached its maximum value. */ - kvm_mmu_invalidate_mmio_sptes(kvm); + kvm_mmu_invalidate_mmio_sptes(kvm, slots); } int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_userspace_memory_region *mem, + const struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { /* @@ -7766,14 +8209,14 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, } void kvm_arch_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, + const struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, + const struct kvm_memory_slot *new, enum kvm_mr_change change) { - struct kvm_memory_slot *new; int nr_mmu_pages = 0; - if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) { + if (change == KVM_MR_DELETE && old->id >= KVM_USER_MEM_SLOTS) { int ret; ret = vm_munmap(old->userspace_addr, @@ -7790,9 +8233,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, if (nr_mmu_pages) kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); - /* It's OK to get 'new' slot here as it has already been installed */ - new = id_to_memslot(kvm->memslots, mem->slot); - /* * Dirty logging tracks sptes in 4k granularity, meaning that large * sptes have to be split. If live migration is successful, the guest @@ -7817,9 +8257,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, * been zapped so no dirty logging staff is needed for old slot. For * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the * new and it's also covered when dealing with the new slot. + * + * FIXME: const-ify all uses of struct kvm_memory_slot. */ if (change != KVM_MR_DELETE) - kvm_mmu_slot_apply_flags(kvm, new); + kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new); } void kvm_arch_flush_shadow_all(struct kvm *kvm) |