diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2017-05-30 13:07:36 +1000 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2017-05-30 13:07:36 +1000 |
commit | d0e9b7c44065a23715d27951578ad180b102d4bd (patch) | |
tree | f216e3fd9fa88c4f44d4730b3798197c70f3d335 | |
parent | 63dc3558e3d736822dbc48ab064945d84ee3d748 (diff) | |
parent | a2bdbb2c7e70a76c8618deb264d235b8d2ed067c (diff) |
Merge remote-tracking branch 'kvms390/next'
-rw-r--r-- | Documentation/virtual/kvm/api.txt | 135 | ||||
-rw-r--r-- | Documentation/virtual/kvm/devices/vm.txt | 33 | ||||
-rw-r--r-- | arch/s390/include/asm/kvm_host.h | 9 | ||||
-rw-r--r-- | arch/s390/include/uapi/asm/kvm.h | 6 | ||||
-rw-r--r-- | arch/s390/kvm/kvm-s390.c | 361 | ||||
-rw-r--r-- | arch/s390/kvm/priv.c | 103 | ||||
-rw-r--r-- | include/uapi/linux/kvm.h | 33 |
7 files changed, 674 insertions, 6 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 4029943887a3..912b7df8215a 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3255,6 +3255,141 @@ Otherwise, if the MCE is a corrected error, KVM will just store it in the corresponding bank (provided this bank is not holding a previously reported uncorrected error). +4.107 KVM_S390_GET_CMMA_BITS + +Capability: KVM_CAP_S390_CMMA_MIGRATION +Architectures: s390 +Type: vm ioctl +Parameters: struct kvm_s390_cmma_log (in, out) +Returns: 0 on success, a negative value on error + +This ioctl is used to get the values of the CMMA bits on the s390 +architecture. It is meant to be used in two scenarios: +- During live migration to save the CMMA values. Live migration needs + to be enabled via the KVM_REQ_START_MIGRATION VM property. +- To non-destructively peek at the CMMA values, with the flag + KVM_S390_CMMA_PEEK set. + +The ioctl takes parameters via the kvm_s390_cmma_log struct. The desired +values are written to a buffer whose location is indicated via the "values" +member in the kvm_s390_cmma_log struct. The values in the input struct are +also updated as needed. +Each CMMA value takes up one byte. + +struct kvm_s390_cmma_log { + __u64 start_gfn; + __u32 count; + __u32 flags; + union { + __u64 remaining; + __u64 mask; + }; + __u64 values; +}; + +start_gfn is the number of the first guest frame whose CMMA values are +to be retrieved, + +count is the length of the buffer in bytes, + +values points to the buffer where the result will be written to. + +If count is greater than KVM_S390_SKEYS_MAX, then it is considered to be +KVM_S390_SKEYS_MAX. KVM_S390_SKEYS_MAX is re-used for consistency with +other ioctls. + +The result is written in the buffer pointed to by the field values, and +the values of the input parameter are updated as follows. + +Depending on the flags, different actions are performed. The only +supported flag so far is KVM_S390_CMMA_PEEK. + +The default behaviour if KVM_S390_CMMA_PEEK is not set is: +start_gfn will indicate the first page frame whose CMMA bits were dirty. +It is not necessarily the same as the one passed as input, as clean pages +are skipped. + +count will indicate the number of bytes actually written in the buffer. +It can (and very often will) be smaller than the input value, since the +buffer is only filled until 16 bytes of clean values are found (which +are then not copied in the buffer). Since a CMMA migration block needs +the base address and the length, for a total of 16 bytes, we will send +back some clean data if there is some dirty data afterwards, as long as +the size of the clean data does not exceed the size of the header. This +allows to minimize the amount of data to be saved or transferred over +the network at the expense of more roundtrips to userspace. The next +invocation of the ioctl will skip over all the clean values, saving +potentially more than just the 16 bytes we found. + +If KVM_S390_CMMA_PEEK is set: +the existing storage attributes are read even when not in migration +mode, and no other action is performed; + +the output start_gfn will be equal to the input start_gfn, + +the output count will be equal to the input count, except if the end of +memory has been reached. + +In both cases: +the field "remaining" will indicate the total number of dirty CMMA values +still remaining, or 0 if KVM_S390_CMMA_PEEK is set and migration mode is +not enabled. + +mask is unused. + +values points to the userspace buffer where the result will be stored. + +This ioctl can fail with -ENOMEM if not enough memory can be allocated to +complete the task, with -ENXIO if CMMA is not enabled, with -EINVAL if +KVM_S390_CMMA_PEEK is not set but migration mode was not enabled, with +-EFAULT if the userspace address is invalid or if no page table is +present for the addresses (e.g. when using hugepages). + +4.108 KVM_S390_SET_CMMA_BITS + +Capability: KVM_CAP_S390_CMMA_MIGRATION +Architectures: s390 +Type: vm ioctl +Parameters: struct kvm_s390_cmma_log (in) +Returns: 0 on success, a negative value on error + +This ioctl is used to set the values of the CMMA bits on the s390 +architecture. It is meant to be used during live migration to restore +the CMMA values, but there are no restrictions on its use. +The ioctl takes parameters via the kvm_s390_cmma_values struct. +Each CMMA value takes up one byte. + +struct kvm_s390_cmma_log { + __u64 start_gfn; + __u32 count; + __u32 flags; + union { + __u64 remaining; + __u64 mask; + }; + __u64 values; +}; + +start_gfn indicates the starting guest frame number, + +count indicates how many values are to be considered in the buffer, + +flags is not used and must be 0. + +mask indicates which PGSTE bits are to be considered. + +remaining is not used. + +values points to the buffer in userspace where to store the values. + +This ioctl can fail with -ENOMEM if not enough memory can be allocated to +complete the task, with -ENXIO if CMMA is not enabled, with -EINVAL if +the count field is too large (e.g. more than KVM_S390_CMMA_SIZE_MAX) or +if the flags field was not 0, with -EFAULT if the userspace address is +invalid, if invalid pages are written to (e.g. after the end of memory) +or if no page table is present for the addresses (e.g. when using +hugepages). + 5. The kvm_run structure ------------------------ diff --git a/Documentation/virtual/kvm/devices/vm.txt b/Documentation/virtual/kvm/devices/vm.txt index 575ccb022aac..903fc926860b 100644 --- a/Documentation/virtual/kvm/devices/vm.txt +++ b/Documentation/virtual/kvm/devices/vm.txt @@ -222,3 +222,36 @@ Allows user space to disable dea key wrapping, clearing the wrapping key. Parameters: none Returns: 0 + +5. GROUP: KVM_S390_VM_MIGRATION +Architectures: s390 + +5.1. ATTRIBUTE: KVM_S390_VM_MIGRATION_STOP (w/o) + +Allows userspace to stop migration mode, needed for PGSTE migration. +Setting this attribute when migration mode is not active will have no +effects. + +Parameters: none +Returns: 0 + +5.2. ATTRIBUTE: KVM_S390_VM_MIGRATION_START (w/o) + +Allows userspace to start migration mode, needed for PGSTE migration. +Setting this attribute when migration mode is already active will have +no effects. + +Parameters: none +Returns: -ENOMEM if there is not enough free memory to start migration mode + -EINVAL if the state of the VM is invalid (e.g. no memory defined) + 0 in case of success. + +5.3. ATTRIBUTE: KVM_S390_VM_MIGRATION_STATUS (r/o) + +Allows userspace to query the status of migration mode. + +Parameters: address of a buffer in user space to store the data (u64) to; + the data itself is either 0 if migration mode is disabled or 1 + if it is enabled +Returns: -EFAULT if the given address is not accessible from kernel space + 0 in case of success. diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 426614a882a9..a8cafed79eb4 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -45,6 +45,8 @@ #define KVM_REQ_ENABLE_IBS 8 #define KVM_REQ_DISABLE_IBS 9 #define KVM_REQ_ICPT_OPEREXC 10 +#define KVM_REQ_START_MIGRATION 11 +#define KVM_REQ_STOP_MIGRATION 12 #define SIGP_CTRL_C 0x80 #define SIGP_CTRL_SCN_MASK 0x3f @@ -691,6 +693,12 @@ struct kvm_s390_vsie { struct page *pages[KVM_MAX_VCPUS]; }; +struct kvm_s390_migration_state { + unsigned long bitmap_size; /* in bits (number of guest pages) */ + atomic64_t dirty_pages; /* number of dirty pages */ + unsigned long *pgste_bitmap; +}; + struct kvm_arch{ void *sca; int use_esca; @@ -718,6 +726,7 @@ struct kvm_arch{ struct kvm_s390_crypto crypto; struct kvm_s390_vsie vsie; u64 epoch; + struct kvm_s390_migration_state *migration_state; /* subset of available cpu features enabled by user space */ DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); }; diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 3dd2a1d308dd..d6879a916de5 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -70,6 +70,7 @@ struct kvm_s390_io_adapter_req { #define KVM_S390_VM_TOD 1 #define KVM_S390_VM_CRYPTO 2 #define KVM_S390_VM_CPU_MODEL 3 +#define KVM_S390_VM_MIGRATION 4 /* kvm attributes for mem_ctrl */ #define KVM_S390_VM_MEM_ENABLE_CMMA 0 @@ -151,6 +152,11 @@ struct kvm_s390_vm_cpu_subfunc { #define KVM_S390_VM_CRYPTO_DISABLE_AES_KW 2 #define KVM_S390_VM_CRYPTO_DISABLE_DEA_KW 3 +/* kvm attributes for migration mode */ +#define KVM_S390_VM_MIGRATION_STOP 0 +#define KVM_S390_VM_MIGRATION_START 1 +#define KVM_S390_VM_MIGRATION_STATUS 2 + /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { /* general purpose regs for s390 */ diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 689ac48361c6..4ef3035d4dc4 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -30,6 +30,7 @@ #include <linux/vmalloc.h> #include <linux/bitmap.h> #include <linux/sched/signal.h> +#include <linux/string.h> #include <asm/asm-offsets.h> #include <asm/lowcore.h> @@ -386,6 +387,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_SKEYS: case KVM_CAP_S390_IRQ_STATE: case KVM_CAP_S390_USER_INSTR0: + case KVM_CAP_S390_CMMA_MIGRATION: case KVM_CAP_S390_AIS: r = 1; break; @@ -750,6 +752,131 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr) return 0; } +static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req) +{ + int cx; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(cx, vcpu, kvm) + kvm_s390_sync_request(req, vcpu); +} + +/* + * Must be called with kvm->srcu held to avoid races on memslots, and with + * kvm->lock to avoid races with ourselves and kvm_s390_vm_stop_migration. + */ +static int kvm_s390_vm_start_migration(struct kvm *kvm) +{ + struct kvm_s390_migration_state *mgs; + struct kvm_memory_slot *ms; + /* should be the only one */ + struct kvm_memslots *slots; + unsigned long ram_pages; + int slotnr; + + /* migration mode already enabled */ + if (kvm->arch.migration_state) + return 0; + + slots = kvm_memslots(kvm); + if (!slots) + return -EINVAL; + if (!slots->used_slots) + return -EINVAL; + + mgs = kzalloc(sizeof(*mgs), GFP_KERNEL); + if (!mgs) + return -ENOMEM; + kvm->arch.migration_state = mgs; + + if (kvm->arch.use_cmma && slots) { + /* + * Get the last slot. They should be sorted by base_gfn, so the + * last slot is also the one at the end of the address space. + * We have verified above that at least one slot is present. + */ + ms = slots->memslots + slots->used_slots - 1; + /* round up so we only use full longs */ + ram_pages = roundup(ms->base_gfn + ms->npages, BITS_PER_LONG); + /* allocate enough bytes to store all the bits */ + mgs->pgste_bitmap = vmalloc(ram_pages / 8); + if (!mgs->pgste_bitmap) { + kfree(mgs); + kvm->arch.migration_state = NULL; + return -ENOMEM; + } + + mgs->bitmap_size = ram_pages; + atomic64_set(&mgs->dirty_pages, ram_pages); + /* mark all the pages in active slots as dirty */ + for (slotnr = 0; slotnr < slots->used_slots; slotnr++) { + ms = slots->memslots + slotnr; + bitmap_set(mgs->pgste_bitmap, ms->base_gfn, ms->npages); + } + + kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION); + } + return 0; +} + +/* + * Must be called with kvm->lock to avoid races with ourselves and + * kvm_s390_vm_start_migration. + */ +static int kvm_s390_vm_stop_migration(struct kvm *kvm) +{ + struct kvm_s390_migration_state *mgs; + + /* migration mode already disabled */ + if (!kvm->arch.migration_state) + return 0; + mgs = kvm->arch.migration_state; + kvm->arch.migration_state = NULL; + + if (kvm->arch.use_cmma) { + kvm_s390_sync_request_broadcast(kvm, KVM_REQ_STOP_MIGRATION); + vfree(mgs->pgste_bitmap); + } + kfree(mgs); + return 0; +} + +static int kvm_s390_vm_set_migration(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + int idx, res = -ENXIO; + + mutex_lock(&kvm->lock); + switch (attr->attr) { + case KVM_S390_VM_MIGRATION_START: + idx = srcu_read_lock(&kvm->srcu); + res = kvm_s390_vm_start_migration(kvm); + srcu_read_unlock(&kvm->srcu, idx); + break; + case KVM_S390_VM_MIGRATION_STOP: + res = kvm_s390_vm_stop_migration(kvm); + break; + default: + break; + } + mutex_unlock(&kvm->lock); + + return res; +} + +static int kvm_s390_vm_get_migration(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + u64 mig = (kvm->arch.migration_state != NULL); + + if (attr->attr != KVM_S390_VM_MIGRATION_STATUS) + return -ENXIO; + + if (copy_to_user((void __user *)attr->addr, &mig, sizeof(mig))) + return -EFAULT; + return 0; +} + static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) { u8 gtod_high; @@ -1090,6 +1217,9 @@ static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CRYPTO: ret = kvm_s390_vm_set_crypto(kvm, attr); break; + case KVM_S390_VM_MIGRATION: + ret = kvm_s390_vm_set_migration(kvm, attr); + break; default: ret = -ENXIO; break; @@ -1112,6 +1242,9 @@ static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_MODEL: ret = kvm_s390_get_cpu_model(kvm, attr); break; + case KVM_S390_VM_MIGRATION: + ret = kvm_s390_vm_get_migration(kvm, attr); + break; default: ret = -ENXIO; break; @@ -1179,6 +1312,9 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) break; } break; + case KVM_S390_VM_MIGRATION: + ret = 0; + break; default: ret = -ENXIO; break; @@ -1286,6 +1422,182 @@ out: return r; } +/* + * Base address and length must be sent at the start of each block, therefore + * it's cheaper to send some clean data, as long as it's less than the size of + * two longs. + */ +#define KVM_S390_MAX_BIT_DISTANCE (2 * sizeof(void *)) +/* for consistency */ +#define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX) + +/* + * This function searches for the next page with dirty CMMA attributes, and + * saves the attributes in the buffer up to either the end of the buffer or + * until a block of at least KVM_S390_MAX_BIT_DISTANCE clean bits is found; + * no trailing clean bytes are saved. + * In case no dirty bits were found, or if CMMA was not enabled or used, the + * output buffer will indicate 0 as length. + */ +static int kvm_s390_get_cmma_bits(struct kvm *kvm, + struct kvm_s390_cmma_log *args) +{ + struct kvm_s390_migration_state *s = kvm->arch.migration_state; + unsigned long bufsize, hva, pgstev, i, next, cur; + int srcu_idx, peek, r = 0, rr; + u8 *res; + + cur = args->start_gfn; + i = next = pgstev = 0; + + if (unlikely(!kvm->arch.use_cmma)) + return -ENXIO; + /* Invalid/unsupported flags were specified */ + if (args->flags & ~KVM_S390_CMMA_PEEK) + return -EINVAL; + /* Migration mode query, and we are not doing a migration */ + peek = !!(args->flags & KVM_S390_CMMA_PEEK); + if (!peek && !s) + return -EINVAL; + /* CMMA is disabled or was not used, or the buffer has length zero */ + bufsize = min(args->count, KVM_S390_CMMA_SIZE_MAX); + if (!bufsize || !kvm->mm->context.use_cmma) { + memset(args, 0, sizeof(*args)); + return 0; + } + + if (!peek) { + /* We are not peeking, and there are no dirty pages */ + if (!atomic64_read(&s->dirty_pages)) { + memset(args, 0, sizeof(*args)); + return 0; + } + cur = find_next_bit(s->pgste_bitmap, s->bitmap_size, + args->start_gfn); + if (cur >= s->bitmap_size) /* nothing found, loop back */ + cur = find_next_bit(s->pgste_bitmap, s->bitmap_size, 0); + if (cur >= s->bitmap_size) { /* again! (very unlikely) */ + memset(args, 0, sizeof(*args)); + return 0; + } + next = find_next_bit(s->pgste_bitmap, s->bitmap_size, cur + 1); + } + + res = vmalloc(bufsize); + if (!res) + return -ENOMEM; + + args->start_gfn = cur; + + down_read(&kvm->mm->mmap_sem); + srcu_idx = srcu_read_lock(&kvm->srcu); + while (i < bufsize) { + hva = gfn_to_hva(kvm, cur); + if (kvm_is_error_hva(hva)) { + r = -EFAULT; + break; + } + /* decrement only if we actually flipped the bit to 0 */ + if (!peek && test_and_clear_bit(cur, s->pgste_bitmap)) + atomic64_dec(&s->dirty_pages); + r = get_pgste(kvm->mm, hva, &pgstev); + if (r < 0) + pgstev = 0; + /* save the value */ + res[i++] = (pgstev >> 24) & 0x3; + /* + * if the next bit is too far away, stop. + * if we reached the previous "next", find the next one + */ + if (!peek) { + if (next > cur + KVM_S390_MAX_BIT_DISTANCE) + break; + if (cur == next) + next = find_next_bit(s->pgste_bitmap, + s->bitmap_size, cur + 1); + /* reached the end of the bitmap or of the buffer, stop */ + if ((next >= s->bitmap_size) || + (next >= args->start_gfn + bufsize)) + break; + } + cur++; + } + srcu_read_unlock(&kvm->srcu, srcu_idx); + up_read(&kvm->mm->mmap_sem); + args->count = i; + args->remaining = s ? atomic64_read(&s->dirty_pages) : 0; + + rr = copy_to_user((void __user *)args->values, res, args->count); + if (rr) + r = -EFAULT; + + vfree(res); + return r; +} + +/* + * This function sets the CMMA attributes for the given pages. If the input + * buffer has zero length, no action is taken, otherwise the attributes are + * set and the mm->context.use_cmma flag is set. + */ +static int kvm_s390_set_cmma_bits(struct kvm *kvm, + const struct kvm_s390_cmma_log *args) +{ + unsigned long hva, mask, pgstev, i; + uint8_t *bits; + int srcu_idx, r = 0; + + mask = args->mask; + + if (!kvm->arch.use_cmma) + return -ENXIO; + /* invalid/unsupported flags */ + if (args->flags != 0) + return -EINVAL; + /* Enforce sane limit on memory allocation */ + if (args->count > KVM_S390_CMMA_SIZE_MAX) + return -EINVAL; + /* Nothing to do */ + if (args->count == 0) + return 0; + + bits = vmalloc(sizeof(*bits) * args->count); + if (!bits) + return -ENOMEM; + + r = copy_from_user(bits, (void __user *)args->values, args->count); + if (r) { + r = -EFAULT; + goto out; + } + + down_read(&kvm->mm->mmap_sem); + srcu_idx = srcu_read_lock(&kvm->srcu); + for (i = 0; i < args->count; i++) { + hva = gfn_to_hva(kvm, args->start_gfn + i); + if (kvm_is_error_hva(hva)) { + r = -EFAULT; + break; + } + + pgstev = bits[i]; + pgstev = pgstev << 24; + mask &= _PGSTE_GPS_USAGE_MASK; + set_pgste_bits(kvm->mm, hva, mask, pgstev); + } + srcu_read_unlock(&kvm->srcu, srcu_idx); + up_read(&kvm->mm->mmap_sem); + + if (!kvm->mm->context.use_cmma) { + down_write(&kvm->mm->mmap_sem); + kvm->mm->context.use_cmma = 1; + up_write(&kvm->mm->mmap_sem); + } +out: + vfree(bits); + return r; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -1364,6 +1676,29 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_s390_set_skeys(kvm, &args); break; } + case KVM_S390_GET_CMMA_BITS: { + struct kvm_s390_cmma_log args; + + r = -EFAULT; + if (copy_from_user(&args, argp, sizeof(args))) + break; + r = kvm_s390_get_cmma_bits(kvm, &args); + if (!r) { + r = copy_to_user(argp, &args, sizeof(args)); + if (r) + r = -EFAULT; + } + break; + } + case KVM_S390_SET_CMMA_BITS: { + struct kvm_s390_cmma_log args; + + r = -EFAULT; + if (copy_from_user(&args, argp, sizeof(args))) + break; + r = kvm_s390_set_cmma_bits(kvm, &args); + break; + } default: r = -ENOTTY; } @@ -1633,6 +1968,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_s390_destroy_adapters(kvm); kvm_s390_clear_float_irqs(kvm); kvm_s390_vsie_destroy(kvm); + if (kvm->arch.migration_state) { + vfree(kvm->arch.migration_state->pgste_bitmap); + kfree(kvm->arch.migration_state); + } KVM_EVENT(3, "vm 0x%pK destroyed", kvm); } @@ -1977,7 +2316,6 @@ int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu) if (!vcpu->arch.sie_block->cbrlo) return -ENOMEM; - vcpu->arch.sie_block->ecb2 |= ECB2_CMMA; vcpu->arch.sie_block->ecb2 &= ~ECB2_PFMFI; return 0; } @@ -2489,6 +2827,27 @@ retry: goto retry; } + if (kvm_check_request(KVM_REQ_START_MIGRATION, vcpu)) { + /* + * Disable CMMA virtualization; we will emulate the ESSA + * instruction manually, in order to provide additional + * functionalities needed for live migration. + */ + vcpu->arch.sie_block->ecb2 &= ~ECB2_CMMA; + goto retry; + } + + if (kvm_check_request(KVM_REQ_STOP_MIGRATION, vcpu)) { + /* + * Re-enable CMMA virtualization if CMMA is available and + * was used. + */ + if ((vcpu->kvm->arch.use_cmma) && + (vcpu->kvm->mm->context.use_cmma)) + vcpu->arch.sie_block->ecb2 |= ECB2_CMMA; + goto retry; + } + /* nothing to do, just clear the request */ kvm_clear_request(KVM_REQ_UNHALT, vcpu); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index c03106c428cf..a226c459809b 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -24,6 +24,7 @@ #include <asm/ebcdic.h> #include <asm/sysinfo.h> #include <asm/pgtable.h> +#include <asm/page-states.h> #include <asm/pgalloc.h> #include <asm/gmap.h> #include <asm/io.h> @@ -949,13 +950,72 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) return 0; } +static inline int do_essa(struct kvm_vcpu *vcpu, const int orc) +{ + struct kvm_s390_migration_state *ms = vcpu->kvm->arch.migration_state; + int r1, r2, nappended, entries; + unsigned long gfn, hva, res, pgstev, ptev; + unsigned long *cbrlo; + + /* + * We don't need to set SD.FPF.SK to 1 here, because if we have a + * machine check here we either handle it or crash + */ + + kvm_s390_get_regs_rre(vcpu, &r1, &r2); + gfn = vcpu->run->s.regs.gprs[r2] >> PAGE_SHIFT; + hva = gfn_to_hva(vcpu->kvm, gfn); + entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3; + + if (kvm_is_error_hva(hva)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + nappended = pgste_perform_essa(vcpu->kvm->mm, hva, orc, &ptev, &pgstev); + if (nappended < 0) { + res = orc ? 0x10 : 0; + vcpu->run->s.regs.gprs[r1] = res; /* Exception Indication */ + return 0; + } + res = (pgstev & _PGSTE_GPS_USAGE_MASK) >> 22; + /* + * Set the block-content state part of the result. 0 means resident, so + * nothing to do if the page is valid. 2 is for preserved pages + * (non-present and non-zero), and 3 for zero pages (non-present and + * zero). + */ + if (ptev & _PAGE_INVALID) { + res |= 2; + if (pgstev & _PGSTE_GPS_ZERO) + res |= 1; + } + vcpu->run->s.regs.gprs[r1] = res; + /* + * It is possible that all the normal 511 slots were full, in which case + * we will now write in the 512th slot, which is reserved for host use. + * In both cases we let the normal essa handling code process all the + * slots, including the reserved one, if needed. + */ + if (nappended > 0) { + cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo & PAGE_MASK); + cbrlo[entries] = gfn << PAGE_SHIFT; + } + + if (orc) { + /* increment only if we are really flipping the bit to 1 */ + if (!test_and_set_bit(gfn, ms->pgste_bitmap)) + atomic64_inc(&ms->dirty_pages); + } + + return nappended; +} + static int handle_essa(struct kvm_vcpu *vcpu) { /* entries expected to be 1FF */ int entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3; unsigned long *cbrlo; struct gmap *gmap; - int i; + int i, orc; VCPU_EVENT(vcpu, 4, "ESSA: release %d pages", entries); gmap = vcpu->arch.gmap; @@ -965,12 +1025,45 @@ static int handle_essa(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); - - if (((vcpu->arch.sie_block->ipb & 0xf0000000) >> 28) > 6) + /* Check for invalid operation request code */ + orc = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28; + if (orc > ESSA_MAX) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - /* Retry the ESSA instruction */ - kvm_s390_retry_instr(vcpu); + if (likely(!vcpu->kvm->arch.migration_state)) { + /* + * CMMA is enabled in the KVM settings, but is disabled in + * the SIE block and in the mm_context, and we are not doing + * a migration. Enable CMMA in the mm_context. + * Since we need to take a write lock to write to the context + * to avoid races with storage keys handling, we check if the + * value really needs to be written to; if the value is + * already correct, we do nothing and avoid the lock. + */ + if (vcpu->kvm->mm->context.use_cmma == 0) { + down_write(&vcpu->kvm->mm->mmap_sem); + vcpu->kvm->mm->context.use_cmma = 1; + up_write(&vcpu->kvm->mm->mmap_sem); + } + /* + * If we are here, we are supposed to have CMMA enabled in + * the SIE block. Enabling CMMA works on a per-CPU basis, + * while the context use_cmma flag is per process. + * It's possible that the context flag is enabled and the + * SIE flag is not, so we set the flag always; if it was + * already set, nothing changes, otherwise we enable it + * on this CPU too. + */ + vcpu->arch.sie_block->ecb2 |= ECB2_CMMA; + /* Retry the ESSA instruction */ + kvm_s390_retry_instr(vcpu); + } else { + /* Account for the possible extra cbrl entry */ + i = do_essa(vcpu, orc); + if (i < 0) + return i; + entries += i; + } vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */ cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo); down_read(&gmap->mm->mmap_sem); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 577429a95ad8..2b8dc1ca18d4 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -155,6 +155,35 @@ struct kvm_s390_skeys { __u32 reserved[9]; }; +#define KVM_S390_CMMA_PEEK (1 << 0) + +/** + * kvm_s390_cmma_log - Used for CMMA migration. + * + * Used both for input and output. + * + * @start_gfn: Guest page number to start from. + * @count: Size of the result buffer. + * @flags: Control operation mode via KVM_S390_CMMA_* flags + * @remaining: Used with KVM_S390_GET_CMMA_BITS. Indicates how many dirty + * pages are still remaining. + * @mask: Used with KVM_S390_SET_CMMA_BITS. Bitmap of bits to actually set + * in the PGSTE. + * @values: Pointer to the values buffer. + * + * Used in KVM_S390_{G,S}ET_CMMA_BITS ioctls. + */ +struct kvm_s390_cmma_log { + __u64 start_gfn; + __u32 count; + __u32 flags; + union { + __u64 remaining; + __u64 mask; + }; + __u64 values; +}; + struct kvm_hyperv_exit { #define KVM_EXIT_HYPERV_SYNIC 1 #define KVM_EXIT_HYPERV_HCALL 2 @@ -895,6 +924,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_SPAPR_TCE_VFIO 142 #define KVM_CAP_X86_GUEST_MWAIT 143 #define KVM_CAP_ARM_USER_IRQ 144 +#define KVM_CAP_S390_CMMA_MIGRATION 145 #ifdef KVM_CAP_IRQ_ROUTING @@ -1318,6 +1348,9 @@ struct kvm_s390_ucas_mapping { #define KVM_S390_GET_IRQ_STATE _IOW(KVMIO, 0xb6, struct kvm_s390_irq_state) /* Available with KVM_CAP_X86_SMM */ #define KVM_SMI _IO(KVMIO, 0xb7) +/* Available with KVM_CAP_S390_CMMA_MIGRATION */ +#define KVM_S390_GET_CMMA_BITS _IOW(KVMIO, 0xb8, struct kvm_s390_cmma_log) +#define KVM_S390_SET_CMMA_BITS _IOW(KVMIO, 0xb9, struct kvm_s390_cmma_log) #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) |