diff options
Diffstat (limited to 'arch/s390/kvm/kvm-s390.c')
-rw-r--r-- | arch/s390/kvm/kvm-s390.c | 383 |
1 files changed, 250 insertions, 133 deletions
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 3b7a5151b6a5..91ad4a9425c0 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -172,6 +172,10 @@ static int nested; module_param(nested, int, S_IRUGO); MODULE_PARM_DESC(nested, "Nested virtualization support"); +/* allow 1m huge page guest backing, if !nested */ +static int hpage; +module_param(hpage, int, 0444); +MODULE_PARM_DESC(hpage, "1m huge page backing support"); /* * For now we handle at most 16 double words as this is what the s390 base @@ -475,6 +479,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_AIS_MIGRATION: r = 1; break; + case KVM_CAP_S390_HPAGE_1M: + r = 0; + if (hpage) + r = 1; + break; case KVM_CAP_S390_MEM_OP: r = MEM_OP_MAX_SIZE; break; @@ -511,19 +520,30 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) } static void kvm_s390_sync_dirty_log(struct kvm *kvm, - struct kvm_memory_slot *memslot) + struct kvm_memory_slot *memslot) { + int i; gfn_t cur_gfn, last_gfn; - unsigned long address; + unsigned long gaddr, vmaddr; struct gmap *gmap = kvm->arch.gmap; + DECLARE_BITMAP(bitmap, _PAGE_ENTRIES); - /* Loop over all guest pages */ + /* Loop over all guest segments */ + cur_gfn = memslot->base_gfn; last_gfn = memslot->base_gfn + memslot->npages; - for (cur_gfn = memslot->base_gfn; cur_gfn <= last_gfn; cur_gfn++) { - address = gfn_to_hva_memslot(memslot, cur_gfn); + for (; cur_gfn <= last_gfn; cur_gfn += _PAGE_ENTRIES) { + gaddr = gfn_to_gpa(cur_gfn); + vmaddr = gfn_to_hva_memslot(memslot, cur_gfn); + if (kvm_is_error_hva(vmaddr)) + continue; + + bitmap_zero(bitmap, _PAGE_ENTRIES); + gmap_sync_dirty_log_pmd(gmap, bitmap, gaddr, vmaddr); + for (i = 0; i < _PAGE_ENTRIES; i++) { + if (test_bit(i, bitmap)) + mark_page_dirty(kvm, cur_gfn + i); + } - if (test_and_clear_guest_dirty(gmap->mm, address)) - mark_page_dirty(kvm, cur_gfn); if (fatal_signal_pending(current)) return; cond_resched(); @@ -667,6 +687,27 @@ static 
int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) VM_EVENT(kvm, 3, "ENABLE: CAP_S390_GS %s", r ? "(not available)" : "(success)"); break; + case KVM_CAP_S390_HPAGE_1M: + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) + r = -EBUSY; + else if (!hpage || kvm->arch.use_cmma) + r = -EINVAL; + else { + r = 0; + kvm->mm->context.allow_gmap_hpage_1m = 1; + /* + * We might have to create fake 4k page + * tables. To avoid that the hardware works on + * stale PGSTEs, we emulate these instructions. + */ + kvm->arch.use_skf = 0; + kvm->arch.use_pfmfi = 0; + } + mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "ENABLE: CAP_S390_HPAGE %s", + r ? "(not available)" : "(success)"); + break; case KVM_CAP_S390_USER_STSI: VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_STSI"); kvm->arch.user_stsi = 1; @@ -714,10 +755,13 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att if (!sclp.has_cmma) break; - ret = -EBUSY; VM_EVENT(kvm, 3, "%s", "ENABLE: CMMA support"); mutex_lock(&kvm->lock); - if (!kvm->created_vcpus) { + if (kvm->created_vcpus) + ret = -EBUSY; + else if (kvm->mm->context.allow_gmap_hpage_1m) + ret = -EINVAL; + else { kvm->arch.use_cmma = 1; /* Not compatible with cmma. 
*/ kvm->arch.use_pfmfi = 0; @@ -862,54 +906,37 @@ static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req) */ static int kvm_s390_vm_start_migration(struct kvm *kvm) { - struct kvm_s390_migration_state *mgs; struct kvm_memory_slot *ms; - /* should be the only one */ struct kvm_memslots *slots; - unsigned long ram_pages; + unsigned long ram_pages = 0; int slotnr; /* migration mode already enabled */ - if (kvm->arch.migration_state) + if (kvm->arch.migration_mode) return 0; - slots = kvm_memslots(kvm); if (!slots || !slots->used_slots) return -EINVAL; - mgs = kzalloc(sizeof(*mgs), GFP_KERNEL); - if (!mgs) - return -ENOMEM; - kvm->arch.migration_state = mgs; - - if (kvm->arch.use_cmma) { + if (!kvm->arch.use_cmma) { + kvm->arch.migration_mode = 1; + return 0; + } + /* mark all the pages in active slots as dirty */ + for (slotnr = 0; slotnr < slots->used_slots; slotnr++) { + ms = slots->memslots + slotnr; /* - * Get the first slot. They are reverse sorted by base_gfn, so - * the first slot is also the one at the end of the address - * space. We have verified above that at least one slot is - * present. + * The second half of the bitmap is only used on x86, + * and would be wasted otherwise, so we put it to good + * use here to keep track of the state of the storage + * attributes. 
*/ - ms = slots->memslots; - /* round up so we only use full longs */ - ram_pages = roundup(ms->base_gfn + ms->npages, BITS_PER_LONG); - /* allocate enough bytes to store all the bits */ - mgs->pgste_bitmap = vmalloc(ram_pages / 8); - if (!mgs->pgste_bitmap) { - kfree(mgs); - kvm->arch.migration_state = NULL; - return -ENOMEM; - } - - mgs->bitmap_size = ram_pages; - atomic64_set(&mgs->dirty_pages, ram_pages); - /* mark all the pages in active slots as dirty */ - for (slotnr = 0; slotnr < slots->used_slots; slotnr++) { - ms = slots->memslots + slotnr; - bitmap_set(mgs->pgste_bitmap, ms->base_gfn, ms->npages); - } - - kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION); + memset(kvm_second_dirty_bitmap(ms), 0xff, kvm_dirty_bitmap_bytes(ms)); + ram_pages += ms->npages; } + atomic64_set(&kvm->arch.cmma_dirty_pages, ram_pages); + kvm->arch.migration_mode = 1; + kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION); return 0; } @@ -919,21 +946,12 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm) */ static int kvm_s390_vm_stop_migration(struct kvm *kvm) { - struct kvm_s390_migration_state *mgs; - /* migration mode already disabled */ - if (!kvm->arch.migration_state) + if (!kvm->arch.migration_mode) return 0; - mgs = kvm->arch.migration_state; - kvm->arch.migration_state = NULL; - - if (kvm->arch.use_cmma) { + kvm->arch.migration_mode = 0; + if (kvm->arch.use_cmma) kvm_s390_sync_request_broadcast(kvm, KVM_REQ_STOP_MIGRATION); - /* We have to wait for the essa emulation to finish */ - synchronize_srcu(&kvm->srcu); - vfree(mgs->pgste_bitmap); - } - kfree(mgs); return 0; } @@ -961,7 +979,7 @@ static int kvm_s390_vm_set_migration(struct kvm *kvm, static int kvm_s390_vm_get_migration(struct kvm *kvm, struct kvm_device_attr *attr) { - u64 mig = (kvm->arch.migration_state != NULL); + u64 mig = kvm->arch.migration_mode; if (attr->attr != KVM_S390_VM_MIGRATION_STATUS) return -ENXIO; @@ -1540,6 +1558,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, 
struct kvm_s390_skeys *args) uint8_t *keys; uint64_t hva; int srcu_idx, i, r = 0; + bool unlocked; if (args->flags != 0) return -EINVAL; @@ -1564,9 +1583,11 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) if (r) goto out; + i = 0; down_read(&current->mm->mmap_sem); srcu_idx = srcu_read_lock(&kvm->srcu); - for (i = 0; i < args->count; i++) { + while (i < args->count) { + unlocked = false; hva = gfn_to_hva(kvm, args->start_gfn + i); if (kvm_is_error_hva(hva)) { r = -EFAULT; @@ -1580,8 +1601,14 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) } r = set_guest_storage_key(current->mm, hva, keys[i], 0); - if (r) - break; + if (r) { + r = fixup_user_fault(current, current->mm, hva, + FAULT_FLAG_WRITE, &unlocked); + if (r) + break; + } + if (!r) + i++; } srcu_read_unlock(&kvm->srcu, srcu_idx); up_read(&current->mm->mmap_sem); @@ -1600,6 +1627,134 @@ out: #define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX) /* + * Similar to gfn_to_memslot, but returns the index of a memslot also when the + * address falls in a hole. In that case the index of one of the memslots + * bordering the hole is returned. 
+ */ +static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn) +{ + int start = 0, end = slots->used_slots; + int slot = atomic_read(&slots->lru_slot); + struct kvm_memory_slot *memslots = slots->memslots; + + if (gfn >= memslots[slot].base_gfn && + gfn < memslots[slot].base_gfn + memslots[slot].npages) + return slot; + + while (start < end) { + slot = start + (end - start) / 2; + + if (gfn >= memslots[slot].base_gfn) + end = slot; + else + start = slot + 1; + } + + if (gfn >= memslots[start].base_gfn && + gfn < memslots[start].base_gfn + memslots[start].npages) { + atomic_set(&slots->lru_slot, start); + } + + return start; +} + +static int kvm_s390_peek_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, + u8 *res, unsigned long bufsize) +{ + unsigned long pgstev, hva, cur_gfn = args->start_gfn; + + args->count = 0; + while (args->count < bufsize) { + hva = gfn_to_hva(kvm, cur_gfn); + /* + * We return an error if the first value was invalid, but we + * return successfully if at least one value was copied. + */ + if (kvm_is_error_hva(hva)) + return args->count ? 
0 : -EFAULT; + if (get_pgste(kvm->mm, hva, &pgstev) < 0) + pgstev = 0; + res[args->count++] = (pgstev >> 24) & 0x43; + cur_gfn++; + } + + return 0; +} + +static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots, + unsigned long cur_gfn) +{ + int slotidx = gfn_to_memslot_approx(slots, cur_gfn); + struct kvm_memory_slot *ms = slots->memslots + slotidx; + unsigned long ofs = cur_gfn - ms->base_gfn; + + if (ms->base_gfn + ms->npages <= cur_gfn) { + slotidx--; + /* If we are above the highest slot, wrap around */ + if (slotidx < 0) + slotidx = slots->used_slots - 1; + + ms = slots->memslots + slotidx; + ofs = 0; + } + ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, ofs); + while ((slotidx > 0) && (ofs >= ms->npages)) { + slotidx--; + ms = slots->memslots + slotidx; + ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, 0); + } + return ms->base_gfn + ofs; +} + +static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, + u8 *res, unsigned long bufsize) +{ + unsigned long mem_end, cur_gfn, next_gfn, hva, pgstev; + struct kvm_memslots *slots = kvm_memslots(kvm); + struct kvm_memory_slot *ms; + + cur_gfn = kvm_s390_next_dirty_cmma(slots, args->start_gfn); + ms = gfn_to_memslot(kvm, cur_gfn); + args->count = 0; + args->start_gfn = cur_gfn; + if (!ms) + return 0; + next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1); + mem_end = slots->memslots[0].base_gfn + slots->memslots[0].npages; + + while (args->count < bufsize) { + hva = gfn_to_hva(kvm, cur_gfn); + if (kvm_is_error_hva(hva)) + return 0; + /* Decrement only if we actually flipped the bit to 0 */ + if (test_and_clear_bit(cur_gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms))) + atomic64_dec(&kvm->arch.cmma_dirty_pages); + if (get_pgste(kvm->mm, hva, &pgstev) < 0) + pgstev = 0; + /* Save the value */ + res[args->count++] = (pgstev >> 24) & 0x43; + /* If the next bit is too far away, stop. 
*/ + if (next_gfn > cur_gfn + KVM_S390_MAX_BIT_DISTANCE) + return 0; + /* If we reached the previous "next", find the next one */ + if (cur_gfn == next_gfn) + next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1); + /* Reached the end of memory or of the buffer, stop */ + if ((next_gfn >= mem_end) || + (next_gfn - args->start_gfn >= bufsize)) + return 0; + cur_gfn++; + /* Reached the end of the current memslot, take the next one. */ + if (cur_gfn - ms->base_gfn >= ms->npages) { + ms = gfn_to_memslot(kvm, cur_gfn); + if (!ms) + return 0; + } + } + return 0; +} + +/* * This function searches for the next page with dirty CMMA attributes, and * saves the attributes in the buffer up to either the end of the buffer or * until a block of at least KVM_S390_MAX_BIT_DISTANCE clean bits is found; @@ -1610,22 +1765,18 @@ out: static int kvm_s390_get_cmma_bits(struct kvm *kvm, struct kvm_s390_cmma_log *args) { - struct kvm_s390_migration_state *s = kvm->arch.migration_state; - unsigned long bufsize, hva, pgstev, i, next, cur; - int srcu_idx, peek, r = 0, rr; - u8 *res; - - cur = args->start_gfn; - i = next = pgstev = 0; + unsigned long bufsize; + int srcu_idx, peek, ret; + u8 *values; - if (unlikely(!kvm->arch.use_cmma)) + if (!kvm->arch.use_cmma) return -ENXIO; /* Invalid/unsupported flags were specified */ if (args->flags & ~KVM_S390_CMMA_PEEK) return -EINVAL; /* Migration mode query, and we are not doing a migration */ peek = !!(args->flags & KVM_S390_CMMA_PEEK); - if (!peek && !s) + if (!peek && !kvm->arch.migration_mode) return -EINVAL; /* CMMA is disabled or was not used, or the buffer has length zero */ bufsize = min(args->count, KVM_S390_CMMA_SIZE_MAX); @@ -1633,74 +1784,35 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm, memset(args, 0, sizeof(*args)); return 0; } - - if (!peek) { - /* We are not peeking, and there are no dirty pages */ - if (!atomic64_read(&s->dirty_pages)) { - memset(args, 0, sizeof(*args)); - return 0; - } - cur = 
find_next_bit(s->pgste_bitmap, s->bitmap_size, - args->start_gfn); - if (cur >= s->bitmap_size) /* nothing found, loop back */ - cur = find_next_bit(s->pgste_bitmap, s->bitmap_size, 0); - if (cur >= s->bitmap_size) { /* again! (very unlikely) */ - memset(args, 0, sizeof(*args)); - return 0; - } - next = find_next_bit(s->pgste_bitmap, s->bitmap_size, cur + 1); + /* We are not peeking, and there are no dirty pages */ + if (!peek && !atomic64_read(&kvm->arch.cmma_dirty_pages)) { + memset(args, 0, sizeof(*args)); + return 0; } - res = vmalloc(bufsize); - if (!res) + values = vmalloc(bufsize); + if (!values) return -ENOMEM; - args->start_gfn = cur; - down_read(&kvm->mm->mmap_sem); srcu_idx = srcu_read_lock(&kvm->srcu); - while (i < bufsize) { - hva = gfn_to_hva(kvm, cur); - if (kvm_is_error_hva(hva)) { - r = -EFAULT; - break; - } - /* decrement only if we actually flipped the bit to 0 */ - if (!peek && test_and_clear_bit(cur, s->pgste_bitmap)) - atomic64_dec(&s->dirty_pages); - r = get_pgste(kvm->mm, hva, &pgstev); - if (r < 0) - pgstev = 0; - /* save the value */ - res[i++] = (pgstev >> 24) & 0x43; - /* - * if the next bit is too far away, stop. - * if we reached the previous "next", find the next one - */ - if (!peek) { - if (next > cur + KVM_S390_MAX_BIT_DISTANCE) - break; - if (cur == next) - next = find_next_bit(s->pgste_bitmap, - s->bitmap_size, cur + 1); - /* reached the end of the bitmap or of the buffer, stop */ - if ((next >= s->bitmap_size) || - (next >= args->start_gfn + bufsize)) - break; - } - cur++; - } + if (peek) + ret = kvm_s390_peek_cmma(kvm, args, values, bufsize); + else + ret = kvm_s390_get_cmma(kvm, args, values, bufsize); srcu_read_unlock(&kvm->srcu, srcu_idx); up_read(&kvm->mm->mmap_sem); - args->count = i; - args->remaining = s ? 
atomic64_read(&s->dirty_pages) : 0; - rr = copy_to_user((void __user *)args->values, res, args->count); - if (rr) - r = -EFAULT; + if (kvm->arch.migration_mode) + args->remaining = atomic64_read(&kvm->arch.cmma_dirty_pages); + else + args->remaining = 0; - vfree(res); - return r; + if (copy_to_user((void __user *)args->values, values, args->count)) + ret = -EFAULT; + + vfree(values); + return ret; } /* @@ -2139,10 +2251,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_s390_destroy_adapters(kvm); kvm_s390_clear_float_irqs(kvm); kvm_s390_vsie_destroy(kvm); - if (kvm->arch.migration_state) { - vfree(kvm->arch.migration_state->pgste_bitmap); - kfree(kvm->arch.migration_state); - } KVM_EVENT(3, "vm 0x%pK destroyed", kvm); } @@ -2300,6 +2408,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu->run->kvm_valid_regs |= KVM_SYNC_BPBC; if (test_kvm_facility(vcpu->kvm, 133)) vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB; + if (test_kvm_facility(vcpu->kvm, 156)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_ETOKEN; /* fprs can be synchronized via vrs, even if the guest has no vx. With * MACHINE_HAS_VX, (load|store)_fpu_regs() will work with vrs format. 
*/ @@ -2549,7 +2659,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) } if (test_kvm_facility(vcpu->kvm, 139)) vcpu->arch.sie_block->ecd |= ECD_MEF; - + if (test_kvm_facility(vcpu->kvm, 156)) + vcpu->arch.sie_block->ecd |= ECD_ETOKENF; if (vcpu->arch.sie_block->gd) { vcpu->arch.sie_block->eca |= ECA_AIV; VCPU_EVENT(vcpu, 3, "AIV gisa format-%u enabled for cpu %03u", @@ -3467,6 +3578,7 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } preempt_enable(); } + /* SIE will load etoken directly from SDNX and therefore kvm_run */ kvm_run->kvm_dirty_regs = 0; } @@ -3506,7 +3618,7 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) __ctl_clear_bit(2, 4); vcpu->arch.host_gscb = NULL; } - + /* SIE will save etoken directly into SDNX and therefore kvm_run */ } int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) @@ -4082,6 +4194,11 @@ static int __init kvm_s390_init(void) return -ENODEV; } + if (nested && hpage) { + pr_info("nested (vSIE) and hpage (huge page backing) can currently not be activated concurrently"); + return -EINVAL; + } + for (i = 0; i < 16; i++) kvm_s390_fac_base[i] |= S390_lowcore.stfle_fac_list[i] & nonhyp_mask(i); |