diff options
Diffstat (limited to 'arch/powerpc/kvm')
-rw-r--r-- | arch/powerpc/kvm/book3s_64_mmu.c | 1 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_64_mmu_hv.c | 255 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_64_mmu_radix.c | 51 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_64_slb.S | 2 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_hv.c | 368 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_hv_builtin.c | 117 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_hv_rm_mmu.c | 65 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_hv_rmhandlers.S | 212 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_pr.c | 18 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_pr_papr.c | 2 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_rmhandlers.S | 7 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_segment.S | 4 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_xive.c | 7 | ||||
-rw-r--r-- | arch/powerpc/kvm/booke.c | 7 | ||||
-rw-r--r-- | arch/powerpc/kvm/e500_mmu_host.c | 2 | ||||
-rw-r--r-- | arch/powerpc/kvm/powerpc.c | 144 |
16 files changed, 905 insertions, 357 deletions
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index 29ebe2fd5867..a93d719edc90 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -235,6 +235,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, gpte->may_read = true; gpte->may_write = true; gpte->page_size = MMU_PAGE_4K; + gpte->wimg = HPTE_R_M; return 0; } diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 59247af5fd45..b73dbc9e797d 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -65,16 +65,20 @@ struct kvm_resize_hpt { u32 order; /* These fields protected by kvm->lock */ + + /* Possible values and their usage: + * <0 an error occurred during allocation, + * -EBUSY allocation is in the progress, + * 0 allocation made successfuly. + */ int error; - bool prepare_done; - /* Private to the work thread, until prepare_done is true, - * then protected by kvm->resize_hpt_sem */ + /* Private to the work thread, until error != -EBUSY, + * then protected by kvm->lock. + */ struct kvm_hpt_info hpt; }; -static void kvmppc_rmap_reset(struct kvm *kvm); - int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) { unsigned long hpt = 0; @@ -106,7 +110,6 @@ int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) /* Allocate reverse map array */ rev = vmalloc(sizeof(struct revmap_entry) * npte); if (!rev) { - pr_err("kvmppc_allocate_hpt: Couldn't alloc reverse map array\n"); if (cma) kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT)); else @@ -137,19 +140,22 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order) long err = -EBUSY; struct kvm_hpt_info info; - if (kvm_is_radix(kvm)) - return -EINVAL; - mutex_lock(&kvm->lock); - if (kvm->arch.hpte_setup_done) { - kvm->arch.hpte_setup_done = 0; - /* order hpte_setup_done vs. vcpus_running */ + if (kvm->arch.mmu_ready) { + kvm->arch.mmu_ready = 0; + /* order mmu_ready vs. vcpus_running */ smp_mb(); if (atomic_read(&kvm->arch.vcpus_running)) { - kvm->arch.hpte_setup_done = 1; + kvm->arch.mmu_ready = 1; goto out; } } + if (kvm_is_radix(kvm)) { + err = kvmppc_switch_mmu_to_hpt(kvm); + if (err) + goto out; + } + if (kvm->arch.hpt.order == order) { /* We already have a suitable HPT */ @@ -159,8 +165,6 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order) * Reset all the reverse-mapping chains for all memslots */ kvmppc_rmap_reset(kvm); - /* Ensure that each vcpu will flush its TLB on next entry. */ - cpumask_setall(&kvm->arch.need_tlb_flush); err = 0; goto out; } @@ -176,6 +180,10 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order) kvmppc_set_hpt(kvm, &info); out: + if (err == 0) + /* Ensure that each vcpu will flush its TLB on next entry. */ + cpumask_setall(&kvm->arch.need_tlb_flush); + mutex_unlock(&kvm->lock); return err; } @@ -183,6 +191,7 @@ out: void kvmppc_free_hpt(struct kvm_hpt_info *info) { vfree(info->rev); + info->rev = NULL; if (info->cma) kvm_free_hpt_cma(virt_to_page(info->virt), 1 << (info->order - PAGE_SHIFT)); @@ -334,7 +343,7 @@ static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r, { unsigned long ra_mask; - ra_mask = hpte_page_size(v, r) - 1; + ra_mask = kvmppc_actual_pgsz(v, r) - 1; return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask); } @@ -350,6 +359,9 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, int index; int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR); + if (kvm_is_radix(vcpu->kvm)) + return kvmppc_mmu_radix_xlate(vcpu, eaddr, gpte, data, iswrite); + /* Get SLB entry */ if (virtmode) { slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr); @@ -505,7 +517,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, mmio_update = atomic64_read(&kvm->arch.mmio_update); if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) { r = vcpu->arch.pgfault_cache->rpte; - psize = hpte_page_size(vcpu->arch.pgfault_hpte[0], r); + psize = kvmppc_actual_pgsz(vcpu->arch.pgfault_hpte[0], + r); gpa_base = r & HPTE_R_RPN & ~(psize - 1); gfn_base = gpa_base >> PAGE_SHIFT; gpa = gpa_base | (ea & (psize - 1)); @@ -534,7 +547,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, return RESUME_GUEST; /* Translate the logical address and get the page */ - psize = hpte_page_size(hpte[0], r); + psize = kvmppc_actual_pgsz(hpte[0], r); gpa_base = r & HPTE_R_RPN & ~(psize - 1); gfn_base = gpa_base >> PAGE_SHIFT; gpa = gpa_base | (ea & (psize - 1)); @@ -650,10 +663,10 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, /* * If the HPT is being resized, don't update the HPTE, * instead let the guest retry after the resize operation is complete. - * The synchronization for hpte_setup_done test vs. set is provided + * The synchronization for mmu_ready test vs. set is provided * by the HPTE lock. */ - if (!kvm->arch.hpte_setup_done) + if (!kvm->arch.mmu_ready) goto out_unlock; if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] || @@ -720,7 +733,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, goto out_put; } -static void kvmppc_rmap_reset(struct kvm *kvm) +void kvmppc_rmap_reset(struct kvm *kvm) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; @@ -786,6 +799,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, /* Must be called with both HPTE and rmap locked */ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, + struct kvm_memory_slot *memslot, unsigned long *rmapp, unsigned long gfn) { __be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); @@ -808,7 +822,7 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, /* Now check and modify the HPTE */ ptel = rev[i].guest_rpte; - psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel); + psize = kvmppc_actual_pgsz(be64_to_cpu(hptep[0]), ptel); if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && hpte_rpn(ptel, psize) == gfn) { hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); @@ -817,8 +831,8 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, /* Harvest R and C */ rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; - if (rcbits & HPTE_R_C) - kvmppc_update_rmap_change(rmapp, psize); + if ((rcbits & HPTE_R_C) && memslot->dirty_bitmap) + kvmppc_update_dirty_map(memslot, gfn, psize); if (rcbits & ~rev[i].guest_rpte) { rev[i].guest_rpte = ptel | rcbits; note_hpte_modification(kvm, &rev[i]); @@ -856,7 +870,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, continue; } - kvmppc_unmap_hpte(kvm, i, rmapp, gfn); + kvmppc_unmap_hpte(kvm, i, memslot, rmapp, gfn); unlock_rmap(rmapp); __unlock_hpte(hptep, be64_to_cpu(hptep[0])); } @@ -1039,14 +1053,6 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) retry: lock_rmap(rmapp); - if (*rmapp & KVMPPC_RMAP_CHANGED) { - long change_order = (*rmapp & KVMPPC_RMAP_CHG_ORDER) - >> KVMPPC_RMAP_CHG_SHIFT; - *rmapp &= ~(KVMPPC_RMAP_CHANGED | KVMPPC_RMAP_CHG_ORDER); - npages_dirty = 1; - if (change_order > PAGE_SHIFT) - npages_dirty = 1ul << (change_order - PAGE_SHIFT); - } if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { unlock_rmap(rmapp); return npages_dirty; @@ -1102,7 +1108,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) rev[i].guest_rpte |= HPTE_R_C; note_hpte_modification(kvm, &rev[i]); } - n = hpte_page_size(v, r); + n = kvmppc_actual_pgsz(v, r); n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT; if (n > npages_dirty) npages_dirty = n; @@ -1138,7 +1144,7 @@ void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa, long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long *map) { - unsigned long i, j; + unsigned long i; unsigned long *rmapp; preempt_disable(); @@ -1150,9 +1156,8 @@ long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm, * since we always put huge-page HPTEs in the rmap chain * corresponding to their page base address. */ - if (npages && map) - for (j = i; npages; ++j, --npages) - __set_bit_le(j, map); + if (npages) + set_dirty_bits(map, i, npages); ++rmapp; } preempt_enable(); @@ -1196,7 +1201,6 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa, struct page *page = virt_to_page(va); struct kvm_memory_slot *memslot; unsigned long gfn; - unsigned long *rmap; int srcu_idx; put_page(page); @@ -1204,20 +1208,12 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa, if (!dirty) return; - /* We need to mark this page dirty in the rmap chain */ + /* We need to mark this page dirty in the memslot dirty_bitmap, if any */ gfn = gpa >> PAGE_SHIFT; srcu_idx = srcu_read_lock(&kvm->srcu); memslot = gfn_to_memslot(kvm, gfn); - if (memslot) { - if (!kvm_is_radix(kvm)) { - rmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; - lock_rmap(rmap); - *rmap |= KVMPPC_RMAP_CHANGED; - unlock_rmap(rmap); - } else if (memslot->dirty_bitmap) { - mark_page_dirty(kvm, gfn); - } - } + if (memslot && memslot->dirty_bitmap) + set_bit_le(gfn - memslot->base_gfn, memslot->dirty_bitmap); srcu_read_unlock(&kvm->srcu, srcu_idx); } @@ -1250,8 +1246,9 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize, unsigned long vpte, rpte, guest_rpte; int ret; struct revmap_entry *rev; - unsigned long apsize, psize, avpn, pteg, hash; + unsigned long apsize, avpn, pteg, hash; unsigned long new_idx, new_pteg, replace_vpte; + int pshift; hptep = (__be64 *)(old->virt + (idx << 4)); @@ -1277,7 +1274,7 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize, guest_rpte = rev->guest_rpte; ret = -EIO; - apsize = hpte_page_size(vpte, guest_rpte); + apsize = kvmppc_actual_pgsz(vpte, guest_rpte); if (!apsize) goto out; @@ -1292,7 +1289,7 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize, rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; lock_rmap(rmapp); - kvmppc_unmap_hpte(kvm, idx, rmapp, gfn); + kvmppc_unmap_hpte(kvm, idx, memslot, rmapp, gfn); unlock_rmap(rmapp); } @@ -1310,8 +1307,8 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize, goto out; rpte = be64_to_cpu(hptep[1]); - psize = hpte_base_page_size(vpte, rpte); - avpn = HPTE_V_AVPN_VAL(vpte) & ~((psize - 1) >> 23); + pshift = kvmppc_hpte_base_page_shift(vpte, rpte); + avpn = HPTE_V_AVPN_VAL(vpte) & ~(((1ul << pshift) - 1) >> 23); pteg = idx / HPTES_PER_GROUP; if (vpte & HPTE_V_SECONDARY) pteg = ~pteg; @@ -1323,20 +1320,20 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize, offset = (avpn & 0x1f) << 23; vsid = avpn >> 5; /* We can find more bits from the pteg value */ - if (psize < (1ULL << 23)) - offset |= ((vsid ^ pteg) & old_hash_mask) * psize; + if (pshift < 23) + offset |= ((vsid ^ pteg) & old_hash_mask) << pshift; - hash = vsid ^ (offset / psize); + hash = vsid ^ (offset >> pshift); } else { unsigned long offset, vsid; /* We only have 40 - 23 bits of seg_off in avpn */ offset = (avpn & 0x1ffff) << 23; vsid = avpn >> 17; - if (psize < (1ULL << 23)) - offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask) * psize; + if (pshift < 23) + offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask) << pshift; - hash = vsid ^ (vsid << 25) ^ (offset / psize); + hash = vsid ^ (vsid << 25) ^ (offset >> pshift); } new_pteg = hash & new_hash_mask; @@ -1424,16 +1421,20 @@ static void resize_hpt_pivot(struct kvm_resize_hpt *resize) static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize) { - BUG_ON(kvm->arch.resize_hpt != resize); + if (WARN_ON(!mutex_is_locked(&kvm->lock))) + return; if (!resize) return; - if (resize->hpt.virt) - kvmppc_free_hpt(&resize->hpt); + if (resize->error != -EBUSY) { + if (resize->hpt.virt) + kvmppc_free_hpt(&resize->hpt); + kfree(resize); + } - kvm->arch.resize_hpt = NULL; - kfree(resize); + if (kvm->arch.resize_hpt == resize) + kvm->arch.resize_hpt = NULL; } static void resize_hpt_prepare_work(struct work_struct *work) @@ -1442,17 +1443,41 @@ static void resize_hpt_prepare_work(struct work_struct *work) struct kvm_resize_hpt, work); struct kvm *kvm = resize->kvm; - int err; + int err = 0; - resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", - resize->order); - - err = resize_hpt_allocate(resize); + if (WARN_ON(resize->error != -EBUSY)) + return; mutex_lock(&kvm->lock); + /* Request is still current? */ + if (kvm->arch.resize_hpt == resize) { + /* We may request large allocations here: + * do not sleep with kvm->lock held for a while. + */ + mutex_unlock(&kvm->lock); + + resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", + resize->order); + + err = resize_hpt_allocate(resize); + + /* We have strict assumption about -EBUSY + * when preparing for HPT resize. + */ + if (WARN_ON(err == -EBUSY)) + err = -EINPROGRESS; + + mutex_lock(&kvm->lock); + /* It is possible that kvm->arch.resize_hpt != resize + * after we grab kvm->lock again. + */ + } + resize->error = err; - resize->prepare_done = true; + + if (kvm->arch.resize_hpt != resize) + resize_hpt_release(kvm, resize); mutex_unlock(&kvm->lock); } @@ -1465,7 +1490,7 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, struct kvm_resize_hpt *resize; int ret; - if (flags != 0) + if (flags != 0 || kvm_is_radix(kvm)) return -EINVAL; if (shift && ((shift < 18) || (shift > 46))) @@ -1477,14 +1502,12 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, if (resize) { if (resize->order == shift) { - /* Suitable resize in progress */ - if (resize->prepare_done) { - ret = resize->error; - if (ret != 0) - resize_hpt_release(kvm, resize); - } else { + /* Suitable resize in progress? */ + ret = resize->error; + if (ret == -EBUSY) ret = 100; /* estimated time in ms */ - } + else if (ret) + resize_hpt_release(kvm, resize); goto out; } @@ -1504,6 +1527,8 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, ret = -ENOMEM; goto out; } + + resize->error = -EBUSY; resize->order = shift; resize->kvm = kvm; INIT_WORK(&resize->work, resize_hpt_prepare_work); @@ -1531,7 +1556,7 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, struct kvm_resize_hpt *resize; long ret; - if (flags != 0) + if (flags != 0 || kvm_is_radix(kvm)) return -EINVAL; if (shift && ((shift < 18) || (shift > 46))) @@ -1543,38 +1568,34 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, /* This shouldn't be possible */ ret = -EIO; - if (WARN_ON(!kvm->arch.hpte_setup_done)) + if (WARN_ON(!kvm->arch.mmu_ready)) goto out_no_hpt; /* Stop VCPUs from running while we mess with the HPT */ - kvm->arch.hpte_setup_done = 0; + kvm->arch.mmu_ready = 0; smp_mb(); /* Boot all CPUs out of the guest so they re-read - * hpte_setup_done */ + * mmu_ready */ on_each_cpu(resize_hpt_boot_vcpu, NULL, 1); ret = -ENXIO; if (!resize || (resize->order != shift)) goto out; - ret = -EBUSY; - if (!resize->prepare_done) - goto out; - ret = resize->error; - if (ret != 0) + if (ret) goto out; ret = resize_hpt_rehash(resize); - if (ret != 0) + if (ret) goto out; resize_hpt_pivot(resize); out: /* Let VCPUs run again */ - kvm->arch.hpte_setup_done = 1; + kvm->arch.mmu_ready = 1; smp_mb(); out_no_hpt: resize_hpt_release(kvm, resize); @@ -1717,6 +1738,8 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, if (!access_ok(VERIFY_WRITE, buf, count)) return -EFAULT; + if (kvm_is_radix(kvm)) + return 0; first_pass = ctx->first_pass; flags = ctx->flags; @@ -1810,20 +1833,23 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, unsigned long tmp[2]; ssize_t nb; long int err, ret; - int hpte_setup; + int mmu_ready; + int pshift; if (!access_ok(VERIFY_READ, buf, count)) return -EFAULT; + if (kvm_is_radix(kvm)) + return -EINVAL; /* lock out vcpus from running while we're doing this */ mutex_lock(&kvm->lock); - hpte_setup = kvm->arch.hpte_setup_done; - if (hpte_setup) { - kvm->arch.hpte_setup_done = 0; /* temporarily */ - /* order hpte_setup_done vs. vcpus_running */ + mmu_ready = kvm->arch.mmu_ready; + if (mmu_ready) { + kvm->arch.mmu_ready = 0; /* temporarily */ + /* order mmu_ready vs. vcpus_running */ smp_mb(); if (atomic_read(&kvm->arch.vcpus_running)) { - kvm->arch.hpte_setup_done = 1; + kvm->arch.mmu_ready = 1; mutex_unlock(&kvm->lock); return -EBUSY; } @@ -1863,6 +1889,9 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, err = -EINVAL; if (!(v & HPTE_V_VALID)) goto out; + pshift = kvmppc_hpte_base_page_shift(v, r); + if (pshift <= 0) + goto out; lbuf += 2; nb += HPTE_SIZE; @@ -1876,16 +1905,20 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, "r=%lx\n", ret, i, v, r); goto out; } - if (!hpte_setup && is_vrma_hpte(v)) { - unsigned long psize = hpte_base_page_size(v, r); - unsigned long senc = slb_pgsize_encoding(psize); - unsigned long lpcr; + if (!mmu_ready && is_vrma_hpte(v)) { + unsigned long senc, lpcr; + senc = slb_pgsize_encoding(1ul << pshift); kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | (VRMA_VSID << SLB_VSID_SHIFT_1T); - lpcr = senc << (LPCR_VRMASD_SH - 4); - kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); - hpte_setup = 1; + if (!cpu_has_feature(CPU_FTR_ARCH_300)) { + lpcr = senc << (LPCR_VRMASD_SH - 4); + kvmppc_update_lpcr(kvm, lpcr, + LPCR_VRMASD); + } else { + kvmppc_setup_partition_table(kvm); + } + mmu_ready = 1; } ++i; hptp += 2; @@ -1901,9 +1934,9 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, } out: - /* Order HPTE updates vs. hpte_setup_done */ + /* Order HPTE updates vs. mmu_ready */ smp_wmb(); - kvm->arch.hpte_setup_done = hpte_setup; + kvm->arch.mmu_ready = mmu_ready; mutex_unlock(&kvm->lock); if (err) @@ -2012,6 +2045,10 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf, struct kvm *kvm; __be64 *hptp; + kvm = p->kvm; + if (kvm_is_radix(kvm)) + return 0; + ret = mutex_lock_interruptible(&p->mutex); if (ret) return ret; @@ -2034,7 +2071,6 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf, } } - kvm = p->kvm; i = p->hpt_index; hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt); @@ -2109,10 +2145,7 @@ void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) vcpu->arch.slb_nr = 32; /* POWER7/POWER8 */ - if (kvm_is_radix(vcpu->kvm)) - mmu->xlate = kvmppc_mmu_radix_xlate; - else - mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate; + mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate; mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr; vcpu->arch.hflags |= BOOK3S_HFLAG_SLB; diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index c5d7435455f1..58618f644c56 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -474,26 +474,6 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, return ret; } -static void mark_pages_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn, unsigned int order) -{ - unsigned long i, limit; - unsigned long *dp; - - if (!memslot->dirty_bitmap) - return; - limit = 1ul << order; - if (limit < BITS_PER_LONG) { - for (i = 0; i < limit; ++i) - mark_page_dirty(kvm, gfn + i); - return; - } - dp = memslot->dirty_bitmap + (gfn - memslot->base_gfn); - limit /= BITS_PER_LONG; - for (i = 0; i < limit; ++i) - *dp++ = ~0ul; -} - /* Called with kvm->lock held */ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) @@ -508,12 +488,11 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0, gpa, shift); kvmppc_radix_tlbie_page(kvm, gpa, shift); - if (old & _PAGE_DIRTY) { - if (!shift) - mark_page_dirty(kvm, gfn); - else - mark_pages_dirty(kvm, memslot, - gfn, shift - PAGE_SHIFT); + if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) { + unsigned long npages = 1; + if (shift) + npages = 1ul << (shift - PAGE_SHIFT); + kvmppc_update_dirty_map(memslot, gfn, npages); } } return 0; @@ -579,20 +558,8 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long *map) { unsigned long i, j; - unsigned long n, *p; int npages; - /* - * Radix accumulates dirty bits in the first half of the - * memslot's dirty_bitmap area, for when pages are paged - * out or modified by the host directly. Pick up these - * bits and add them to the map. - */ - n = kvm_dirty_bitmap_bytes(memslot) / sizeof(long); - p = memslot->dirty_bitmap; - for (i = 0; i < n; ++i) - map[i] |= xchg(&p[i], 0); - for (i = 0; i < memslot->npages; i = j) { npages = kvm_radix_test_clear_dirty(kvm, memslot, i); @@ -604,9 +571,10 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, * real address, if npages > 1 we can skip to i + npages. */ j = i + 1; - if (npages) - for (j = i; npages; ++j, --npages) - __set_bit_le(j, map); + if (npages) { + set_dirty_bits(map, i, npages); + i = j + npages; + } } return 0; } @@ -694,6 +662,7 @@ void kvmppc_free_radix(struct kvm *kvm) pgd_clear(pgd); } pgd_free(kvm->mm, kvm->arch.pgtable); + kvm->arch.pgtable = NULL; } static void pte_ctor(void *addr) diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S index 3589c4e3d49b..688722acd692 100644 --- a/arch/powerpc/kvm/book3s_64_slb.S +++ b/arch/powerpc/kvm/book3s_64_slb.S @@ -113,7 +113,7 @@ slb_do_enter: /* Remove all SLB entries that are in use. */ - li r0, r0 + li r0, 0 slbmte r0, r0 slbia diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 8d43cf205d34..2d46037ce936 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -19,6 +19,7 @@ */ #include <linux/kvm_host.h> +#include <linux/kernel.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/preempt.h> @@ -47,6 +48,7 @@ #include <asm/reg.h> #include <asm/ppc-opcode.h> +#include <asm/asm-prototypes.h> #include <asm/disassemble.h> #include <asm/cputable.h> #include <asm/cacheflush.h> @@ -97,6 +99,10 @@ static int target_smt_mode; module_param(target_smt_mode, int, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)"); +static bool indep_threads_mode = true; +module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)"); + #ifdef CONFIG_KVM_XICS static struct kernel_param_ops module_param_ops = { .set = param_set_int, @@ -1089,9 +1095,10 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, vcpu->stat.ext_intr_exits++; r = RESUME_GUEST; break; - /* HMI is hypervisor interrupt and host has handled it. Resume guest.*/ + /* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/ case BOOK3S_INTERRUPT_HMI: case BOOK3S_INTERRUPT_PERFMON: + case BOOK3S_INTERRUPT_SYSTEM_RESET: r = RESUME_GUEST; break; case BOOK3S_INTERRUPT_MACHINE_CHECK: @@ -1732,9 +1739,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, * MMU mode (radix or HPT), unfortunately, but since we only support * HPT guests on a HPT host so far, that isn't an impediment yet. */ -static int threads_per_vcore(void) +static int threads_per_vcore(struct kvm *kvm) { - if (cpu_has_feature(CPU_FTR_ARCH_300)) + if (kvm->arch.threads_indep) return 1; return threads_per_subcore; } @@ -1772,7 +1779,7 @@ static struct debugfs_timings_element { {"cede", offsetof(struct kvm_vcpu, arch.cede_time)}, }; -#define N_TIMINGS (sizeof(timings) / sizeof(timings[0])) +#define N_TIMINGS (ARRAY_SIZE(timings)) struct debugfs_timings_state { struct kvm_vcpu *vcpu; @@ -2117,15 +2124,6 @@ static int kvmppc_grab_hwthread(int cpu) struct paca_struct *tpaca; long timeout = 10000; - /* - * ISA v3.0 idle routines do not set hwthread_state or test - * hwthread_req, so they can not grab idle threads. - */ - if (cpu_has_feature(CPU_FTR_ARCH_300)) { - WARN(1, "KVM: can not control sibling threads\n"); - return -EBUSY; - } - tpaca = &paca[cpu]; /* Ensure the thread won't go into the kernel if it wakes */ @@ -2160,12 +2158,10 @@ static void kvmppc_release_hwthread(int cpu) struct paca_struct *tpaca; tpaca = &paca[cpu]; + tpaca->kvm_hstate.hwthread_req = 0; tpaca->kvm_hstate.kvm_vcpu = NULL; tpaca->kvm_hstate.kvm_vcore = NULL; tpaca->kvm_hstate.kvm_split_mode = NULL; - if (!cpu_has_feature(CPU_FTR_ARCH_300)) - tpaca->kvm_hstate.hwthread_req = 0; - } static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) @@ -2237,11 +2233,10 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) kvmppc_ipi_thread(cpu); } -static void kvmppc_wait_for_nap(void) +static void kvmppc_wait_for_nap(int n_threads) { int cpu = smp_processor_id(); int i, loops; - int n_threads = threads_per_vcore(); if (n_threads <= 1) return; @@ -2328,7 +2323,7 @@ static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc) vc->vcore_state = VCORE_PREEMPT; vc->pcpu = smp_processor_id(); - if (vc->num_threads < threads_per_vcore()) { + if (vc->num_threads < threads_per_vcore(vc->kvm)) { spin_lock(&lp->lock); list_add_tail(&vc->preempt_list, &lp->list); spin_unlock(&lp->lock); @@ -2366,7 +2361,7 @@ struct core_info { /* * This mapping means subcores 0 and 1 can use threads 0-3 and 4-7 - * respectively in 2-way micro-threading (split-core) mode. + * respectively in 2-way micro-threading (split-core) mode on POWER8. */ static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 }; @@ -2382,7 +2377,14 @@ static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc) static bool subcore_config_ok(int n_subcores, int n_threads) { - /* Can only dynamically split if unsplit to begin with */ + /* + * POWER9 "SMT4" cores are permanently in what is effectively a 4-way split-core + * mode, with one thread per subcore. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + return n_subcores <= 4 && n_threads == 1; + + /* On POWER8, can only dynamically split if unsplit to begin with */ if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS) return false; if (n_subcores > MAX_SUBCORES) @@ -2413,6 +2415,11 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip) if (!cpu_has_feature(CPU_FTR_ARCH_207S)) return false; + /* POWER9 currently requires all threads to be in the same MMU mode */ + if (cpu_has_feature(CPU_FTR_ARCH_300) && + kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm)) + return false; + if (n_threads < cip->max_subcore_threads) n_threads = cip->max_subcore_threads; if (!subcore_config_ok(cip->n_subcores + 1, n_threads)) @@ -2615,6 +2622,9 @@ static void set_irq_happened(int trap) case BOOK3S_INTERRUPT_HMI: local_paca->irq_happened |= PACA_IRQ_HMI; break; + case BOOK3S_INTERRUPT_SYSTEM_RESET: + replay_system_reset(); + break; } } @@ -2638,6 +2648,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) int target_threads; int controlled_threads; int trap; + bool is_power8; + bool hpt_on_radix; /* * Remove from the list any threads that have a signal pending @@ -2660,15 +2672,19 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) * the number of threads per subcore, except on POWER9, * where it's 1 because the threads are (mostly) independent. */ - controlled_threads = threads_per_vcore(); + controlled_threads = threads_per_vcore(vc->kvm); /* * Make sure we are running on primary threads, and that secondary * threads are offline. Also check if the number of threads in this * guest are greater than the current system threads per guest. + * On POWER9, we need to be not in independent-threads mode if + * this is a HPT guest on a radix host. */ - if ((controlled_threads > 1) && - ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) { + hpt_on_radix = radix_enabled() && !kvm_is_radix(vc->kvm); + if (((controlled_threads > 1) && + ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) || + (hpt_on_radix && vc->kvm->arch.threads_indep)) { for_each_runnable_thread(i, vcpu, vc) { vcpu->arch.ret = -EBUSY; kvmppc_remove_runnable(vc, vcpu); @@ -2705,14 +2721,13 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) * Hard-disable interrupts, and check resched flag and signals. * If we need to reschedule or deliver a signal, clean up * and return without going into the guest(s). - * If the hpte_setup_done flag has been cleared, don't go into the + * If the mmu_ready flag has been cleared, don't go into the * guest because that means a HPT resize operation is in progress. */ local_irq_disable(); hard_irq_disable(); if (lazy_irq_pending() || need_resched() || - recheck_signals(&core_info) || - (!kvm_is_radix(vc->kvm) && !vc->kvm->arch.hpte_setup_done)) { + recheck_signals(&core_info) || !vc->kvm->arch.mmu_ready) { local_irq_enable(); vc->vcore_state = VCORE_INACTIVE; /* Unlock all except the primary vcore */ @@ -2734,32 +2749,51 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) cmd_bit = stat_bit = 0; split = core_info.n_subcores; sip = NULL; - if (split > 1) { - /* threads_per_subcore must be MAX_SMT_THREADS (8) here */ - if (split == 2 && (dynamic_mt_modes & 2)) { - cmd_bit = HID0_POWER8_1TO2LPAR; - stat_bit = HID0_POWER8_2LPARMODE; - } else { - split = 4; - cmd_bit = HID0_POWER8_1TO4LPAR; - stat_bit = HID0_POWER8_4LPARMODE; - } - subcore_size = MAX_SMT_THREADS / split; + is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S) + && !cpu_has_feature(CPU_FTR_ARCH_300); + + if (split > 1 || hpt_on_radix) { sip = &split_info; memset(&split_info, 0, sizeof(split_info)); - split_info.rpr = mfspr(SPRN_RPR); - split_info.pmmar = mfspr(SPRN_PMMAR); - split_info.ldbar = mfspr(SPRN_LDBAR); - split_info.subcore_size = subcore_size; for (sub = 0; sub < core_info.n_subcores; ++sub) split_info.vc[sub] = core_info.vc[sub]; + + if (is_power8) { + if (split == 2 && (dynamic_mt_modes & 2)) { + cmd_bit = HID0_POWER8_1TO2LPAR; + stat_bit = HID0_POWER8_2LPARMODE; + } else { + split = 4; + cmd_bit = HID0_POWER8_1TO4LPAR; + stat_bit = HID0_POWER8_4LPARMODE; + } + subcore_size = MAX_SMT_THREADS / split; + split_info.rpr = mfspr(SPRN_RPR); + split_info.pmmar = mfspr(SPRN_PMMAR); + split_info.ldbar = mfspr(SPRN_LDBAR); + split_info.subcore_size = subcore_size; + } else { + split_info.subcore_size = 1; + if (hpt_on_radix) { + /* Use the split_info for LPCR/LPIDR changes */ + split_info.lpcr_req = vc->lpcr; + split_info.lpidr_req = vc->kvm->arch.lpid; + split_info.host_lpcr = vc->kvm->arch.host_lpcr; + split_info.do_set = 1; + } + } + /* order writes to split_info before kvm_split_mode pointer */ smp_wmb(); } - for (thr = 0; thr < controlled_threads; ++thr) + + for (thr = 0; thr < controlled_threads; ++thr) { + paca[pcpu + thr].kvm_hstate.tid = thr; + paca[pcpu + thr].kvm_hstate.napping = 0; paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip; + } - /* Initiate micro-threading (split-core) if required */ + /* Initiate micro-threading (split-core) on POWER8 if required */ if (cmd_bit) { unsigned long hid0 = mfspr(SPRN_HID0); @@ -2778,7 +2812,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) /* Start all the threads */ active = 0; for (sub = 0; sub < core_info.n_subcores; ++sub) { - thr = subcore_thread_map[sub]; + thr = is_power8 ? subcore_thread_map[sub] : sub; thr0_done = false; active |= 1 << thr; pvc = core_info.vc[sub]; @@ -2805,18 +2839,20 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) * the vcore pointer in the PACA of the secondaries. */ smp_mb(); - if (cmd_bit) - split_info.do_nap = 1; /* ask secondaries to nap when done */ /* * When doing micro-threading, poke the inactive threads as well. * This gets them to the nap instruction after kvm_do_nap, * which reduces the time taken to unsplit later. + * For POWER9 HPT guest on radix host, we need all the secondary + * threads woken up so they can do the LPCR/LPIDR change. */ - if (split > 1) + if (cmd_bit || hpt_on_radix) { + split_info.do_nap = 1; /* ask secondaries to nap when done */ for (thr = 1; thr < threads_per_subcore; ++thr) if (!(active & (1 << thr))) kvmppc_ipi_thread(pcpu + thr); + } vc->vcore_state = VCORE_RUNNING; preempt_disable(); @@ -2850,10 +2886,10 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) vc->vcore_state = VCORE_EXITING; /* wait for secondary threads to finish writing their state to memory */ - kvmppc_wait_for_nap(); + kvmppc_wait_for_nap(controlled_threads); /* Return to whole-core mode if we split the core earlier */ - if (split > 1) { + if (cmd_bit) { unsigned long hid0 = mfspr(SPRN_HID0); unsigned long loops = 0; @@ -2869,8 +2905,17 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) cpu_relax(); ++loops; } - split_info.do_nap = 0; + } else if (hpt_on_radix) { + /* Wait for all threads to have seen final sync */ + for (thr = 1; thr < controlled_threads; ++thr) { + while (paca[pcpu + thr].kvm_hstate.kvm_split_mode) { + HMT_low(); + barrier(); + } + HMT_medium(); + } } + split_info.do_nap = 0; kvmppc_set_host_core(pcpu); @@ -3079,6 +3124,25 @@ out: trace_kvmppc_vcore_wakeup(do_sleep, block_ns); } +static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu) +{ + int r = 0; + struct kvm *kvm = vcpu->kvm; + + mutex_lock(&kvm->lock); + if (!kvm->arch.mmu_ready) { + if (!kvm_is_radix(kvm)) + r = kvmppc_hv_setup_htab_rma(vcpu); + if (!r) { + if (cpu_has_feature(CPU_FTR_ARCH_300)) + kvmppc_setup_partition_table(kvm); + kvm->arch.mmu_ready = 1; + } + } + mutex_unlock(&kvm->lock); + return r; +} + static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { int n_ceded, i, r; @@ -3135,15 +3199,15 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && !signal_pending(current)) { - /* See if the HPT and VRMA are ready to go */ - if (!kvm_is_radix(vcpu->kvm) && - !vcpu->kvm->arch.hpte_setup_done) { + /* See if the MMU is ready to go */ + if (!vcpu->kvm->arch.mmu_ready) { spin_unlock(&vc->lock); - r = kvmppc_hv_setup_htab_rma(vcpu); + r = kvmhv_setup_mmu(vcpu); spin_lock(&vc->lock); if (r) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; - kvm_run->fail_entry.hardware_entry_failure_reason = 0; + kvm_run->fail_entry. + hardware_entry_failure_reason = 0; vcpu->arch.ret = r; break; } @@ -3225,6 +3289,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) unsigned long ebb_regs[3] = {}; /* shut up GCC */ unsigned long user_tar = 0; unsigned int user_vrsave; + struct kvm *kvm; if (!vcpu->arch.sane) { run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -3262,8 +3327,9 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) return -EINTR; } - atomic_inc(&vcpu->kvm->arch.vcpus_running); - /* Order vcpus_running vs. hpte_setup_done, see kvmppc_alloc_reset_hpt */ + kvm = vcpu->kvm; + atomic_inc(&kvm->arch.vcpus_running); + /* Order vcpus_running vs. mmu_ready, see kvmppc_alloc_reset_hpt */ smp_mb(); flush_all_to_thread(current); @@ -3291,10 +3357,10 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) trace_kvm_hcall_exit(vcpu, r); kvmppc_core_prepare_to_enter(vcpu); } else if (r == RESUME_PAGE_FAULT) { - srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + srcu_idx = srcu_read_lock(&kvm->srcu); r = kvmppc_book3s_hv_page_fault(run, vcpu, vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); - srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); + srcu_read_unlock(&kvm->srcu, srcu_idx); } else if (r == RESUME_PASSTHROUGH) { if (WARN_ON(xive_enabled())) r = H_SUCCESS; @@ -3314,27 +3380,26 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) mtspr(SPRN_VRSAVE, user_vrsave); vcpu->arch.state = KVMPPC_VCPU_NOTREADY; - atomic_dec(&vcpu->kvm->arch.vcpus_running); + atomic_dec(&kvm->arch.vcpus_running); return r; } static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps, - int linux_psize) + int shift, int sllp) { - struct mmu_psize_def *def = &mmu_psize_defs[linux_psize]; - - if (!def->shift) - return; - (*sps)->page_shift = def->shift; - (*sps)->slb_enc = def->sllp; - (*sps)->enc[0].page_shift = def->shift; - (*sps)->enc[0].pte_enc = def->penc[linux_psize]; + (*sps)->page_shift = shift; + (*sps)->slb_enc = sllp; + (*sps)->enc[0].page_shift = shift; + (*sps)->enc[0].pte_enc = kvmppc_pgsize_lp_encoding(shift, shift); /* - * Add 16MB MPSS support if host supports it + * Add 16MB MPSS support (may get filtered out by userspace) */ - if (linux_psize != MMU_PAGE_16M && def->penc[MMU_PAGE_16M] != -1) { - (*sps)->enc[1].page_shift = 24; - (*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M]; + if (shift != 24) { + int penc = kvmppc_pgsize_lp_encoding(shift, 24); + if (penc != -1) { + (*sps)->enc[1].page_shift = 24; + (*sps)->enc[1].pte_enc = penc; + } } (*sps)++; } @@ -3345,13 +3410,6 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm, struct kvm_ppc_one_seg_page_size *sps; /* - * Since we don't yet support HPT guests on a radix host, - * return an error if the host uses radix. - */ - if (radix_enabled()) - return -EINVAL; - - /* * POWER7, POWER8 and POWER9 all support 32 storage keys for data. * POWER7 doesn't support keys for instruction accesses, * POWER8 and POWER9 do. @@ -3359,16 +3417,15 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm, info->data_keys = 32; info->instr_keys = cpu_has_feature(CPU_FTR_ARCH_207S) ? 32 : 0; - info->flags = KVM_PPC_PAGE_SIZES_REAL; - if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) - info->flags |= KVM_PPC_1T_SEGMENTS; - info->slb_size = mmu_slb_size; + /* POWER7, 8 and 9 all have 1T segments and 32-entry SLB */ + info->flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS; + info->slb_size = 32; /* We only support these sizes for now, and no muti-size segments */ sps = &info->sps[0]; - kvmppc_add_seg_page_size(&sps, MMU_PAGE_4K); - kvmppc_add_seg_page_size(&sps, MMU_PAGE_64K); - kvmppc_add_seg_page_size(&sps, MMU_PAGE_16M); + kvmppc_add_seg_page_size(&sps, 12, 0); + kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01); + kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L); return 0; } @@ -3383,7 +3440,7 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, struct kvm_memory_slot *memslot; int i, r; unsigned long n; - unsigned long *buf; + unsigned long *buf, *p; struct kvm_vcpu *vcpu; mutex_lock(&kvm->slots_lock); @@ -3399,8 +3456,8 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, goto out; /* - * Use second half of bitmap area because radix accumulates - * bits in the first half. + * Use second half of bitmap area because both HPT and radix + * accumulate bits in the first half. */ n = kvm_dirty_bitmap_bytes(memslot); buf = memslot->dirty_bitmap + n / sizeof(long); @@ -3413,6 +3470,16 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, if (r) goto out; + /* + * We accumulate dirty bits in the first half of the + * memslot's dirty_bitmap area, for when pages are paged + * out or modified by the host directly. Pick up these + * bits and add them to the map. + */ + p = memslot->dirty_bitmap; + for (i = 0; i < n / sizeof(long); ++i) + buf[i] |= xchg(&p[i], 0); + /* Harvest dirty bits from VPA and DTL updates */ /* Note: we never modify the SLB shadow buffer areas */ kvm_for_each_vcpu(i, vcpu, kvm) { @@ -3444,15 +3511,6 @@ static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free, static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot, unsigned long npages) { - /* - * For now, if radix_enabled() then we only support radix guests, - * and in that case we don't need the rmap array. - */ - if (radix_enabled()) { - slot->arch.rmap = NULL; - return 0; - } - slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); if (!slot->arch.rmap) return -ENOMEM; @@ -3473,8 +3531,6 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, const struct kvm_memory_slot *new) { unsigned long npages = mem->memory_size >> PAGE_SHIFT; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; /* * If we are making a new memslot, it might make @@ -3484,18 +3540,6 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, */ if (npages) atomic64_inc(&kvm->arch.mmio_update); - - if (npages && old->npages && !kvm_is_radix(kvm)) { - /* - * If modifying a memslot, reset all the rmap dirty bits. - * If this is a new memslot, we don't need to do anything - * since the rmap array starts out as all zeroes, - * i.e. no pages are dirty. - */ - slots = kvm_memslots(kvm); - memslot = id_to_memslot(slots, mem->slot); - kvmppc_hv_get_dirty_log_hpt(kvm, memslot, NULL); - } } /* @@ -3529,7 +3573,7 @@ static void kvmppc_mmu_destroy_hv(struct kvm_vcpu *vcpu) return; } -static void kvmppc_setup_partition_table(struct kvm *kvm) +void kvmppc_setup_partition_table(struct kvm *kvm) { unsigned long dw0, dw1; @@ -3551,6 +3595,10 @@ static void kvmppc_setup_partition_table(struct kvm *kvm) mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1); } +/* + * Set up HPT (hashed page table) and RMA (real-mode area). + * Must be called with kvm->lock held. + */ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) { int err = 0; @@ -3562,10 +3610,6 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) unsigned long psize, porder; int srcu_idx; - mutex_lock(&kvm->lock); - if (kvm->arch.hpte_setup_done) - goto out; /* another vcpu beat us to it */ - /* Allocate hashed page table (if not done already) and reset it */ if (!kvm->arch.hpt.virt) { int order = KVM_DEFAULT_HPT_ORDER; @@ -3624,18 +3668,14 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) /* the -4 is to account for senc values starting at 0x10 */ lpcr = senc << (LPCR_VRMASD_SH - 4); kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); - } else { - kvmppc_setup_partition_table(kvm); } - /* Order updates to kvm->arch.lpcr etc. vs. hpte_setup_done */ + /* Order updates to kvm->arch.lpcr etc. vs. mmu_ready */ smp_wmb(); - kvm->arch.hpte_setup_done = 1; err = 0; out_srcu: srcu_read_unlock(&kvm->srcu, srcu_idx); out: - mutex_unlock(&kvm->lock); return err; up_out: @@ -3643,6 +3683,34 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) goto out_srcu; } +/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */ +int kvmppc_switch_mmu_to_hpt(struct kvm *kvm) +{ + kvmppc_free_radix(kvm); + kvmppc_update_lpcr(kvm, LPCR_VPM1, + LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR); + kvmppc_rmap_reset(kvm); + kvm->arch.radix = 0; + kvm->arch.process_table = 0; + return 0; +} + +/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */ +int kvmppc_switch_mmu_to_radix(struct kvm *kvm) +{ + int err; + + err = kvmppc_init_vm_radix(kvm); + if (err) + return err; + + kvmppc_free_hpt(&kvm->arch.hpt); + kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR, + LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR); + kvm->arch.radix = 1; + return 0; +} + #ifdef CONFIG_KVM_XICS /* * Allocate a per-core structure for managing state about which cores are @@ -3786,10 +3854,11 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) } /* - * For now, if the host uses radix, the guest must be radix. + * If the host uses radix, the guest starts out as radix. */ if (radix_enabled()) { kvm->arch.radix = 1; + kvm->arch.mmu_ready = 1; lpcr &= ~LPCR_VPM1; lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR; ret = kvmppc_init_vm_radix(kvm); @@ -3809,7 +3878,7 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) * Work out how many sets the TLB has, for the use of * the TLB invalidation loop in book3s_hv_rmhandlers.S. */ - if (kvm_is_radix(kvm)) + if (radix_enabled()) kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX; /* 128 */ else if (cpu_has_feature(CPU_FTR_ARCH_300)) kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH; /* 256 */ @@ -3821,10 +3890,12 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) /* * Track that we now have a HV mode VM active. This blocks secondary * CPU threads from coming online. - * On POWER9, we only need to do this for HPT guests on a radix - * host, which is not yet supported. + * On POWER9, we only need to do this if the "indep_threads_mode" + * module parameter has been set to N. */ - if (!cpu_has_feature(CPU_FTR_ARCH_300)) + if (cpu_has_feature(CPU_FTR_ARCH_300)) + kvm->arch.threads_indep = indep_threads_mode; + if (!kvm->arch.threads_indep) kvm_hv_vm_activated(); /* @@ -3864,7 +3935,7 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) { debugfs_remove_recursive(kvm->arch.debugfs_dir); - if (!cpu_has_feature(CPU_FTR_ARCH_300)) + if (!kvm->arch.threads_indep) kvm_hv_vm_deactivated(); kvmppc_free_vcores(kvm); @@ -4199,6 +4270,7 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) { unsigned long lpcr; int radix; + int err; /* If not on a POWER9, reject it */ if (!cpu_has_feature(CPU_FTR_ARCH_300)) @@ -4208,12 +4280,8 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE)) return -EINVAL; - /* We can't change a guest to/from radix yet */ - radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX); - if (radix != kvm_is_radix(kvm)) - return -EINVAL; - /* GR (guest radix) bit in process_table field must match */ + radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX); if (!!(cfg->process_table & PATB_GR) != radix) return -EINVAL; @@ -4221,15 +4289,40 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) if ((cfg->process_table & PRTS_MASK) > 24) return -EINVAL; + /* We can change a guest to/from radix now, if the host is radix */ + if (radix && !radix_enabled()) + return -EINVAL; + mutex_lock(&kvm->lock); + if (radix != kvm_is_radix(kvm)) { + if (kvm->arch.mmu_ready) { + kvm->arch.mmu_ready = 0; + /* order mmu_ready vs. vcpus_running */ + smp_mb(); + if (atomic_read(&kvm->arch.vcpus_running)) { + kvm->arch.mmu_ready = 1; + err = -EBUSY; + goto out_unlock; + } + } + if (radix) + err = kvmppc_switch_mmu_to_radix(kvm); + else + err = kvmppc_switch_mmu_to_hpt(kvm); + if (err) + goto out_unlock; + } + kvm->arch.process_table = cfg->process_table; kvmppc_setup_partition_table(kvm); lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0; kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE); - mutex_unlock(&kvm->lock); + err = 0; - return 0; + out_unlock: + mutex_unlock(&kvm->lock); + return err; } static struct kvmppc_ops kvm_ops_hv = { @@ -4371,4 +4464,3 @@ module_exit(kvmppc_book3s_exit_hv); MODULE_LICENSE("GPL"); MODULE_ALIAS_MISCDEV(KVM_MINOR); MODULE_ALIAS("devname:kvm"); - diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 90644db9d38e..49a2c7825e04 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -278,7 +278,8 @@ void kvmhv_commence_exit(int trap) struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore; int ptid = local_paca->kvm_hstate.ptid; struct kvm_split_mode *sip = local_paca->kvm_hstate.kvm_split_mode; - int me, ee, i; + int me, ee, i, t; + int cpu0; /* Set our bit in the threads-exiting-guest map in the 0xff00 bits of vcore->entry_exit_map */ @@ -320,6 +321,22 @@ void kvmhv_commence_exit(int trap) if ((ee >> 8) == 0) kvmhv_interrupt_vcore(vc, ee); } + + /* + * On POWER9 when running a HPT guest on a radix host (sip != NULL), + * we have to interrupt inactive CPU threads to get them to + * restore the host LPCR value. + */ + if (sip->lpcr_req) { + if (cmpxchg(&sip->do_restore, 0, 1) == 0) { + vc = local_paca->kvm_hstate.kvm_vcore; + cpu0 = vc->pcpu + ptid - local_paca->kvm_hstate.tid; + for (t = 1; t < threads_per_core; ++t) { + if (sip->napped[t]) + kvmhv_rm_send_ipi(cpu0 + t); + } + } + } } struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv; @@ -529,6 +546,8 @@ static inline bool is_rm(void) unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) { + if (!kvmppc_xics_enabled(vcpu)) + return H_TOO_HARD; if (xive_enabled()) { if (is_rm()) return xive_rm_h_xirr(vcpu); @@ -541,6 +560,8 @@ unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu) { + if (!kvmppc_xics_enabled(vcpu)) + return H_TOO_HARD; vcpu->arch.gpr[5] = get_tb(); if (xive_enabled()) { if (is_rm()) @@ -554,6 +575,8 @@ unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu) unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server) { + if (!kvmppc_xics_enabled(vcpu)) + return H_TOO_HARD; if (xive_enabled()) { if (is_rm()) return xive_rm_h_ipoll(vcpu, server); @@ -567,6 +590,8 @@ unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server) int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, unsigned long mfrr) { + if (!kvmppc_xics_enabled(vcpu)) + return H_TOO_HARD; if (xive_enabled()) { if (is_rm()) return xive_rm_h_ipi(vcpu, server, mfrr); @@ -579,6 +604,8 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) { + if (!kvmppc_xics_enabled(vcpu)) + return H_TOO_HARD; if (xive_enabled()) { if (is_rm()) return xive_rm_h_cppr(vcpu, cppr); @@ -591,6 +618,8 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) { + if (!kvmppc_xics_enabled(vcpu)) + return H_TOO_HARD; if (xive_enabled()) { if (is_rm()) return xive_rm_h_eoi(vcpu, xirr); @@ -601,3 +630,89 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) return xics_rm_h_eoi(vcpu, xirr); } #endif /* CONFIG_KVM_XICS */ + +void kvmppc_bad_interrupt(struct pt_regs *regs) +{ + die("Bad interrupt in KVM entry/exit code", regs, SIGABRT); + panic("Bad KVM trap"); +} + +/* + * Functions used to switch LPCR HR and UPRT bits on all threads + * when entering and exiting HPT guests on a radix host. + */ + +#define PHASE_REALMODE 1 /* in real mode */ +#define PHASE_SET_LPCR 2 /* have set LPCR */ +#define PHASE_OUT_OF_GUEST 4 /* have finished executing in guest */ +#define PHASE_RESET_LPCR 8 /* have reset LPCR to host value */ + +#define ALL(p) (((p) << 24) | ((p) << 16) | ((p) << 8) | (p)) + +static void wait_for_sync(struct kvm_split_mode *sip, int phase) +{ + int thr = local_paca->kvm_hstate.tid; + + sip->lpcr_sync.phase[thr] |= phase; + phase = ALL(phase); + while ((sip->lpcr_sync.allphases & phase) != phase) { + HMT_low(); + barrier(); + } + HMT_medium(); +} + +void kvmhv_p9_set_lpcr(struct kvm_split_mode *sip) +{ + unsigned long rb, set; + + /* wait for every other thread to get to real mode */ + wait_for_sync(sip, PHASE_REALMODE); + + /* Set LPCR and LPIDR */ + mtspr(SPRN_LPCR, sip->lpcr_req); + mtspr(SPRN_LPID, sip->lpidr_req); + isync(); + + /* Invalidate the TLB on thread 0 */ + if (local_paca->kvm_hstate.tid == 0) { + sip->do_set = 0; + asm volatile("ptesync" : : : "memory"); + for (set = 0; set < POWER9_TLB_SETS_RADIX; ++set) { + rb = TLBIEL_INVAL_SET_LPID + + (set << TLBIEL_INVAL_SET_SHIFT); + asm volatile(PPC_TLBIEL(%0, %1, 0, 0, 0) : : + "r" (rb), "r" (0)); + } + asm volatile("ptesync" : : : "memory"); + } + + /* indicate that we have done so and wait for others */ + wait_for_sync(sip, PHASE_SET_LPCR); + /* order read of sip->lpcr_sync.allphases vs. sip->do_set */ + smp_rmb(); +} + +/* + * Called when a thread that has been in the guest needs + * to reload the host LPCR value - but only on POWER9 when + * running a HPT guest on a radix host. + */ +void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip) +{ + /* we're out of the guest... */ + wait_for_sync(sip, PHASE_OUT_OF_GUEST); + + mtspr(SPRN_LPID, 0); + mtspr(SPRN_LPCR, sip->host_lpcr); + isync(); + + if (local_paca->kvm_hstate.tid == 0) { + sip->do_restore = 0; + smp_wmb(); /* order store of do_restore vs. phase */ + } + + wait_for_sync(sip, PHASE_RESET_LPCR); + smp_mb(); + local_paca->kvm_hstate.kvm_split_mode = NULL; +} diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 4efe364f1188..26c11f678fbf 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -107,30 +107,50 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, } EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); -/* Update the changed page order field of an rmap entry */ -void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize) +/* Update the dirty bitmap of a memslot */ +void kvmppc_update_dirty_map(struct kvm_memory_slot *memslot, + unsigned long gfn, unsigned long psize) { - unsigned long order; + unsigned long npages; - if (!psize) + if (!psize || !memslot->dirty_bitmap) return; - order = ilog2(psize); - order <<= KVMPPC_RMAP_CHG_SHIFT; - if (order > (*rmap & KVMPPC_RMAP_CHG_ORDER)) - *rmap = (*rmap & ~KVMPPC_RMAP_CHG_ORDER) | order; + npages = (psize + PAGE_SIZE - 1) / PAGE_SIZE; + gfn -= memslot->base_gfn; + set_dirty_bits_atomic(memslot->dirty_bitmap, gfn, npages); +} +EXPORT_SYMBOL_GPL(kvmppc_update_dirty_map); + +static void kvmppc_set_dirty_from_hpte(struct kvm *kvm, + unsigned long hpte_v, unsigned long hpte_gr) +{ + struct kvm_memory_slot *memslot; + unsigned long gfn; + unsigned long psize; + + psize = kvmppc_actual_pgsz(hpte_v, hpte_gr); + gfn = hpte_rpn(hpte_gr, psize); + memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); + if (memslot && memslot->dirty_bitmap) + kvmppc_update_dirty_map(memslot, gfn, psize); } -EXPORT_SYMBOL_GPL(kvmppc_update_rmap_change); /* Returns a pointer to the revmap entry for the page mapped by a HPTE */ static unsigned long *revmap_for_hpte(struct kvm *kvm, unsigned long hpte_v, - unsigned long hpte_gr) + unsigned long hpte_gr, + struct kvm_memory_slot **memslotp, + unsigned long *gfnp) { struct kvm_memory_slot *memslot; unsigned long *rmap; unsigned long gfn; - gfn = hpte_rpn(hpte_gr, hpte_page_size(hpte_v, hpte_gr)); + gfn = hpte_rpn(hpte_gr, kvmppc_actual_pgsz(hpte_v, hpte_gr)); memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); + if (memslotp) + *memslotp = memslot; + if (gfnp) + *gfnp = gfn; if (!memslot) return NULL; @@ -147,10 +167,12 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, unsigned long ptel, head; unsigned long *rmap; unsigned long rcbits; + struct kvm_memory_slot *memslot; + unsigned long gfn; rcbits = hpte_r & (HPTE_R_R | HPTE_R_C); ptel = rev->guest_rpte |= rcbits; - rmap = revmap_for_hpte(kvm, hpte_v, ptel); + rmap = revmap_for_hpte(kvm, hpte_v, ptel, &memslot, &gfn); if (!rmap) return; lock_rmap(rmap); @@ -169,7 +191,8 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, } *rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT; if (rcbits & HPTE_R_C) - kvmppc_update_rmap_change(rmap, hpte_page_size(hpte_v, hpte_r)); + kvmppc_update_dirty_map(memslot, gfn, + kvmppc_actual_pgsz(hpte_v, hpte_r)); unlock_rmap(rmap); } @@ -193,7 +216,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - psize = hpte_page_size(pteh, ptel); + psize = kvmppc_actual_pgsz(pteh, ptel); if (!psize) return H_PARAMETER; writing = hpte_is_writable(ptel); @@ -797,7 +820,7 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags, gr |= r & (HPTE_R_R | HPTE_R_C); if (r & HPTE_R_R) { kvmppc_clear_ref_hpte(kvm, hpte, pte_index); - rmap = revmap_for_hpte(kvm, v, gr); + rmap = revmap_for_hpte(kvm, v, gr, NULL, NULL); if (rmap) { lock_rmap(rmap); *rmap |= KVMPPC_RMAP_REFERENCED; @@ -819,7 +842,6 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags, __be64 *hpte; unsigned long v, r, gr; struct revmap_entry *rev; - unsigned long *rmap; long ret = H_NOT_FOUND; if (kvm_is_radix(kvm)) @@ -848,16 +870,9 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags, r = be64_to_cpu(hpte[1]); gr |= r & (HPTE_R_R | HPTE_R_C); if (r & HPTE_R_C) { - unsigned long psize = hpte_page_size(v, r); hpte[1] = cpu_to_be64(r & ~HPTE_R_C); eieio(); - rmap = revmap_for_hpte(kvm, v, gr); - if (rmap) { - lock_rmap(rmap); - *rmap |= KVMPPC_RMAP_CHANGED; - kvmppc_update_rmap_change(rmap, psize); - unlock_rmap(rmap); - } + kvmppc_set_dirty_from_hpte(kvm, v, gr); } } vcpu->arch.gpr[4] = gr; @@ -1014,7 +1029,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, * Check the HPTE again, including base page size */ if ((v & valid) && (v & mask) == val && - hpte_base_page_size(v, r) == (1ul << pshift)) + kvmppc_hpte_base_page_shift(v, r) == pshift) /* Return with the HPTE still locked */ return (hash << 3) + (i >> 1); diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 42639fba89e8..9c61f736c75b 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -31,6 +31,7 @@ #include <asm/tm.h> #include <asm/opal.h> #include <asm/xive-regs.h> +#include <asm/thread_info.h> /* Sign-extend HDEC if not on POWER9 */ #define EXTEND_HDEC(reg) \ @@ -78,9 +79,22 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline) mtmsrd r0,1 /* clear RI in MSR */ mtsrr0 r5 mtsrr1 r6 - RFI + RFI_TO_KERNEL kvmppc_call_hv_entry: +BEGIN_FTR_SECTION + /* On P9, do LPCR setting, if necessary */ + ld r3, HSTATE_SPLIT_MODE(r13) + cmpdi r3, 0 + beq 46f + lwz r4, KVM_SPLIT_DO_SET(r3) + cmpwi r4, 0 + beq 46f + bl kvmhv_p9_set_lpcr + nop +46: +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + ld r4, HSTATE_KVM_VCPU(r13) bl kvmppc_hv_entry @@ -149,11 +163,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) subf r4, r4, r3 mtspr SPRN_DEC, r4 -BEGIN_FTR_SECTION /* hwthread_req may have got set by cede or no vcpu, so clear it */ li r0, 0 stb r0, HSTATE_HWTHREAD_REQ(r13) -END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) /* * For external interrupts we need to call the Linux @@ -187,7 +199,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) mtmsrd r6, 1 /* Clear RI in MSR */ mtsrr0 r8 mtsrr1 r7 - RFI + RFI_TO_KERNEL /* Virtual-mode return */ .Lvirt_return: @@ -316,7 +328,6 @@ kvm_novcpu_exit: * Relocation is off and most register values are lost. * r13 points to the PACA. * r3 contains the SRR1 wakeup value, SRR1 is trashed. - * This is not used by ISAv3.0B processors. */ .globl kvm_start_guest kvm_start_guest: @@ -390,6 +401,7 @@ kvm_secondary_got_guest: ld r6, HSTATE_SPLIT_MODE(r13) cmpdi r6, 0 beq 63f +BEGIN_FTR_SECTION ld r0, KVM_SPLIT_RPR(r6) mtspr SPRN_RPR, r0 ld r0, KVM_SPLIT_PMMAR(r6) @@ -397,6 +409,15 @@ kvm_secondary_got_guest: ld r0, KVM_SPLIT_LDBAR(r6) mtspr SPRN_LDBAR, r0 isync +FTR_SECTION_ELSE + /* On P9 we use the split_info for coordinating LPCR changes */ + lwz r4, KVM_SPLIT_DO_SET(r6) + cmpwi r4, 0 + beq 63f + mr r3, r6 + bl kvmhv_p9_set_lpcr + nop +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) 63: /* Order load of vcpu after load of vcore */ lwsync @@ -435,9 +456,6 @@ kvm_secondary_got_guest: * While waiting we also need to check if we get given a vcpu to run. */ kvm_no_guest: -BEGIN_FTR_SECTION - twi 31,0,0 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) lbz r3, HSTATE_HWTHREAD_REQ(r13) cmpwi r3, 0 bne 53f @@ -470,6 +488,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) ld r3, HSTATE_SPLIT_MODE(r13) cmpdi r3, 0 beq kvm_no_guest + lwz r0, KVM_SPLIT_DO_SET(r3) + cmpwi r0, 0 + bne kvmhv_do_set + lwz r0, KVM_SPLIT_DO_RESTORE(r3) + cmpwi r0, 0 + bne kvmhv_do_restore lbz r0, KVM_SPLIT_DO_NAP(r3) cmpwi r0, 0 beq kvm_no_guest @@ -482,6 +506,19 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) stb r0, HSTATE_HWTHREAD_STATE(r13) b kvm_no_guest +kvmhv_do_set: + /* Set LPCR, LPIDR etc. on P9 */ + HMT_MEDIUM + bl kvmhv_p9_set_lpcr + nop + b kvm_no_guest + +kvmhv_do_restore: + HMT_MEDIUM + bl kvmhv_p9_restore_lpcr + nop + b kvm_no_guest + /* * Here the primary thread is trying to return the core to * whole-core mode, so we need to nap. @@ -519,8 +556,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) /* Set kvm_split_mode.napped[tid] = 1 */ ld r3, HSTATE_SPLIT_MODE(r13) li r0, 1 - lhz r4, PACAPACAINDEX(r13) - clrldi r4, r4, 61 /* micro-threading => P8 => 8 threads/core */ + lbz r4, HSTATE_TID(r13) addi r4, r4, KVM_SPLIT_NAPPED stbx r0, r3, r4 /* Check the do_nap flag again after setting napped[] */ @@ -1131,8 +1167,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) ld r0, VCPU_GPR(R0)(r4) ld r4, VCPU_GPR(R4)(r4) - - hrfid + HRFI_TO_GUEST b . secondary_too_late: @@ -1917,10 +1952,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 19: lis r8,0x7fff /* MAX_INT@h */ mtspr SPRN_HDEC,r8 -16: ld r8,KVM_HOST_LPCR(r4) +16: +BEGIN_FTR_SECTION + /* On POWER9 with HPT-on-radix we need to wait for all other threads */ + ld r3, HSTATE_SPLIT_MODE(r13) + cmpdi r3, 0 + beq 47f + lwz r8, KVM_SPLIT_DO_RESTORE(r3) + cmpwi r8, 0 + beq 47f + stw r12, STACK_SLOT_TRAP(r1) + bl kvmhv_p9_restore_lpcr + nop + lwz r12, STACK_SLOT_TRAP(r1) + b 48f +47: +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + ld r8,KVM_HOST_LPCR(r4) mtspr SPRN_LPCR,r8 isync - +48: /* load host SLB entries */ BEGIN_MMU_FTR_SECTION b 0f @@ -2546,10 +2597,8 @@ kvm_do_nap: clrrdi r0, r0, 1 mtspr SPRN_CTRLT, r0 -BEGIN_FTR_SECTION li r0,1 stb r0,HSTATE_HWTHREAD_REQ(r13) -END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) mfspr r5,SPRN_LPCR ori r5,r5,LPCR_PECE0 | LPCR_PECE1 BEGIN_FTR_SECTION @@ -3141,10 +3190,139 @@ kvmppc_restore_tm: /* * We come here if we get any exception or interrupt while we are * executing host real mode code while in guest MMU context. - * For now just spin, but we should do something better. + * r12 is (CR << 32) | vector + * r13 points to our PACA + * r12 is saved in HSTATE_SCRATCH0(r13) + * ctr is saved in HSTATE_SCRATCH1(r13) if RELOCATABLE + * r9 is saved in HSTATE_SCRATCH2(r13) + * r13 is saved in HSPRG1 + * cfar is saved in HSTATE_CFAR(r13) + * ppr is saved in HSTATE_PPR(r13) */ kvmppc_bad_host_intr: + /* + * Switch to the emergency stack, but start half-way down in + * case we were already on it. + */ + mr r9, r1 + std r1, PACAR1(r13) + ld r1, PACAEMERGSP(r13) + subi r1, r1, THREAD_SIZE/2 + INT_FRAME_SIZE + std r9, 0(r1) + std r0, GPR0(r1) + std r9, GPR1(r1) + std r2, GPR2(r1) + SAVE_4GPRS(3, r1) + SAVE_2GPRS(7, r1) + srdi r0, r12, 32 + clrldi r12, r12, 32 + std r0, _CCR(r1) + std r12, _TRAP(r1) + andi. r0, r12, 2 + beq 1f + mfspr r3, SPRN_HSRR0 + mfspr r4, SPRN_HSRR1 + mfspr r5, SPRN_HDAR + mfspr r6, SPRN_HDSISR + b 2f +1: mfspr r3, SPRN_SRR0 + mfspr r4, SPRN_SRR1 + mfspr r5, SPRN_DAR + mfspr r6, SPRN_DSISR +2: std r3, _NIP(r1) + std r4, _MSR(r1) + std r5, _DAR(r1) + std r6, _DSISR(r1) + ld r9, HSTATE_SCRATCH2(r13) + ld r12, HSTATE_SCRATCH0(r13) + GET_SCRATCH0(r0) + SAVE_4GPRS(9, r1) + std r0, GPR13(r1) + SAVE_NVGPRS(r1) + ld r5, HSTATE_CFAR(r13) + std r5, ORIG_GPR3(r1) + mflr r3 +#ifdef CONFIG_RELOCATABLE + ld r4, HSTATE_SCRATCH1(r13) +#else + mfctr r4 +#endif + mfxer r5 + lbz r6, PACASOFTIRQEN(r13) + std r3, _LINK(r1) + std r4, _CTR(r1) + std r5, _XER(r1) + std r6, SOFTE(r1) + ld r2, PACATOC(r13) + LOAD_REG_IMMEDIATE(3, 0x7265677368657265) + std r3, STACK_FRAME_OVERHEAD-16(r1) + + /* + * On POWER9 do a minimal restore of the MMU and call C code, + * which will print a message and panic. + * XXX On POWER7 and POWER8, we just spin here since we don't + * know what the other threads are doing (and we don't want to + * coordinate with them) - but at least we now have register state + * in memory that we might be able to look at from another CPU. + */ +BEGIN_FTR_SECTION b . +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) + ld r9, HSTATE_KVM_VCPU(r13) + ld r10, VCPU_KVM(r9) + + li r0, 0 + mtspr SPRN_AMR, r0 + mtspr SPRN_IAMR, r0 + mtspr SPRN_CIABR, r0 + mtspr SPRN_DAWRX, r0 + + /* Flush the ERAT on radix P9 DD1 guest exit */ +BEGIN_FTR_SECTION + PPC_INVALIDATE_ERAT +END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1) + +BEGIN_MMU_FTR_SECTION + b 4f +END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) + + slbmte r0, r0 + slbia + ptesync + ld r8, PACA_SLBSHADOWPTR(r13) + .rept SLB_NUM_BOLTED + li r3, SLBSHADOW_SAVEAREA + LDX_BE r5, r8, r3 + addi r3, r3, 8 + LDX_BE r6, r8, r3 + andis. r7, r5, SLB_ESID_V@h + beq 3f + slbmte r6, r5 +3: addi r8, r8, 16 + .endr + +4: lwz r7, KVM_HOST_LPID(r10) + mtspr SPRN_LPID, r7 + mtspr SPRN_PID, r0 + ld r8, KVM_HOST_LPCR(r10) + mtspr SPRN_LPCR, r8 + isync + li r0, KVM_GUEST_MODE_NONE + stb r0, HSTATE_IN_GUEST(r13) + + /* + * Turn on the MMU and jump to C code + */ + bcl 20, 31, .+4 +5: mflr r3 + addi r3, r3, 9f - 5b + ld r4, PACAKMSR(r13) + mtspr SPRN_SRR0, r3 + mtspr SPRN_SRR1, r4 + RFI_TO_KERNEL +9: addi r3, r1, STACK_FRAME_OVERHEAD + bl kvmppc_bad_interrupt + b 9b /* * This mimics the MSR transition on IRQ delivery. The new guest MSR is taken diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 69a09444d46e..7deaeeb14b93 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -60,6 +60,7 @@ static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac); #define MSR_USER32 MSR_USER #define MSR_USER64 MSR_USER #define HW_PAGE_SIZE PAGE_SIZE +#define HPTE_R_M _PAGE_COHERENT #endif static bool kvmppc_is_split_real(struct kvm_vcpu *vcpu) @@ -557,6 +558,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, pte.eaddr = eaddr; pte.vpage = eaddr >> 12; pte.page_size = MMU_PAGE_64K; + pte.wimg = HPTE_R_M; } switch (kvmppc_get_msr(vcpu) & (MSR_DR|MSR_IR)) { @@ -1326,12 +1328,22 @@ static int kvm_arch_vcpu_ioctl_set_sregs_pr(struct kvm_vcpu *vcpu, kvmppc_set_pvr_pr(vcpu, sregs->pvr); vcpu3s->sdr1 = sregs->u.s.sdr1; +#ifdef CONFIG_PPC_BOOK3S_64 if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) { + /* Flush all SLB entries */ + vcpu->arch.mmu.slbmte(vcpu, 0, 0); + vcpu->arch.mmu.slbia(vcpu); + for (i = 0; i < 64; i++) { - vcpu->arch.mmu.slbmte(vcpu, sregs->u.s.ppc64.slb[i].slbv, - sregs->u.s.ppc64.slb[i].slbe); + u64 rb = sregs->u.s.ppc64.slb[i].slbe; + u64 rs = sregs->u.s.ppc64.slb[i].slbv; + + if (rb & SLB_ESID_V) + vcpu->arch.mmu.slbmte(vcpu, rs, rb); } - } else { + } else +#endif + { for (i = 0; i < 16; i++) { vcpu->arch.mmu.mtsrin(vcpu, i, sregs->u.s.ppc32.sr[i]); } diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index 8a4205fa774f..dae3be5ff42b 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -419,6 +419,8 @@ int kvmppc_hcall_impl_pr(unsigned long cmd) case H_PROTECT: case H_BULK_REMOVE: case H_PUT_TCE: + case H_PUT_TCE_INDIRECT: + case H_STUFF_TCE: case H_CEDE: case H_LOGICAL_CI_LOAD: case H_LOGICAL_CI_STORE: diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S index 42a4b237df5f..34a5adeff084 100644 --- a/arch/powerpc/kvm/book3s_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_rmhandlers.S @@ -46,6 +46,9 @@ #define FUNC(name) name +#define RFI_TO_KERNEL RFI +#define RFI_TO_GUEST RFI + .macro INTERRUPT_TRAMPOLINE intno .global kvmppc_trampoline_\intno @@ -141,7 +144,7 @@ kvmppc_handler_skip_ins: GET_SCRATCH0(r13) /* And get back into the code */ - RFI + RFI_TO_KERNEL #endif /* @@ -164,6 +167,6 @@ _GLOBAL_TOC(kvmppc_entry_trampoline) ori r5, r5, MSR_EE mtsrr0 r7 mtsrr1 r6 - RFI + RFI_TO_KERNEL #include "book3s_segment.S" diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S index 2a2b96d53999..93a180ceefad 100644 --- a/arch/powerpc/kvm/book3s_segment.S +++ b/arch/powerpc/kvm/book3s_segment.S @@ -156,7 +156,7 @@ no_dcbz32_on: PPC_LL r9, SVCPU_R9(r3) PPC_LL r3, (SVCPU_R3)(r3) - RFI + RFI_TO_GUEST kvmppc_handler_trampoline_enter_end: @@ -407,5 +407,5 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) cmpwi r12, BOOK3S_INTERRUPT_DOORBELL beqa BOOK3S_INTERRUPT_DOORBELL - RFI + RFI_TO_KERNEL kvmppc_handler_trampoline_exit_end: diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index bf457843e032..0d750d274c4e 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -725,7 +725,8 @@ u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu) /* Return the per-cpu state for state saving/migration */ return (u64)xc->cppr << KVM_REG_PPC_ICP_CPPR_SHIFT | - (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT; + (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT | + (u64)0xff << KVM_REG_PPC_ICP_PPRI_SHIFT; } int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) @@ -1558,7 +1559,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr) /* * Restore P and Q. If the interrupt was pending, we - * force both P and Q, which will trigger a resend. + * force Q and !P, which will trigger a resend. * * That means that a guest that had both an interrupt * pending (queued) and Q set will restore with only @@ -1566,7 +1567,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr) * is perfectly fine as coalescing interrupts that haven't * been presented yet is always allowed. */ - if (val & KVM_XICS_PRESENTED || val & KVM_XICS_PENDING) + if (val & KVM_XICS_PRESENTED && !(val & KVM_XICS_PENDING)) state->old_p = true; if (val & KVM_XICS_QUEUED || val & KVM_XICS_PENDING) state->old_q = true; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 071b87ee682f..83b485810aea 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -599,9 +599,9 @@ static void arm_next_watchdog(struct kvm_vcpu *vcpu) spin_unlock_irqrestore(&vcpu->arch.wdt_lock, flags); } -void kvmppc_watchdog_func(unsigned long data) +void kvmppc_watchdog_func(struct timer_list *t) { - struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; + struct kvm_vcpu *vcpu = from_timer(vcpu, t, arch.wdt_timer); u32 tsr, new_tsr; int final; @@ -1412,8 +1412,7 @@ int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) { /* setup watchdog timer once */ spin_lock_init(&vcpu->arch.wdt_lock); - setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func, - (unsigned long)vcpu); + timer_setup(&vcpu->arch.wdt_timer, kvmppc_watchdog_func, 0); /* * Clear DBSR.MRR to avoid guest debug interrupt as diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index c6c734424c70..423b21393bc9 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -377,7 +377,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, start = vma->vm_pgoff; end = start + - ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); + vma_pages(vma); pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index ee279c7f4802..0a7c88786ec0 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -39,6 +39,10 @@ #include <asm/iommu.h> #include <asm/switch_to.h> #include <asm/xive.h> +#ifdef CONFIG_PPC_PSERIES +#include <asm/hvcall.h> +#include <asm/plpar_wrappers.h> +#endif #include "timing.h" #include "irq.h" @@ -548,6 +552,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) #ifdef CONFIG_KVM_XICS case KVM_CAP_IRQ_XICS: #endif + case KVM_CAP_PPC_GET_CPU_CHAR: r = 1; break; @@ -590,8 +595,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = !!(hv_enabled && radix_enabled()); break; case KVM_CAP_PPC_MMU_HASH_V3: - r = !!(hv_enabled && !radix_enabled() && - cpu_has_feature(CPU_FTR_ARCH_300)); + r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300)); break; #endif case KVM_CAP_SYNC_MMU: @@ -644,7 +648,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; #endif case KVM_CAP_PPC_HTM: - r = cpu_has_feature(CPU_FTR_TM_COMP) && hv_enabled; + r = hv_enabled && + (cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_HTM_COMP); break; default: r = 0; @@ -1407,7 +1412,6 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) { int r; - sigset_t sigsaved; if (vcpu->mmio_needed) { vcpu->mmio_needed = 0; @@ -1448,16 +1452,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) #endif } - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + kvm_sigset_activate(vcpu); if (run->immediate_exit) r = -EINTR; else r = kvmppc_vcpu_run(run, vcpu); - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); + kvm_sigset_deactivate(vcpu); return r; } @@ -1762,6 +1764,124 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, return r; } +#ifdef CONFIG_PPC_BOOK3S_64 +/* + * These functions check whether the underlying hardware is safe + * against attacks based on observing the effects of speculatively + * executed instructions, and whether it supplies instructions for + * use in workarounds. The information comes from firmware, either + * via the device tree on powernv platforms or from an hcall on + * pseries platforms. + */ +#ifdef CONFIG_PPC_PSERIES +static int pseries_get_cpu_char(struct kvm_ppc_cpu_char *cp) +{ + struct h_cpu_char_result c; + unsigned long rc; + + if (!machine_is(pseries)) + return -ENOTTY; + + rc = plpar_get_cpu_characteristics(&c); + if (rc == H_SUCCESS) { + cp->character = c.character; + cp->behaviour = c.behaviour; + cp->character_mask = KVM_PPC_CPU_CHAR_SPEC_BAR_ORI31 | + KVM_PPC_CPU_CHAR_BCCTRL_SERIALISED | + KVM_PPC_CPU_CHAR_L1D_FLUSH_ORI30 | + KVM_PPC_CPU_CHAR_L1D_FLUSH_TRIG2 | + KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV | + KVM_PPC_CPU_CHAR_BR_HINT_HONOURED | + KVM_PPC_CPU_CHAR_MTTRIG_THR_RECONF | + KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS; + cp->behaviour_mask = KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY | + KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR | + KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR; + } + return 0; +} +#else +static int pseries_get_cpu_char(struct kvm_ppc_cpu_char *cp) +{ + return -ENOTTY; +} +#endif + +static inline bool have_fw_feat(struct device_node *fw_features, + const char *state, const char *name) +{ + struct device_node *np; + bool r = false; + + np = of_get_child_by_name(fw_features, name); + if (np) { + r = of_property_read_bool(np, state); + of_node_put(np); + } + return r; +} + +static int kvmppc_get_cpu_char(struct kvm_ppc_cpu_char *cp) +{ + struct device_node *np, *fw_features; + int r; + + memset(cp, 0, sizeof(*cp)); + r = pseries_get_cpu_char(cp); + if (r != -ENOTTY) + return r; + + np = of_find_node_by_name(NULL, "ibm,opal"); + if (np) { + fw_features = of_get_child_by_name(np, "fw-features"); + of_node_put(np); + if (!fw_features) + return 0; + if (have_fw_feat(fw_features, "enabled", + "inst-spec-barrier-ori31,31,0")) + cp->character |= KVM_PPC_CPU_CHAR_SPEC_BAR_ORI31; + if (have_fw_feat(fw_features, "enabled", + "fw-bcctrl-serialized")) + cp->character |= KVM_PPC_CPU_CHAR_BCCTRL_SERIALISED; + if (have_fw_feat(fw_features, "enabled", + "inst-l1d-flush-ori30,30,0")) + cp->character |= KVM_PPC_CPU_CHAR_L1D_FLUSH_ORI30; + if (have_fw_feat(fw_features, "enabled", + "inst-l1d-flush-trig2")) + cp->character |= KVM_PPC_CPU_CHAR_L1D_FLUSH_TRIG2; + if (have_fw_feat(fw_features, "enabled", + "fw-l1d-thread-split")) + cp->character |= KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV; + if (have_fw_feat(fw_features, "enabled", + "fw-count-cache-disabled")) + cp->character |= KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS; + cp->character_mask = KVM_PPC_CPU_CHAR_SPEC_BAR_ORI31 | + KVM_PPC_CPU_CHAR_BCCTRL_SERIALISED | + KVM_PPC_CPU_CHAR_L1D_FLUSH_ORI30 | + KVM_PPC_CPU_CHAR_L1D_FLUSH_TRIG2 | + KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV | + KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS; + + if (have_fw_feat(fw_features, "enabled", + "speculation-policy-favor-security")) + cp->behaviour |= KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY; + if (!have_fw_feat(fw_features, "disabled", + "needs-l1d-flush-msr-pr-0-to-1")) + cp->behaviour |= KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR; + if (!have_fw_feat(fw_features, "disabled", + "needs-spec-barrier-for-bound-checks")) + cp->behaviour |= KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR; + cp->behaviour_mask = KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY | + KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR | + KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR; + + of_node_put(fw_features); + } + + return 0; +} +#endif + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -1864,6 +1984,14 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EFAULT; break; } + case KVM_PPC_GET_CPU_CHAR: { + struct kvm_ppc_cpu_char cpuchar; + + r = kvmppc_get_cpu_char(&cpuchar); + if (r >= 0 && copy_to_user(argp, &cpuchar, sizeof(cpuchar))) + r = -EFAULT; + break; + } default: { struct kvm *kvm = filp->private_data; r = kvm->arch.kvm_ops->arch_vm_ioctl(filp, ioctl, arg); |