From a4429e53c9b3082b05e51224c3d58dbdd39306c5 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 13 Feb 2018 09:05:40 +0800 Subject: KVM: Introduce paravirtualization hints and KVM_HINTS_DEDICATED MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces kvm_para_has_hint() to query for hints about the configuration of the guests. The first hint, KVM_HINTS_DEDICATED, is set if the guest has dedicated physical CPUs for each vCPU (i.e. pinning and no over-commitment). This allows optimizing spinlocks and tells the guest to avoid PV TLB flush. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Eduardo Habkost Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/powerpc/include/asm/kvm_para.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h index 336a91acb8b1..5ceb4efca65f 100644 --- a/arch/powerpc/include/asm/kvm_para.h +++ b/arch/powerpc/include/asm/kvm_para.h @@ -61,6 +61,11 @@ static inline unsigned int kvm_arch_para_features(void) return r; } +static inline unsigned int kvm_arch_para_hints(void) +{ + return 0; +} + static inline bool kvm_check_and_clear_guest_paused(void) { return false; -- cgit v1.2.3 From 39c983ea0f96a270d4876c4148e3bb2d9cd3294f Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 22 Feb 2018 15:16:54 +1100 Subject: KVM: PPC: Remove unused kvm_unmap_hva callback Since commit fb1522e099f0 ("KVM: update to new mmu_notifier semantic v2", 2017-08-31), the MMU notifier code in KVM no longer calls the kvm_unmap_hva callback. This removes the PPC implementations of kvm_unmap_hva(). Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 1 - arch/powerpc/include/asm/kvm_ppc.h | 1 - arch/powerpc/kvm/book3s.c | 6 ------ arch/powerpc/kvm/book3s.h | 1 - arch/powerpc/kvm/book3s_64_mmu_hv.c | 9 --------- arch/powerpc/kvm/book3s_64_vio_hv.c | 2 +- arch/powerpc/kvm/book3s_hv.c | 1 - arch/powerpc/kvm/book3s_pr.c | 10 ---------- arch/powerpc/kvm/e500_mmu_host.c | 2 +- arch/powerpc/kvm/trace_pr.h | 15 --------------- 10 files changed, 2 insertions(+), 46 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 1f53b562726f..6b69d7999381 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -60,7 +60,6 @@ #define KVM_ARCH_WANT_MMU_NOTIFIER -extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); extern int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 7765a800ddae..23cfaef9544e 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -295,7 +295,6 @@ struct kvmppc_ops { const struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, const struct kvm_memory_slot *new); - int (*unmap_hva)(struct kvm *kvm, unsigned long hva); int (*unmap_hva_range)(struct kvm *kvm, unsigned long start, unsigned long end); int (*age_hva)(struct kvm *kvm, unsigned long start, unsigned long end); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 234531d1bee1..97d4a112648f 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -819,12 +819,6 @@ void kvmppc_core_commit_memory_region(struct kvm
*kvm, kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new); } -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) -{ - return kvm->arch.kvm_ops->unmap_hva(kvm, hva); -} -EXPORT_SYMBOL_GPL(kvm_unmap_hva); - int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) { return kvm->arch.kvm_ops->unmap_hva_range(kvm, start, end); diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h index d2b3ec088b8c..4ad5e287b8bc 100644 --- a/arch/powerpc/kvm/book3s.h +++ b/arch/powerpc/kvm/book3s.h @@ -14,7 +14,6 @@ extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm, struct kvm_memory_slot *memslot); -extern int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva); extern int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end); extern int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index ef243fed2f2b..a670fa5fbe50 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -877,15 +877,6 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, return 0; } -int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva) -{ - hva_handler_fn handler; - - handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp; - kvm_handle_hva(kvm, hva, handler); - return 0; -} - int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end) { hva_handler_fn handler; diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index c32e9bfe75b1..6651f736a0b1 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -450,7 +450,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, /* * Synchronize with the MMU notifier callbacks in - * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.). + * book3s_64_mmu_hv.c (kvm_unmap_hva_range_hv etc.). 
* While we have the rmap lock, code running on other CPUs * cannot finish unmapping the host real page that backs * this guest real page, so we are OK to access the host diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 9cb9448163c4..4863ab81f663 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4351,7 +4351,6 @@ static struct kvmppc_ops kvm_ops_hv = { .flush_memslot = kvmppc_core_flush_memslot_hv, .prepare_memory_region = kvmppc_core_prepare_memory_region_hv, .commit_memory_region = kvmppc_core_commit_memory_region_hv, - .unmap_hva = kvm_unmap_hva_hv, .unmap_hva_range = kvm_unmap_hva_range_hv, .age_hva = kvm_age_hva_hv, .test_age_hva = kvm_test_age_hva_hv, diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 3ae752314b34..d3f304d06adf 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -277,15 +277,6 @@ static void do_kvm_unmap_hva(struct kvm *kvm, unsigned long start, } } -static int kvm_unmap_hva_pr(struct kvm *kvm, unsigned long hva) -{ - trace_kvm_unmap_hva(hva); - - do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE); - - return 0; -} - static int kvm_unmap_hva_range_pr(struct kvm *kvm, unsigned long start, unsigned long end) { @@ -1773,7 +1764,6 @@ static struct kvmppc_ops kvm_ops_pr = { .flush_memslot = kvmppc_core_flush_memslot_pr, .prepare_memory_region = kvmppc_core_prepare_memory_region_pr, .commit_memory_region = kvmppc_core_commit_memory_region_pr, - .unmap_hva = kvm_unmap_hva_pr, .unmap_hva_range = kvm_unmap_hva_range_pr, .age_hva = kvm_age_hva_pr, .test_age_hva = kvm_test_age_hva_pr, diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 423b21393bc9..c878b4ffb86f 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -724,7 +724,7 @@ int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, enum instruction_type type, /************* MMU Notifiers *************/ -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +static int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) { trace_kvm_unmap_hva(hva); diff --git a/arch/powerpc/kvm/trace_pr.h b/arch/powerpc/kvm/trace_pr.h index 85785a370c0e..2f9a8829552b 100644 --- a/arch/powerpc/kvm/trace_pr.h +++ b/arch/powerpc/kvm/trace_pr.h @@ -254,21 +254,6 @@ TRACE_EVENT(kvm_exit, ) ); -TRACE_EVENT(kvm_unmap_hva, - TP_PROTO(unsigned long hva), - TP_ARGS(hva), - - TP_STRUCT__entry( - __field( unsigned long, hva ) - ), - - TP_fast_assign( - __entry->hva = hva; - ), - - TP_printk("unmap hva 0x%lx\n", __entry->hva) -); - #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ -- cgit v1.2.3 From c4c8a7643e74ebd7f2cfa80807562f16bb58c1d9 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 23 Feb 2018 21:40:49 +1100 Subject: KVM: PPC: Book3S HV: Radix page fault handler optimizations This improves the handling of transparent huge pages in the radix hypervisor page fault handler. Previously, if a small page was faulted in to a 2MB region of guest physical space, that left a page table pointer at the PMD level which could never be replaced by a leaf (2MB) PMD entry. This adds code to clear the PMD, invalidate the page walk cache and free the page table page in this situation, so that the leaf PMD entry can be created.
This also adds code to check whether a PMD or PTE being inserted is the same as is already there (because of a race with another CPU that faulted on the same page) and if so, we don't replace the existing entry, meaning that we don't invalidate the PTE or PMD and do a TLB invalidation. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 42 ++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 15 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 5cb4e4687107..ed62164f8474 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -160,6 +160,17 @@ static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr, asm volatile("ptesync": : :"memory"); } +static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned long addr) +{ + unsigned long rb = 0x2 << PPC_BITLSHIFT(53); /* IS = 2 */ + + asm volatile("ptesync": : :"memory"); + /* RIC=1 PRS=0 R=1 IS=2 */ + asm volatile(PPC_TLBIE_5(%0, %1, 1, 0, 1) + : : "r" (rb), "r" (kvm->arch.lpid) : "memory"); + asm volatile("ptesync": : :"memory"); +} + unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep, unsigned long clr, unsigned long set, unsigned long addr, unsigned int shift) @@ -261,6 +272,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, ret = -EAGAIN; goto out_unlock; } + /* Check if we raced and someone else has set the same thing */ + if (level == 1 && pmd_raw(*pmd) == pte_raw(pte)) { + ret = 0; + goto out_unlock; + } /* Valid 2MB page here already, remove it */ old = kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd), ~0UL, 0, lgpa, PMD_SHIFT); @@ -275,12 +291,13 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, } } else if (level == 1 && !pmd_none(*pmd)) { /* - * There's a page table page here, but we wanted - * to install a large page. Tell the caller and let - * it try installing a normal page if it wants. + * There's a page table page here, but we wanted to + * install a large page, so remove and free the page + * table page. new_ptep will be NULL since level == 1. */ - ret = -EBUSY; - goto out_unlock; + new_ptep = pte_offset_kernel(pmd, 0); + pmd_clear(pmd); + kvmppc_radix_flush_pwc(kvm, gpa); } if (level == 0) { if (pmd_none(*pmd)) { @@ -291,6 +308,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, } ptep = pte_offset_kernel(pmd, gpa); if (pte_present(*ptep)) { + /* Check if someone else set the same thing */ + if (pte_raw(*ptep) == pte_raw(pte)) { + ret = 0; + goto out_unlock; + } /* PTE was previously valid, so invalidate it */ old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0, gpa, 0); @@ -469,16 +491,6 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, /* Allocate space in the tree and write the PTE */ ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq); - if (ret == -EBUSY) { - /* - * There's already a PMD where wanted to install a large page; - * for now, fall back to installing a small page. 
- */ - level = 0; - pfn |= gfn & ((PMD_SIZE >> PAGE_SHIFT) - 1); - pte = pfn_pte(pfn, __pgprot(pgflags)); - ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq); - } if (page) { if (!ret && (pgflags & _PAGE_WRITE)) -- cgit v1.2.3 From f7caf712d885713986baeac86b1b64bcbd9dcd91 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 24 Feb 2018 20:08:51 +1100 Subject: KVM: PPC: Book3S HV: Streamline setting of reference and change bits When using the radix MMU, we can get hypervisor page fault interrupts with the DSISR_SET_RC bit set in DSISR/HSRR1, indicating that an attempt to set the R (reference) or C (change) bit in a PTE atomically failed. Previously we would find the corresponding Linux PTE and check the permission and dirty bits there, but this is not really necessary since we only need to do what the hardware was trying to do, namely set R or C atomically. This removes the code that reads the Linux PTE and just update the partition-scoped PTE, having first checked that it is still present, and if the access is a write, that the PTE still has write permission. Furthermore, we now check whether any other relevant bits are set in DSISR, and if there are, then we proceed with the rest of the function in order to handle whatever condition they represent, instead of returning to the guest as we did previously. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 52 +++++++++++++--------------------- 1 file changed, 19 insertions(+), 33 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index ed62164f8474..f783b067e5ac 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -345,7 +345,7 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned long gpa, gfn, hva, pfn; struct kvm_memory_slot *memslot; struct page *page = NULL, *pages[1]; - long ret, npages, ok; + long ret, npages; unsigned int writing; struct vm_area_struct *vma; unsigned long flags; @@ -397,43 +397,29 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, if (dsisr & DSISR_SET_RC) { /* * Need to set an R or C bit in the 2nd-level tables; - * if the relevant bits aren't already set in the linux - * page tables, fall through to do the gup_fast to - * set them in the linux page tables too. + * since we are just helping out the hardware here, + * it is sufficient to do what the hardware does. */ - ok = 0; pgflags = _PAGE_ACCESSED; if (writing) pgflags |= _PAGE_DIRTY; - local_irq_save(flags); - ptep = find_current_mm_pte(current->mm->pgd, hva, NULL, NULL); - if (ptep) { - pte = READ_ONCE(*ptep); - if (pte_present(pte) && - (pte_val(pte) & pgflags) == pgflags) - ok = 1; - } - local_irq_restore(flags); - if (ok) { - spin_lock(&kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) { - spin_unlock(&kvm->mmu_lock); - return RESUME_GUEST; - } - /* - * We are walking the secondary page table here. We can do this - * without disabling irq. - */ - ptep = __find_linux_pte(kvm->arch.pgtable, - gpa, NULL, &shift); - if (ptep && pte_present(*ptep)) { - kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, - gpa, shift); - spin_unlock(&kvm->mmu_lock); - return RESUME_GUEST; - } - spin_unlock(&kvm->mmu_lock); + /* + * We are walking the secondary page table here. We can do this + * without disabling irq. 
+ */ + spin_lock(&kvm->mmu_lock); + ptep = __find_linux_pte(kvm->arch.pgtable, + gpa, NULL, &shift); + if (ptep && pte_present(*ptep) && + (!writing || pte_write(*ptep))) { + kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, + gpa, shift); + dsisr &= ~DSISR_SET_RC; } + spin_unlock(&kvm->mmu_lock); + if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE | + DSISR_PROTFAULT | DSISR_SET_RC))) + return RESUME_GUEST; } ret = -EFAULT; -- cgit v1.2.3 From 58c5c276b4c2ceb2b02ecd959ad9784b997d4332 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 24 Feb 2018 20:14:37 +1100 Subject: KVM: PPC: Book3S HV: Handle 1GB pages in radix page fault handler This adds code to the radix hypervisor page fault handler to handle the case where the guest memory is backed by 1GB hugepages, and put them into the partition-scoped radix tree at the PUD level. The code is essentially analogous to the code for 2MB pages. This also rearranges kvmppc_create_pte() to make it easier to follow. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 129 ++++++++++++++++++++++++--------- 1 file changed, 93 insertions(+), 36 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index f783b067e5ac..05acc67e0eb2 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -150,7 +150,9 @@ static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr, { int psize = MMU_BASE_PSIZE; - if (pshift >= PMD_SHIFT) + if (pshift >= PUD_SHIFT) + psize = MMU_PAGE_1G; + else if (pshift >= PMD_SHIFT) psize = MMU_PAGE_2M; addr &= ~0xfffUL; addr |= mmu_psize_defs[psize].ap << 5; @@ -231,9 +233,9 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, new_pud = pud_alloc_one(kvm->mm, gpa); pmd = NULL; - if (pud && pud_present(*pud)) + if (pud && pud_present(*pud) && !pud_huge(*pud)) pmd = pmd_offset(pud, gpa); - else + else if (level <= 1) new_pmd = pmd_alloc_one(kvm->mm, gpa); if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd))) @@ -254,6 +256,50 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, new_pud = NULL; } pud = pud_offset(pgd, gpa); + if (pud_huge(*pud)) { + unsigned long hgpa = gpa & PUD_MASK; + + /* + * If we raced with another CPU which has just put + * a 1GB pte in after we saw a pmd page, try again. + */ + if (level <= 1 && !new_pmd) { + ret = -EAGAIN; + goto out_unlock; + } + /* Check if we raced and someone else has set the same thing */ + if (level == 2 && pud_raw(*pud) == pte_raw(pte)) { + ret = 0; + goto out_unlock; + } + /* Valid 1GB page here already, remove it */ + old = kvmppc_radix_update_pte(kvm, (pte_t *)pud, + ~0UL, 0, hgpa, PUD_SHIFT); + kvmppc_radix_tlbie_page(kvm, hgpa, PUD_SHIFT); + if (old & _PAGE_DIRTY) { + unsigned long gfn = hgpa >> PAGE_SHIFT; + struct kvm_memory_slot *memslot; + memslot = gfn_to_memslot(kvm, gfn); + if (memslot && memslot->dirty_bitmap) + kvmppc_update_dirty_map(memslot, + gfn, PUD_SIZE); + } + } + if (level == 2) { + if (!pud_none(*pud)) { + /* + * There's a page table page here, but we wanted to + * install a large page, so remove and free the page + * table page. new_pmd will be NULL since level == 2. 
+ */ + new_pmd = pmd_offset(pud, 0); + pud_clear(pud); + kvmppc_radix_flush_pwc(kvm, gpa); + } + kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte); + ret = 0; + goto out_unlock; + } if (pud_none(*pud)) { if (!new_pmd) goto out_unlock; @@ -289,41 +335,43 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, kvmppc_update_dirty_map(memslot, gfn, PMD_SIZE); } - } else if (level == 1 && !pmd_none(*pmd)) { - /* - * There's a page table page here, but we wanted to - * install a large page, so remove and free the page - * table page. new_ptep will be NULL since level == 1. - */ - new_ptep = pte_offset_kernel(pmd, 0); - pmd_clear(pmd); - kvmppc_radix_flush_pwc(kvm, gpa); } - if (level == 0) { - if (pmd_none(*pmd)) { - if (!new_ptep) - goto out_unlock; - pmd_populate(kvm->mm, pmd, new_ptep); - new_ptep = NULL; - } - ptep = pte_offset_kernel(pmd, gpa); - if (pte_present(*ptep)) { - /* Check if someone else set the same thing */ - if (pte_raw(*ptep) == pte_raw(pte)) { - ret = 0; - goto out_unlock; - } - /* PTE was previously valid, so invalidate it */ - old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, - 0, gpa, 0); - kvmppc_radix_tlbie_page(kvm, gpa, 0); - if (old & _PAGE_DIRTY) - mark_page_dirty(kvm, gpa >> PAGE_SHIFT); + if (level == 1) { + if (!pmd_none(*pmd)) { + /* + * There's a page table page here, but we wanted to + * install a large page, so remove and free the page + * table page. new_ptep will be NULL since level == 1. + */ + new_ptep = pte_offset_kernel(pmd, 0); + pmd_clear(pmd); + kvmppc_radix_flush_pwc(kvm, gpa); } - kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte); - } else { kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte); + ret = 0; + goto out_unlock; + } + if (pmd_none(*pmd)) { + if (!new_ptep) + goto out_unlock; + pmd_populate(kvm->mm, pmd, new_ptep); + new_ptep = NULL; + } + ptep = pte_offset_kernel(pmd, gpa); + if (pte_present(*ptep)) { + /* Check if someone else set the same thing */ + if (pte_raw(*ptep) == pte_raw(pte)) { + ret = 0; + goto out_unlock; + } + /* PTE was previously valid, so invalidate it */ + old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, + 0, gpa, 0); + kvmppc_radix_tlbie_page(kvm, gpa, 0); + if (old & _PAGE_DIRTY) + mark_page_dirty(kvm, gpa >> PAGE_SHIFT); } + kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte); ret = 0; out_unlock: @@ -446,8 +494,13 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, pfn = page_to_pfn(page); if (PageCompound(page)) { pte_size <<= compound_order(compound_head(page)); - /* See if we can insert a 2MB large-page PTE here */ - if (pte_size >= PMD_SIZE && + /* See if we can insert a 1GB or 2MB large PTE here */ + if (pte_size >= PUD_SIZE && + (gpa & (PUD_SIZE - PAGE_SIZE)) == + (hva & (PUD_SIZE - PAGE_SIZE))) { + level = 2; + pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1); + } else if (pte_size >= PMD_SIZE && (gpa & (PMD_SIZE - PAGE_SIZE)) == (hva & (PMD_SIZE - PAGE_SIZE))) { level = 1; @@ -657,6 +710,10 @@ void kvmppc_free_radix(struct kvm *kvm) for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++pud) { if (!pud_present(*pud)) continue; + if (pud_huge(*pud)) { + pud_clear(pud); + continue; + } pmd = pmd_offset(pud, 0); for (im = 0; im < PTRS_PER_PMD; ++im, ++pmd) { if (pmd_is_leaf(*pmd)) { -- cgit v1.2.3 From 31c8b0d0694a1f7e3b46df0d1341a874ecb5e0de Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 1 Mar 2018 15:14:02 +1100 Subject: KVM: PPC: Book3S HV: Use __gfn_to_pfn_memslot() in page fault handler This changes the hypervisor page fault handler for radix guests to use the 
generic KVM __gfn_to_pfn_memslot() function instead of using get_user_pages_fast() and then handling the case of VM_PFNMAP vmas specially. The old code missed the case of VM_IO vmas; with this change, VM_IO vmas will now be handled correctly by code within __gfn_to_pfn_memslot. Currently, __gfn_to_pfn_memslot calls hva_to_pfn, which only uses __get_user_pages_fast for the initial lookup in the cases where either atomic or async is set. Since we are not setting either atomic or async, we do our own __get_user_pages_fast first, for now. This also adds code to check for the KVM_MEM_READONLY flag on the memslot. If it is set and this is a write access, we synthesize a data storage interrupt for the guest. In the case where the page is not normal RAM (i.e. page == NULL in kvmppc_book3s_radix_page_fault(), we read the PTE from the Linux page tables because we need the mapping attribute bits as well as the PFN. (The mapping attribute bits indicate whether accesses have to be non-cacheable and/or guarded.) Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 148 ++++++++++++++++++++------------- 1 file changed, 88 insertions(+), 60 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 05acc67e0eb2..0590f1667607 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -392,11 +392,11 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned long mmu_seq, pte_size; unsigned long gpa, gfn, hva, pfn; struct kvm_memory_slot *memslot; - struct page *page = NULL, *pages[1]; - long ret, npages; - unsigned int writing; - struct vm_area_struct *vma; - unsigned long flags; + struct page *page = NULL; + long ret; + bool writing; + bool upgrade_write = false; + bool *upgrade_p = &upgrade_write; pte_t pte, *ptep; unsigned long pgflags; unsigned int shift, level; @@ -436,12 +436,17 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, dsisr & DSISR_ISSTORE); } - /* used to check for invalidations in progress */ - mmu_seq = kvm->mmu_notifier_seq; - smp_rmb(); - writing = (dsisr & DSISR_ISSTORE) != 0; - hva = gfn_to_hva_memslot(memslot, gfn); + if (memslot->flags & KVM_MEM_READONLY) { + if (writing) { + /* give the guest a DSI */ + dsisr = DSISR_ISSTORE | DSISR_PROTFAULT; + kvmppc_core_queue_data_storage(vcpu, ea, dsisr); + return RESUME_GUEST; + } + upgrade_p = NULL; + } + if (dsisr & DSISR_SET_RC) { /* * Need to set an R or C bit in the 2nd-level tables; @@ -470,69 +475,92 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, return RESUME_GUEST; } - ret = -EFAULT; - pfn = 0; - pte_size = PAGE_SIZE; - pgflags = _PAGE_READ | _PAGE_EXEC; - level = 0; - npages = get_user_pages_fast(hva, 1, writing, pages); - if (npages < 1) { - /* Check if it's an I/O mapping */ - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm, hva); - if (vma && vma->vm_start <= hva && hva < vma->vm_end && - (vma->vm_flags & VM_PFNMAP)) { - pfn = vma->vm_pgoff + - ((hva - vma->vm_start) >> PAGE_SHIFT); - pgflags = pgprot_val(vma->vm_page_prot); - } - up_read(¤t->mm->mmap_sem); - if (!pfn) - return -EFAULT; - } else { - page = pages[0]; + /* used to check for invalidations in progress */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + /* + * Do a fast check first, since __gfn_to_pfn_memslot doesn't + * do it with !atomic && !async, which is how we call it. 
+ * We always ask for write permission since the common case + * is that the page is writable. + */ + hva = gfn_to_hva_memslot(memslot, gfn); + if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) { pfn = page_to_pfn(page); - if (PageCompound(page)) { - pte_size <<= compound_order(compound_head(page)); - /* See if we can insert a 1GB or 2MB large PTE here */ - if (pte_size >= PUD_SIZE && - (gpa & (PUD_SIZE - PAGE_SIZE)) == - (hva & (PUD_SIZE - PAGE_SIZE))) { - level = 2; - pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1); - } else if (pte_size >= PMD_SIZE && - (gpa & (PMD_SIZE - PAGE_SIZE)) == - (hva & (PMD_SIZE - PAGE_SIZE))) { - level = 1; - pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1); - } + upgrade_write = true; + } else { + /* Call KVM generic code to do the slow-path check */ + pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, + writing, upgrade_p); + if (is_error_noslot_pfn(pfn)) + return -EFAULT; + page = NULL; + if (pfn_valid(pfn)) { + page = pfn_to_page(pfn); + if (PageReserved(page)) + page = NULL; } - /* See if we can provide write access */ - if (writing) { - pgflags |= _PAGE_WRITE; - } else { - local_irq_save(flags); - ptep = find_current_mm_pte(current->mm->pgd, - hva, NULL, NULL); - if (ptep && pte_write(*ptep)) - pgflags |= _PAGE_WRITE; - local_irq_restore(flags); + } + + /* See if we can insert a 1GB or 2MB large PTE here */ + level = 0; + if (page && PageCompound(page)) { + pte_size = PAGE_SIZE << compound_order(compound_head(page)); + if (pte_size >= PUD_SIZE && + (gpa & (PUD_SIZE - PAGE_SIZE)) == + (hva & (PUD_SIZE - PAGE_SIZE))) { + level = 2; + pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1); + } else if (pte_size >= PMD_SIZE && + (gpa & (PMD_SIZE - PAGE_SIZE)) == + (hva & (PMD_SIZE - PAGE_SIZE))) { + level = 1; + pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1); } } /* * Compute the PTE value that we need to insert. */ - pgflags |= _PAGE_PRESENT | _PAGE_PTE | _PAGE_ACCESSED; - if (pgflags & _PAGE_WRITE) - pgflags |= _PAGE_DIRTY; - pte = pfn_pte(pfn, __pgprot(pgflags)); + if (page) { + pgflags = _PAGE_READ | _PAGE_EXEC | _PAGE_PRESENT | _PAGE_PTE | + _PAGE_ACCESSED; + if (writing || upgrade_write) + pgflags |= _PAGE_WRITE | _PAGE_DIRTY; + pte = pfn_pte(pfn, __pgprot(pgflags)); + } else { + /* + * Read the PTE from the process' radix tree and use that + * so we get the attribute bits. + */ + local_irq_disable(); + ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift); + pte = *ptep; + local_irq_enable(); + if (shift == PUD_SHIFT && + (gpa & (PUD_SIZE - PAGE_SIZE)) == + (hva & (PUD_SIZE - PAGE_SIZE))) { + level = 2; + } else if (shift == PMD_SHIFT && + (gpa & (PMD_SIZE - PAGE_SIZE)) == + (hva & (PMD_SIZE - PAGE_SIZE))) { + level = 1; + } else if (shift && shift != PAGE_SHIFT) { + /* Adjust PFN */ + unsigned long mask = (1ul << shift) - PAGE_SIZE; + pte = __pte(pte_val(pte) | (hva & mask)); + } + if (!(writing || upgrade_write)) + pte = __pte(pte_val(pte) & ~ _PAGE_WRITE); + pte = __pte(pte_val(pte) | _PAGE_EXEC); + } /* Allocate space in the tree and write the PTE */ ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq); if (page) { - if (!ret && (pgflags & _PAGE_WRITE)) + if (!ret && (pte_val(pte) & _PAGE_WRITE)) set_page_dirty_lock(page); put_page(page); } -- cgit v1.2.3
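A note on the large-page checks in the radix fault handler patches above: a 2MB or 1GB leaf PTE is only installed when the guest physical address and the host virtual address are congruent modulo the large-page size, which is what the (gpa & (PMD_SIZE - PAGE_SIZE)) == (hva & (PMD_SIZE - PAGE_SIZE)) comparison and its PUD_SIZE counterpart express. The following is a minimal stand-alone C sketch of that level-selection logic, assuming 64K base pages and the usual 2MB/1GB large-page spans; pick_level() and the *_SZ constants are invented for the illustration and are not part of the kernel code above.

#include <stdio.h>

/*
 * Stand-alone sketch (not kernel code) of the level-selection check in
 * kvmppc_book3s_radix_page_fault(): a large leaf PTE is only possible
 * when gpa and hva agree in all bits between the base page size and
 * the large page size.  Sizes are assumed example values.
 */
#define BASE_PAGE_SZ	(64UL << 10)	/* stands in for PAGE_SIZE (64K) */
#define PMD_SZ		(2UL << 20)	/* stands in for PMD_SIZE (2MB) */
#define PUD_SZ		(1UL << 30)	/* stands in for PUD_SIZE (1GB) */

/* Returns 2 for a 1GB leaf, 1 for a 2MB leaf, 0 for a base page. */
static int pick_level(unsigned long gpa, unsigned long hva,
		      unsigned long backing_size)
{
	if (backing_size >= PUD_SZ &&
	    (gpa & (PUD_SZ - BASE_PAGE_SZ)) == (hva & (PUD_SZ - BASE_PAGE_SZ)))
		return 2;
	if (backing_size >= PMD_SZ &&
	    (gpa & (PMD_SZ - BASE_PAGE_SZ)) == (hva & (PMD_SZ - BASE_PAGE_SZ)))
		return 1;
	return 0;
}

int main(void)
{
	/* gpa and hva congruent modulo 2MB: a 2MB leaf can be used (prints 1) */
	printf("%d\n", pick_level(0x40200000UL, 0x7fff80200000UL, PMD_SZ));
	/* hva off by one 64K page: fall back to a base page (prints 0) */
	printf("%d\n", pick_level(0x40200000UL, 0x7fff80210000UL, PMD_SZ));
	return 0;
}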