From 7b424ffcd45821600312ed4c794c21f7805e79d4 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 13 Sep 2023 16:56:44 +0000 Subject: KVM: arm64: Don't use kerneldoc comment for arm64_check_features() A double-asterisk opening mark to the comment (i.e. '/**') indicates a comment block is in the kerneldoc format. There's automation in place to validate that kerneldoc blocks actually adhere to the formatting rules. The function comment for arm64_check_features() isn't kerneldoc; use a 'regular' comment to silence automation warnings. Link: https://lore.kernel.org/all/202309112251.e25LqfcK-lkp@intel.com/ Reviewed-by: Zenghui Yu Link: https://lore.kernel.org/r/20230913165645.2319017-1-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/sys_regs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/arm64') diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index e92ec810d449..818a52e257ed 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1228,7 +1228,7 @@ static s64 kvm_arm64_ftr_safe_value(u32 id, const struct arm64_ftr_bits *ftrp, return arm64_ftr_safe_value(&kvm_ftr, new, cur); } -/** +/* * arm64_check_features() - Check if a feature register value constitutes * a subset of features indicated by the idreg's KVM sanitised limit. * -- cgit v1.2.3 From ec1c3b9ff16082f880b304be40992568f4eee6a7 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 20 Sep 2023 08:01:32 +0000 Subject: arm64: tlbflush: Rename MAX_TLBI_OPS Perhaps unsurprisingly, I-cache invalidations suffer from performance issues similar to TLB invalidations on certain systems. TLB and I-cache maintenance all result in DVM on the mesh, which is where the real bottleneck lies. Rename the heuristic to point the finger at DVM, such that it may be reused for limiting I-cache invalidations. Reviewed-by: Gavin Shan Tested-by: Gavin Shan Acked-by: Will Deacon Link: https://lore.kernel.org/r/20230920080133.944717-2-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/include/asm/tlbflush.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index b149cf9f91bc..3431d37e5054 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -333,7 +333,7 @@ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) * This is meant to avoid soft lock-ups on large TLB flushing ranges and not * necessarily a performance improvement. */ -#define MAX_TLBI_OPS PTRS_PER_PTE +#define MAX_DVM_OPS PTRS_PER_PTE /* * __flush_tlb_range_op - Perform TLBI operation upon a range @@ -413,12 +413,12 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, /* * When not uses TLB range ops, we can handle up to - * (MAX_TLBI_OPS - 1) pages; + * (MAX_DVM_OPS - 1) pages; * When uses TLB range ops, we can handle up to * (MAX_TLBI_RANGE_PAGES - 1) pages. 
*/ if ((!system_supports_tlb_range() && - (end - start) >= (MAX_TLBI_OPS * stride)) || + (end - start) >= (MAX_DVM_OPS * stride)) || pages >= MAX_TLBI_RANGE_PAGES) { flush_tlb_mm(vma->vm_mm); return; @@ -451,7 +451,7 @@ static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end { unsigned long addr; - if ((end - start) > (MAX_TLBI_OPS * PAGE_SIZE)) { + if ((end - start) > (MAX_DVM_OPS * PAGE_SIZE)) { flush_tlb_all(); return; } -- cgit v1.2.3 From 909b583f81b5bb5a398d4580543f59b908a86ccc Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 20 Sep 2023 08:01:33 +0000 Subject: KVM: arm64: Avoid soft lockups due to I-cache maintenance Gavin reports of soft lockups on his Ampere Altra Max machine when backing KVM guests with hugetlb pages. Upon further investigation, it was found that the system is unable to keep up with parallel I-cache invalidations done by KVM's stage-2 fault handler. This is ultimately an implementation problem. I-cache maintenance instructions are available at EL0, so nothing stops a malicious userspace from hammering a system with CMOs and cause it to fall over. "Fixing" this problem in KVM is nothing more than slapping a bandage over a much deeper problem. Anyway, the kernel already has a heuristic for limiting TLB invalidations to avoid soft lockups. Reuse that logic to limit I-cache CMOs done by KVM to map executable pages on systems without FEAT_DIC. While at it, restructure __invalidate_icache_guest_page() to improve readability and squeeze our new condition into the existing branching structure. Link: https://lore.kernel.org/kvmarm/20230904072826.1468907-1-gshan@redhat.com/ Reviewed-by: Gavin Shan Tested-by: Gavin Shan Link: https://lore.kernel.org/r/20230920080133.944717-3-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_mmu.h | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 96a80e8f6226..a425ecdd7be0 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -224,16 +224,41 @@ static inline void __clean_dcache_guest_page(void *va, size_t size) kvm_flush_dcache_to_poc(va, size); } +static inline size_t __invalidate_icache_max_range(void) +{ + u8 iminline; + u64 ctr; + + asm volatile(ALTERNATIVE_CB("movz %0, #0\n" + "movk %0, #0, lsl #16\n" + "movk %0, #0, lsl #32\n" + "movk %0, #0, lsl #48\n", + ARM64_ALWAYS_SYSTEM, + kvm_compute_final_ctr_el0) + : "=r" (ctr)); + + iminline = SYS_FIELD_GET(CTR_EL0, IminLine, ctr) + 2; + return MAX_DVM_OPS << iminline; +} + static inline void __invalidate_icache_guest_page(void *va, size_t size) { - if (icache_is_aliasing()) { - /* any kind of VIPT cache */ + /* + * VPIPT I-cache maintenance must be done from EL2. See comment in the + * nVHE flavor of __kvm_tlb_flush_vmid_ipa(). + */ + if (icache_is_vpipt() && read_sysreg(CurrentEL) != CurrentEL_EL2) + return; + + /* + * Blow the whole I-cache if it is aliasing (i.e. VIPT) or the + * invalidation range exceeds our arbitrary limit on invadations by + * cache line. 
+ */ + if (icache_is_aliasing() || size > __invalidate_icache_max_range()) icache_inval_all_pou(); - } else if (read_sysreg(CurrentEL) != CurrentEL_EL1 || - !icache_is_vpipt()) { - /* PIPT or VPIPT at EL2 (see comment in __kvm_tlb_flush_vmid_ipa) */ + else icache_inval_pou((unsigned long)va, (unsigned long)va + size); - } } void kvm_set_way_flush(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From c04bf723ccd639250fa680a0fb4e1ac15bb84c3b Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Thu, 28 Sep 2023 18:32:03 +0100 Subject: KVM: arm64: Do not transfer page refcount for THP adjustment GUP affects a refcount common to all pages forming the THP. There is therefore no need to move the refcount from a tail to the head page. Under the hood it decrements and increments the same counter. Signed-off-by: Vincent Donnefort Reviewed-by: Gavin Shan Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20230928173205.2826598-2-vdonnefort@google.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/mmu.c | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 587a104f66c3..de5e5148ef5d 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1295,28 +1295,8 @@ transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, if (sz < PMD_SIZE) return PAGE_SIZE; - /* - * The address we faulted on is backed by a transparent huge - * page. However, because we map the compound huge page and - * not the individual tail page, we need to transfer the - * refcount to the head page. We have to be careful that the - * THP doesn't start to split while we are adjusting the - * refcounts. - * - * We are sure this doesn't happen, because mmu_invalidate_retry - * was successful and we are holding the mmu_lock, so if this - * THP is trying to split, it will be blocked in the mmu - * notifier before touching any of the pages, specifically - * before being able to call __split_huge_page_refcount(). - * - * We can therefore safely transfer the refcount from PG_tail - * to PG_head and switch the pfn from a tail page to the head - * page accordingly. - */ *ipap &= PMD_MASK; - kvm_release_pfn_clean(pfn); pfn &= ~(PTRS_PER_PMD - 1); - get_page(pfn_to_page(pfn)); *pfnp = pfn; return PMD_SIZE; -- cgit v1.2.3 From be097997a273259f1723baac5463cf19d8564efa Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 22 Sep 2023 22:32:29 +0000 Subject: KVM: arm64: Always invalidate TLB for stage-2 permission faults It is possible for multiple vCPUs to fault on the same IPA and attempt to resolve the fault. One of the page table walks will actually update the PTE and the rest will return -EAGAIN per our race detection scheme. KVM elides the TLB invalidation on the racing threads as the return value is nonzero. Before commit a12ab1378a88 ("KVM: arm64: Use local TLBI on permission relaxation") KVM always used broadcast TLB invalidations when handling permission faults, which had the convenient property of making the stage-2 updates visible to all CPUs in the system. However now we do a local invalidation, and TLBI elision leads to the vCPU thread faulting again on the stale entry. Remember that the architecture permits the TLB to cache translations that precipitate a permission fault. Invalidate the TLB entry responsible for the permission fault if the stage-2 descriptor has been relaxed, regardless of which thread actually did the job. 
Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20230922223229.1608155-1-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/pgtable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/arm64') diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index f155b8c9e98c..286888751793 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -1314,7 +1314,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED); - if (!ret) + if (!ret || ret == -EAGAIN) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level); return ret; } -- cgit v1.2.3 From d11974dc5f208ae1c0fe62a9bcb47c0cdcd7b081 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 26 Oct 2023 20:53:06 +0000 Subject: KVM: arm64: Add tracepoint for MMIO accesses where ISV==0 It is a pretty well known fact that KVM does not support MMIO emulation without valid instruction syndrome information (ESR_EL2.ISV == 0). The current kvm_pr_unimpl() is pretty useless, as it contains zero context to relate the event to a vCPU. Replace it with a precise tracepoint that dumps the relevant context so the user can make sense of what the guest is doing. Acked-by: Zenghui Yu Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20231026205306.3045075-1-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/mmio.c | 4 +++- arch/arm64/kvm/trace_arm.h | 25 +++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) (limited to 'arch/arm64') diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c index 3dd38a151d2a..200c8019a82a 100644 --- a/arch/arm64/kvm/mmio.c +++ b/arch/arm64/kvm/mmio.c @@ -135,6 +135,9 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) * volunteered to do so, and bail out otherwise. */ if (!kvm_vcpu_dabt_isvalid(vcpu)) { + trace_kvm_mmio_nisv(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu), + kvm_vcpu_get_hfar(vcpu), fault_ipa); + if (test_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER, &vcpu->kvm->arch.flags)) { run->exit_reason = KVM_EXIT_ARM_NISV; @@ -143,7 +146,6 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) return 0; } - kvm_pr_unimpl("Data abort outside memslots with no valid syndrome info\n"); return -ENOSYS; } diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h index 8ad53104934d..c18c1a95831e 100644 --- a/arch/arm64/kvm/trace_arm.h +++ b/arch/arm64/kvm/trace_arm.h @@ -136,6 +136,31 @@ TRACE_EVENT(kvm_mmio_emulate, __entry->vcpu_pc, __entry->instr, __entry->cpsr) ); +TRACE_EVENT(kvm_mmio_nisv, + TP_PROTO(unsigned long vcpu_pc, unsigned long esr, + unsigned long far, unsigned long ipa), + TP_ARGS(vcpu_pc, esr, far, ipa), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_pc ) + __field( unsigned long, esr ) + __field( unsigned long, far ) + __field( unsigned long, ipa ) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->esr = esr; + __entry->far = far; + __entry->ipa = ipa; + ), + + TP_printk("ipa %#016lx, esr %#016lx, far %#016lx, pc %#016lx", + __entry->ipa, __entry->esr, + __entry->far, __entry->vcpu_pc) +); + + TRACE_EVENT(kvm_set_way_flush, TP_PROTO(unsigned long vcpu_pc, bool cache), TP_ARGS(vcpu_pc, cache), -- cgit v1.2.3
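
For reference, only a comment opened with '/**' is parsed as kerneldoc, which is what trips the automation the first patch silences. A minimal sketch of the two formats, with illustrative names:

/**
 * checked_by_kerneldoc() - One-line summary of the function.
 * @arg: Meaning of the parameter.
 *
 * Return: Meaning of the return value.
 */
int checked_by_kerneldoc(int arg);

/*
 * A plain block comment: no '/**' opener, so the kerneldoc checkers
 * ignore it entirely, as arm64_check_features()'s comment now does.
 */
int ignored_by_kerneldoc(int arg);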
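
The MAX_DVM_OPS rename keeps the existing soft-lockup heuristic: once a range would need more DVM-generating operations than the cap, it is cheaper (and safer) to flush everything than to loop entry by entry. A compact sketch of that decision, with constants filled in for 4K pages as an assumption rather than taken from any particular configuration:

#include <stdbool.h>

#define MAX_DVM_OPS		512		/* assumed: PTRS_PER_PTE with 4K pages */
#define MAX_TLBI_RANGE_PAGES	(1UL << 21)	/* assumed limit of the range encoding */

/*
 * Mirrors the shape of the check in __flush_tlb_range(): without TLB
 * range instructions, cap the number of single-entry TLBIs; with
 * them, cap the span the range encoding can express.
 */
static bool want_full_flush(unsigned long start, unsigned long end,
			    unsigned long stride, unsigned long pages,
			    bool have_tlb_range_ops)
{
	if (!have_tlb_range_ops && (end - start) >= MAX_DVM_OPS * stride)
		return true;

	return pages >= MAX_TLBI_RANGE_PAGES;
}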
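
The new __invalidate_icache_max_range() helper reduces to a little CTR_EL0 arithmetic: IminLine (bits [3:0]) encodes log2 of the smallest I-cache line size in 4-byte words, so adding 2 turns it into a byte shift, and the resulting cap is MAX_DVM_OPS cache lines' worth of bytes. A standalone sketch of the same computation, with the CTR_EL0 read elided and the value passed in:

#include <stddef.h>
#include <stdint.h>

#define MAX_DVM_OPS	512	/* assumed: PTRS_PER_PTE with 4K pages */

/* Largest range, in bytes, worth invalidating line by line. */
static size_t invalidate_icache_max_range(uint64_t ctr_el0)
{
	unsigned int line_shift = (ctr_el0 & 0xf) + 2;	/* log2(words) -> log2(bytes) */

	return (size_t)MAX_DVM_OPS << line_shift;
}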
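
With the refcount transfer gone, the remaining THP adjustment is pure alignment: round both the faulting IPA and the backing pfn down to the start of the PMD-sized compound page and report PMD_SIZE as the mapping size. A sketch of that arithmetic, again assuming 4K pages:

#include <stdint.h>

#define PAGE_SHIFT	12
#define PMD_SHIFT	21				/* assumed: 4K pages */
#define PMD_SIZE	(1UL << PMD_SHIFT)		/* 2MiB */
#define PMD_MASK	(~(PMD_SIZE - 1))
#define PTRS_PER_PMD	(1UL << (PMD_SHIFT - PAGE_SHIFT))

/* Align the fault IPA and pfn to the head of the huge page. */
static uint64_t thp_adjust(uint64_t *ipap, uint64_t *pfnp)
{
	*ipap &= PMD_MASK;
	*pfnp &= ~(uint64_t)(PTRS_PER_PMD - 1);

	return PMD_SIZE;
}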
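
The one-line permission-fault fix is easier to see with the race written out: several vCPUs fault on the same IPA, one wins the update of the stage-2 descriptor, the others get -EAGAIN, and yet any of them may still hold the stale, more restrictive translation in its local TLB. A simplified sketch of the pattern; the names below are stand-ins rather than KVM's internal API:

#include <errno.h>
#include <stdatomic.h>
#include <stdint.h>

/* Stand-in for the local (non-broadcast) TLB invalidation by IPA. */
static void local_tlbi_ipa(uint64_t ipa)
{
	(void)ipa;
}

/*
 * One racing walker trying to relax permissions on a descriptor.  The
 * loser reports -EAGAIN, but winner and loser alike must invalidate
 * the TLB entry that produced the permission fault.
 */
static int relax_perms(_Atomic uint64_t *ptep, uint64_t old, uint64_t new,
		       uint64_t ipa)
{
	int ret = 0;

	if (!atomic_compare_exchange_strong(ptep, &old, new))
		ret = -EAGAIN;		/* another vCPU updated the PTE first */

	if (!ret || ret == -EAGAIN)	/* the fix: do not skip the TLBI on -EAGAIN */
		local_tlbi_ipa(ipa);

	return ret;
}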