diff options
Diffstat (limited to 'arch/x86/kvm/mmu')
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 317 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu_internal.h | 9 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmutrace.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/page_track.c | 8 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/paging_tmpl.h | 51 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.c | 8 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.h | 44 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_iter.c | 6 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_iter.h | 6 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.c | 75 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.h | 5 |
11 files changed, 283 insertions, 248 deletions
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 33794379949e..593093b52395 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -335,12 +335,6 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) return likely(kvm_gen == spte_gen); } -static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, - struct x86_exception *exception) -{ - return gpa; -} - static int is_cpuid_PSE36(void) { return 1; @@ -1454,7 +1448,7 @@ static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, { u64 *sptep; struct rmap_iterator iter; - int need_flush = 0; + bool need_flush = false; u64 new_spte; kvm_pfn_t new_pfn; @@ -1466,7 +1460,7 @@ restart: rmap_printk("spte %p %llx gfn %llx (%d)\n", sptep, *sptep, gfn, level); - need_flush = 1; + need_flush = true; if (pte_write(pte)) { pte_list_remove(kvm, rmap_head, sptep); @@ -1482,7 +1476,7 @@ restart: if (need_flush && kvm_available_flush_tlb_with_range()) { kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); - return 0; + return false; } return need_flush; @@ -1582,7 +1576,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp); if (is_tdp_mmu_enabled(kvm)) - flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); + flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); return flush; } @@ -1623,8 +1617,8 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, for_each_rmap_spte(rmap_head, &iter, sptep) if (is_accessed_spte(*sptep)) - return 1; - return 0; + return true; + return false; } #define RMAP_RECYCLE_THRESHOLD 1000 @@ -1936,7 +1930,11 @@ static void mmu_audit_disable(void) { } static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { - return sp->role.invalid || + if (sp->role.invalid) + return true; + + /* TDP MMU pages due not use the MMU generation. */ + return !sp->tdp_mmu_page && unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); } @@ -2082,10 +2080,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, role = vcpu->arch.mmu->mmu_role.base; role.level = level; role.direct = direct; - if (role.direct) - role.gpte_is_8_bytes = true; role.access = access; - if (!direct_mmu && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) { + if (role.has_4_byte_gpte) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; @@ -2173,10 +2169,10 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato iterator->shadow_addr = root; iterator->level = vcpu->arch.mmu->shadow_root_level; - if (iterator->level == PT64_ROOT_4LEVEL && + if (iterator->level >= PT64_ROOT_4LEVEL && vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL && !vcpu->arch.mmu->direct_map) - --iterator->level; + iterator->level = PT32E_ROOT_LEVEL; if (iterator->level == PT32E_ROOT_LEVEL) { /* @@ -2561,10 +2557,10 @@ static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) return r; } -static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { trace_kvm_mmu_unsync_page(sp); - ++vcpu->kvm->stat.mmu_unsync; + ++kvm->stat.mmu_unsync; sp->unsync = 1; kvm_mmu_mark_parents_unsync(sp); @@ -2576,7 +2572,7 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must * be write-protected. */ -int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, +int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, bool can_unsync, bool prefetch) { struct kvm_mmu_page *sp; @@ -2587,7 +2583,7 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, * track machinery is used to write-protect upper-level shadow pages, * i.e. this guards the role.level == 4K assertion below! */ - if (kvm_slot_page_track_is_active(vcpu, slot, gfn, KVM_PAGE_TRACK_WRITE)) + if (kvm_slot_page_track_is_active(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE)) return -EPERM; /* @@ -2596,7 +2592,7 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, * that case, KVM must complete emulation of the guest TLB flush before * allowing shadow pages to become unsync (writable by the guest). */ - for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { + for_each_gfn_indirect_valid_sp(kvm, sp, gfn) { if (!can_unsync) return -EPERM; @@ -2615,7 +2611,7 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, */ if (!locked) { locked = true; - spin_lock(&vcpu->kvm->arch.mmu_unsync_pages_lock); + spin_lock(&kvm->arch.mmu_unsync_pages_lock); /* * Recheck after taking the spinlock, a different vCPU @@ -2630,10 +2626,10 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, } WARN_ON(sp->role.level != PG_LEVEL_4K); - kvm_unsync_page(vcpu, sp); + kvm_unsync_page(kvm, sp); } if (locked) - spin_unlock(&vcpu->kvm->arch.mmu_unsync_pages_lock); + spin_unlock(&kvm->arch.mmu_unsync_pages_lock); /* * We need to ensure that the marking of unsync pages is visible @@ -3405,7 +3401,7 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm) { struct kvm_memslots *slots; struct kvm_memory_slot *slot; - int r = 0, i; + int r = 0, i, bkt; /* * Check if this is the first shadow root being allocated before @@ -3430,7 +3426,7 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm) for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); - kvm_for_each_memslot(slot, slots) { + kvm_for_each_memslot(slot, bkt, slots) { /* * Both of these functions are no-ops if the target is * already allocated, so unconditionally calling both @@ -3730,21 +3726,13 @@ void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu) kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); } -static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr, - u32 access, struct x86_exception *exception) -{ - if (exception) - exception->error_code = 0; - return vaddr; -} - -static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr, - u32 access, - struct x86_exception *exception) +static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gpa_t vaddr, u32 access, + struct x86_exception *exception) { if (exception) exception->error_code = 0; - return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception); + return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception); } static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) @@ -3884,7 +3872,7 @@ static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, * guest is writing the page which is write tracked which can * not be fixed by page fault handler. */ - if (kvm_slot_page_track_is_active(vcpu, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE)) + if (kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE)) return true; return false; @@ -3976,6 +3964,34 @@ out_retry: return true; } +/* + * Returns true if the page fault is stale and needs to be retried, i.e. if the + * root was invalidated by a memslot update or a relevant mmu_notifier fired. + */ +static bool is_page_fault_stale(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault, int mmu_seq) +{ + struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root_hpa); + + /* Special roots, e.g. pae_root, are not backed by shadow pages. */ + if (sp && is_obsolete_sp(vcpu->kvm, sp)) + return true; + + /* + * Roots without an associated shadow page are considered invalid if + * there is a pending request to free obsolete roots. The request is + * only a hint that the current root _may_ be obsolete and needs to be + * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a + * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs + * to reload even if no vCPU is actively using the root. + */ + if (!sp && kvm_test_request(KVM_REQ_MMU_RELOAD, vcpu)) + return true; + + return fault->slot && + mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva); +} + static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu); @@ -4013,8 +4029,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault else write_lock(&vcpu->kvm->mmu_lock); - if (fault->slot && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva)) + if (is_page_fault_stale(vcpu, fault, mmu_seq)) goto out_unlock; + r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; @@ -4355,22 +4372,28 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, - u64 pa_bits_rsvd, bool execonly) + u64 pa_bits_rsvd, bool execonly, int huge_page_level) { u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); + u64 large_1g_rsvd = 0, large_2m_rsvd = 0; u64 bad_mt_xwr; + if (huge_page_level < PG_LEVEL_1G) + large_1g_rsvd = rsvd_bits(7, 7); + if (huge_page_level < PG_LEVEL_2M) + large_2m_rsvd = rsvd_bits(7, 7); + rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7); rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7); - rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6); - rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6); + rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd; + rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd; rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* large page */ rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; - rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29); - rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20); + rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd; + rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd; rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ @@ -4386,10 +4409,11 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, } static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, - struct kvm_mmu *context, bool execonly) + struct kvm_mmu *context, bool execonly, int huge_page_level) { __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check, - vcpu->arch.reserved_gpa_bits, execonly); + vcpu->arch.reserved_gpa_bits, execonly, + huge_page_level); } static inline u64 reserved_hpa_bits(void) @@ -4465,7 +4489,8 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, false, true); else __reset_rsvds_bits_mask_ept(shadow_zero_check, - reserved_hpa_bits(), false); + reserved_hpa_bits(), false, + max_huge_page_level); if (!shadow_me_mask) return; @@ -4485,7 +4510,8 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly) { __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, - reserved_hpa_bits(), execonly); + reserved_hpa_bits(), execonly, + max_huge_page_level); } #define BYTE_MASK(access) \ @@ -4682,6 +4708,7 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu, /* PKEY and LA57 are active iff long mode is active. */ ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs); ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs); + ext.efer_lma = ____is_efer_lma(regs); } ext.valid = 1; @@ -4733,7 +4760,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, role.base.ad_disabled = (shadow_accessed_mask == 0); role.base.level = kvm_mmu_get_tdp_level(vcpu); role.base.direct = true; - role.base.gpte_is_8_bytes = true; + role.base.has_4_byte_gpte = false; return role; } @@ -4778,7 +4805,7 @@ kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, role.base.smep_andnot_wp = role.ext.cr4_smep && !____is_cr0_wp(regs); role.base.smap_andnot_wp = role.ext.cr4_smap && !____is_cr0_wp(regs); - role.base.gpte_is_8_bytes = ____is_cr0_pg(regs) && ____is_cr4_pae(regs); + role.base.has_4_byte_gpte = ____is_cr0_pg(regs) && !____is_cr4_pae(regs); return role; } @@ -4854,7 +4881,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, struct kvm_mmu *context = &vcpu->arch.guest_mmu; struct kvm_mmu_role_regs regs = { .cr0 = cr0, - .cr4 = cr4, + .cr4 = cr4 & ~X86_CR4_PKE, .efer = efer, }; union kvm_mmu_role new_role; @@ -4877,7 +4904,7 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm; role.base.level = level; - role.base.gpte_is_8_bytes = true; + role.base.has_4_byte_gpte = false; role.base.direct = false; role.base.ad_disabled = !accessed_dirty; role.base.guest_mode = true; @@ -4892,7 +4919,8 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, } void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, - bool accessed_dirty, gpa_t new_eptp) + int huge_page_level, bool accessed_dirty, + gpa_t new_eptp) { struct kvm_mmu *context = &vcpu->arch.guest_mmu; u8 level = vmx_eptp_page_walk_level(new_eptp); @@ -4918,8 +4946,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->direct_map = false; update_permission_bitmask(context, true); - update_pkru_bitmask(context); - reset_rsvds_bits_mask_ept(vcpu, context, execonly); + context->pkru_mask = 0; + reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level); reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); } EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); @@ -4983,13 +5011,13 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) * the gva_to_gpa functions between mmu and nested_mmu are swapped. */ if (!is_paging(vcpu)) - g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; + g_context->gva_to_gpa = nonpaging_gva_to_gpa; else if (is_long_mode(vcpu)) - g_context->gva_to_gpa = paging64_gva_to_gpa_nested; + g_context->gva_to_gpa = paging64_gva_to_gpa; else if (is_pae(vcpu)) - g_context->gva_to_gpa = paging64_gva_to_gpa_nested; + g_context->gva_to_gpa = paging64_gva_to_gpa; else - g_context->gva_to_gpa = paging32_gva_to_gpa_nested; + g_context->gva_to_gpa = paging32_gva_to_gpa; reset_guest_paging_metadata(vcpu, g_context); } @@ -5024,6 +5052,14 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) /* * Invalidate all MMU roles to force them to reinitialize as CPUID * information is factored into reserved bit calculations. + * + * Correctly handling multiple vCPU models with respect to paging and + * physical address properties) in a single VM would require tracking + * all relevant CPUID information in kvm_mmu_page_role. That is very + * undesirable as it would increase the memory requirements for + * gfn_track (see struct kvm_mmu_page_role comments). For now that + * problem is swept under the rug; KVM's CPUID API is horrific and + * it's all but impossible to solve it without introducing a new API. */ vcpu->arch.root_mmu.mmu_role.ext.valid = 0; vcpu->arch.guest_mmu.mmu_role.ext.valid = 0; @@ -5031,24 +5067,10 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); /* - * KVM does not correctly handle changing guest CPUID after KVM_RUN, as - * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't - * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page - * faults due to reusing SPs/SPTEs. Alert userspace, but otherwise - * sweep the problem under the rug. - * - * KVM's horrific CPUID ABI makes the problem all but impossible to - * solve, as correctly handling multiple vCPU models (with respect to - * paging and physical address properties) in a single VM would require - * tracking all relevant CPUID information in kvm_mmu_page_role. That - * is very undesirable as it would double the memory requirements for - * gfn_track (see struct kvm_mmu_page_role comments), and in practice - * no sane VMM mucks with the core vCPU model on the fly. + * Changing guest CPUID after KVM_RUN is forbidden, see the comment in + * kvm_arch_vcpu_ioctl(). */ - if (vcpu->arch.last_vmentry_cpu != -1) { - pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} after KVM_RUN may cause guest instability\n"); - pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} will fail after KVM_RUN starting with Linux 5.16\n"); - } + KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm); } void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) @@ -5160,7 +5182,7 @@ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa, gpa, bytes, sp->role.word); offset = offset_in_page(gpa); - pte_size = sp->role.gpte_is_8_bytes ? 8 : 4; + pte_size = sp->role.has_4_byte_gpte ? 4 : 8; /* * Sometimes, the OS only writes the last one bytes to update status @@ -5184,7 +5206,7 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) page_offset = offset_in_page(gpa); level = sp->role.level; *nspte = 1; - if (!sp->role.gpte_is_8_bytes) { + if (sp->role.has_4_byte_gpte) { page_offset <<= 1; /* 32->64 */ /* * A 32-bit pde maps 4MB while the shadow pdes map @@ -5368,7 +5390,7 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { - kvm_mmu_invalidate_gva(vcpu, vcpu->arch.mmu, gva, INVALID_PAGE); + kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE); ++vcpu->stat.invlpg; } EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); @@ -5496,10 +5518,13 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) mmu->root_hpa = INVALID_PAGE; mmu->root_pgd = 0; - mmu->translate_gpa = translate_gpa; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; + /* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */ + if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu) + return 0; + /* * When using PAE paging, the four PDPTEs are treated as 'root' pages, * while the PDP table is a per-vCPU construct that's allocated at MMU @@ -5509,7 +5534,7 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) * generally doesn't use PAE paging and can skip allocating the PDP * table. The main exception, handled here, is SVM's 32-bit NPT. The * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit - * KVM; that horror is handled on-demand by mmu_alloc_shadow_roots(). + * KVM; that horror is handled on-demand by mmu_alloc_special_roots(). */ if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) return 0; @@ -5554,8 +5579,6 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) vcpu->arch.mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; - vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; - ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu); if (ret) return ret; @@ -5714,6 +5737,7 @@ static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { const struct kvm_memory_slot *memslot; struct kvm_memslots *slots; + struct kvm_memslot_iter iter; bool flush = false; gfn_t start, end; int i; @@ -5723,13 +5747,16 @@ static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); - kvm_for_each_memslot(memslot, slots) { + + kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) { + memslot = iter.slot; start = max(gfn_start, memslot->base_gfn); end = min(gfn_end, memslot->base_gfn + memslot->npages); - if (start >= end) + if (WARN_ON_ONCE(start >= end)) continue; flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, + PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, start, end - 1, true, flush); } @@ -5747,6 +5774,9 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) bool flush; int i; + if (WARN_ON_ONCE(gfn_end <= gfn_start)) + return; + write_lock(&kvm->mmu_lock); kvm_inc_notifier_count(kvm, gfn_start, gfn_end); @@ -5796,15 +5826,27 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, } /* - * We can flush all the TLBs out of the mmu lock without TLB - * corruption since we just change the spte from writable to - * readonly so that we only need to care the case of changing - * spte from present to present (changing the spte from present - * to nonpresent will flush all the TLBs immediately), in other - * words, the only case we care is mmu_spte_update() where we - * have checked Host-writable | MMU-writable instead of - * PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK - * anymore. + * Flush TLBs if any SPTEs had to be write-protected to ensure that + * guest writes are reflected in the dirty bitmap before the memslot + * update completes, i.e. before enabling dirty logging is visible to + * userspace. + * + * Perform the TLB flush outside the mmu_lock to reduce the amount of + * time the lock is held. However, this does mean that another CPU can + * now grab mmu_lock and encounter a write-protected SPTE while CPUs + * still have a writable mapping for the associated GFN in their TLB. + * + * This is safe but requires KVM to be careful when making decisions + * based on the write-protection status of an SPTE. Specifically, KVM + * also write-protects SPTEs to monitor changes to guest page tables + * during shadow paging, and must guarantee no CPUs can write to those + * page before the lock is dropped. As mentioned in the previous + * paragraph, a write-protected SPTE is no guarantee that CPU cannot + * perform writes. So to determine if a TLB flush is truly required, KVM + * will clear a separate software-only bit (MMU-writable) and skip the + * flush if-and-only-if this bit was already clear. + * + * See DEFAULT_SPTE_MMU_WRITEABLE for more details. */ if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); @@ -5853,8 +5895,6 @@ restart: void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) { - bool flush = false; - if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); /* @@ -5862,17 +5902,14 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, * logging at a 4k granularity and never creates collapsible * 2m SPTEs during dirty logging. */ - flush = slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true); - if (flush) + if (slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true)) kvm_arch_flush_remote_tlbs_memslot(kvm, slot); write_unlock(&kvm->mmu_lock); } if (is_tdp_mmu_enabled(kvm)) { read_lock(&kvm->mmu_lock); - flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush); - if (flush) - kvm_arch_flush_remote_tlbs_memslot(kvm, slot); + kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot); read_unlock(&kvm->mmu_lock); } } @@ -6141,30 +6178,6 @@ out: return ret; } -/* - * Calculate mmu pages needed for kvm. - */ -unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm) -{ - unsigned long nr_mmu_pages; - unsigned long nr_pages = 0; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - int i; - - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - slots = __kvm_memslots(kvm, i); - - kvm_for_each_memslot(memslot, slots) - nr_pages += memslot->npages; - } - - nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; - nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES); - - return nr_mmu_pages; -} - void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); @@ -6181,23 +6194,46 @@ void kvm_mmu_module_exit(void) mmu_audit_disable(); } +/* + * Calculate the effective recovery period, accounting for '0' meaning "let KVM + * select a halving time of 1 hour". Returns true if recovery is enabled. + */ +static bool calc_nx_huge_pages_recovery_period(uint *period) +{ + /* + * Use READ_ONCE to get the params, this may be called outside of the + * param setters, e.g. by the kthread to compute its next timeout. + */ + bool enabled = READ_ONCE(nx_huge_pages); + uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio); + + if (!enabled || !ratio) + return false; + + *period = READ_ONCE(nx_huge_pages_recovery_period_ms); + if (!*period) { + /* Make sure the period is not less than one second. */ + ratio = min(ratio, 3600u); + *period = 60 * 60 * 1000 / ratio; + } + return true; +} + static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp) { bool was_recovery_enabled, is_recovery_enabled; uint old_period, new_period; int err; - was_recovery_enabled = nx_huge_pages_recovery_ratio; - old_period = nx_huge_pages_recovery_period_ms; + was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period); err = param_set_uint(val, kp); if (err) return err; - is_recovery_enabled = nx_huge_pages_recovery_ratio; - new_period = nx_huge_pages_recovery_period_ms; + is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period); - if (READ_ONCE(nx_huge_pages) && is_recovery_enabled && + if (is_recovery_enabled && (!was_recovery_enabled || old_period > new_period)) { struct kvm *kvm; @@ -6261,18 +6297,13 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) static long get_nx_lpage_recovery_timeout(u64 start_time) { - uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio); - uint period = READ_ONCE(nx_huge_pages_recovery_period_ms); + bool enabled; + uint period; - if (!period && ratio) { - /* Make sure the period is not less than one second. */ - ratio = min(ratio, 3600u); - period = 60 * 60 * 1000 / ratio; - } + enabled = calc_nx_huge_pages_recovery_period(&period); - return READ_ONCE(nx_huge_pages) && ratio - ? start_time + msecs_to_jiffies(period) - get_jiffies_64() - : MAX_SCHEDULE_TIMEOUT; + return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64() + : MAX_SCHEDULE_TIMEOUT; } static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 52c6527b1a06..da6166b5c377 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -104,7 +104,7 @@ static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) return kvm_mmu_role_as_id(sp->role); } -static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) +static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp) { /* * When using the EPT page-modification log, the GPAs in the CPU dirty @@ -112,13 +112,12 @@ static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) * on write protection to record dirty pages, which bypasses PML, since * writes now result in a vmexit. Note, the check on CPU dirty logging * being enabled is mandatory as the bits used to denote WP-only SPTEs - * are reserved for NPT w/ PAE (32-bit KVM). + * are reserved for PAE paging (32-bit KVM). */ - return vcpu->arch.mmu == &vcpu->arch.guest_mmu && - kvm_x86_ops.cpu_dirty_log_size; + return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode; } -int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, +int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, bool can_unsync, bool prefetch); void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index b8151bbca36a..de5e8e4e1aa7 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -35,7 +35,7 @@ " %snxe %sad root %u %s%c", \ __entry->mmu_valid_gen, \ __entry->gfn, role.level, \ - role.gpte_is_8_bytes ? 8 : 4, \ + role.has_4_byte_gpte ? 4 : 8, \ role.quadrant, \ role.direct ? " direct" : "", \ access_str[role.access], \ diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index cc4eb5b7fb76..68eb1fb548b6 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -173,9 +173,9 @@ EXPORT_SYMBOL_GPL(kvm_slot_page_track_remove_page); /* * check if the corresponding access on the specified guest page is tracked. */ -bool kvm_slot_page_track_is_active(struct kvm_vcpu *vcpu, - struct kvm_memory_slot *slot, gfn_t gfn, - enum kvm_page_track_mode mode) +bool kvm_slot_page_track_is_active(struct kvm *kvm, + const struct kvm_memory_slot *slot, + gfn_t gfn, enum kvm_page_track_mode mode) { int index; @@ -186,7 +186,7 @@ bool kvm_slot_page_track_is_active(struct kvm_vcpu *vcpu, return false; if (mode == KVM_PAGE_TRACK_WRITE && - !kvm_page_track_write_tracking_enabled(vcpu->kvm)) + !kvm_page_track_write_tracking_enabled(kvm)) return false; index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index f87d36898c44..5b5bdac97c7b 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -403,9 +403,8 @@ retry_walk: walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), - nested_access, - &walker->fault); + real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(table_gfn), + nested_access, &walker->fault); /* * FIXME: This can happen if emulation (for of an INS/OUTS @@ -467,7 +466,7 @@ retry_walk: if (PTTYPE == 32 && walker->level > PG_LEVEL_4K && is_cpuid_PSE36()) gfn += pse36_gfn_delta(pte); - real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access, &walker->fault); + real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault); if (real_gpa == UNMAPPED_GVA) return 0; @@ -547,16 +546,6 @@ static int FNAME(walk_addr)(struct guest_walker *walker, access); } -#if PTTYPE != PTTYPE_EPT -static int FNAME(walk_addr_nested)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, gva_t addr, - u32 access) -{ - return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, - addr, access); -} -#endif - static bool FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, pt_element_t gpte, bool no_dirty_log) @@ -911,7 +900,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault r = RET_PF_RETRY; write_lock(&vcpu->kvm->mmu_lock); - if (fault->slot && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva)) + + if (is_page_fault_stale(vcpu, fault, mmu_seq)) goto out_unlock; kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); @@ -999,50 +989,29 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) } /* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */ -static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t addr, u32 access, +static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gpa_t addr, u32 access, struct x86_exception *exception) { struct guest_walker walker; gpa_t gpa = UNMAPPED_GVA; int r; - r = FNAME(walk_addr)(&walker, vcpu, addr, access); - - if (r) { - gpa = gfn_to_gpa(walker.gfn); - gpa |= addr & ~PAGE_MASK; - } else if (exception) - *exception = walker.fault; - - return gpa; -} - -#if PTTYPE != PTTYPE_EPT -/* Note, gva_to_gpa_nested() is only used to translate L2 GVAs. */ -static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr, - u32 access, - struct x86_exception *exception) -{ - struct guest_walker walker; - gpa_t gpa = UNMAPPED_GVA; - int r; - #ifndef CONFIG_X86_64 /* A 64-bit GVA should be impossible on 32-bit KVM. */ - WARN_ON_ONCE(vaddr >> 32); + WARN_ON_ONCE((addr >> 32) && mmu == vcpu->arch.walk_mmu); #endif - r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access); + r = FNAME(walk_addr_generic)(&walker, vcpu, mmu, addr, access); if (r) { gpa = gfn_to_gpa(walker.gfn); - gpa |= vaddr & ~PAGE_MASK; + gpa |= addr & ~PAGE_MASK; } else if (exception) *exception = walker.fault; return gpa; } -#endif /* * Using the cached information from sp->gfns is safe because: diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 0c76c45fdb68..73cfe62fdad1 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -16,6 +16,7 @@ #include "spte.h" #include <asm/e820/api.h> +#include <asm/memtype.h> #include <asm/vmx.h> static bool __read_mostly enable_mmio_caching = true; @@ -90,7 +91,7 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) } bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - struct kvm_memory_slot *slot, + const struct kvm_memory_slot *slot, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool prefetch, bool can_unsync, bool host_writable, u64 *new_spte) @@ -101,7 +102,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (sp->role.ad_disabled) spte |= SPTE_TDP_AD_DISABLED_MASK; - else if (kvm_vcpu_ad_need_write_protect(vcpu)) + else if (kvm_mmu_page_ad_need_write_protect(sp)) spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK; /* @@ -161,7 +162,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * e.g. it's write-tracked (upper-level SPs) or has one or more * shadow pages and unsync'ing pages is not allowed. */ - if (mmu_try_to_unsync_pages(vcpu, slot, gfn, can_unsync, prefetch)) { + if (mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, can_unsync, prefetch)) { pgprintk("%s: found shadow page for %llx, marking ro\n", __func__, gfn); wrprot = true; @@ -215,6 +216,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn) new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~shadow_host_writable_mask; + new_spte &= ~shadow_mmu_writable_mask; new_spte = mark_spte_for_access_track(new_spte); diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index cc432f9a966b..be6a007a4af3 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -60,10 +60,6 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) -/* Bits 9 and 10 are ignored by all non-EPT PTEs. */ -#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9) -#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10) - /* * The mask/shift to use for saving the original R/X bits when marking the PTE * as not-present for access tracking purposes. We do not save the W bit as the @@ -79,6 +75,35 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK)); /* + * *_SPTE_HOST_WRITEABLE (aka Host-writable) indicates whether the host permits + * writes to the guest page mapped by the SPTE. This bit is cleared on SPTEs + * that map guest pages in read-only memslots and read-only VMAs. + * + * Invariants: + * - If Host-writable is clear, PT_WRITABLE_MASK must be clear. + * + * + * *_SPTE_MMU_WRITEABLE (aka MMU-writable) indicates whether the shadow MMU + * allows writes to the guest page mapped by the SPTE. This bit is cleared when + * the guest page mapped by the SPTE contains a page table that is being + * monitored for shadow paging. In this case the SPTE can only be made writable + * by unsyncing the shadow page under the mmu_lock. + * + * Invariants: + * - If MMU-writable is clear, PT_WRITABLE_MASK must be clear. + * - If MMU-writable is set, Host-writable must be set. + * + * If MMU-writable is set, PT_WRITABLE_MASK is normally set but can be cleared + * to track writes for dirty logging. For such SPTEs, KVM will locklessly set + * PT_WRITABLE_MASK upon the next write from the guest and record the write in + * the dirty log (see fast_page_fault()). + */ + +/* Bits 9 and 10 are ignored by all non-EPT PTEs. */ +#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9) +#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10) + +/* * Low ignored bits are at a premium for EPT, use high ignored bits, taking care * to not overlap the A/D type mask or the saved access bits of access-tracked * SPTEs when A/D bits are disabled. @@ -316,8 +341,13 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, static inline bool spte_can_locklessly_be_made_writable(u64 spte) { - return (spte & shadow_host_writable_mask) && - (spte & shadow_mmu_writable_mask); + if (spte & shadow_mmu_writable_mask) { + WARN_ON_ONCE(!(spte & shadow_host_writable_mask)); + return true; + } + + WARN_ON_ONCE(spte & PT_WRITABLE_MASK); + return false; } static inline u64 get_mmio_spte_generation(u64 spte) @@ -330,7 +360,7 @@ static inline u64 get_mmio_spte_generation(u64 spte) } bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - struct kvm_memory_slot *slot, + const struct kvm_memory_slot *slot, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool prefetch, bool can_unsync, bool host_writable, u64 *new_spte); diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index b3ed302c1a35..caa96c270b95 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -26,6 +26,7 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level) */ void tdp_iter_restart(struct tdp_iter *iter) { + iter->yielded = false; iter->yielded_gfn = iter->next_last_level_gfn; iter->level = iter->root_level; @@ -160,6 +161,11 @@ static bool try_step_up(struct tdp_iter *iter) */ void tdp_iter_next(struct tdp_iter *iter) { + if (iter->yielded) { + tdp_iter_restart(iter); + return; + } + if (try_step_down(iter)) return; diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index b1748b988d3a..e19cabbcb65c 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -45,6 +45,12 @@ struct tdp_iter { * iterator walks off the end of the paging structure. */ bool valid; + /* + * True if KVM dropped mmu_lock and yielded in the middle of a walk, in + * which case tdp_iter_next() needs to restart the walk at the root + * level instead of advancing to the next entry. + */ + bool yielded; }; /* diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index a54c3491af42..bc9e3553fba2 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -165,7 +165,7 @@ static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu, role = vcpu->arch.mmu->mmu_role.base; role.level = level; role.direct = true; - role.gpte_is_8_bytes = true; + role.has_4_byte_gpte = false; role.access = ACC_ALL; role.ad_disabled = !shadow_accessed_mask; @@ -317,9 +317,6 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt)); int level = sp->role.level; gfn_t base_gfn = sp->gfn; - u64 old_child_spte; - u64 *sptep; - gfn_t gfn; int i; trace_kvm_mmu_prepare_zap_page(sp); @@ -327,8 +324,9 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, tdp_mmu_unlink_page(kvm, sp, shared); for (i = 0; i < PT64_ENT_PER_PAGE; i++) { - sptep = rcu_dereference(pt) + i; - gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); + u64 *sptep = rcu_dereference(pt) + i; + gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); + u64 old_child_spte; if (shared) { /* @@ -374,7 +372,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, shared); } - kvm_flush_remote_tlbs_with_address(kvm, gfn, + kvm_flush_remote_tlbs_with_address(kvm, base_gfn, KVM_PAGES_PER_HPAGE(level + 1)); call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); @@ -504,6 +502,8 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte) { + WARN_ON_ONCE(iter->yielded); + lockdep_assert_held_read(&kvm->mmu_lock); /* @@ -577,6 +577,8 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte, bool record_acc_track, bool record_dirty_log) { + WARN_ON_ONCE(iter->yielded); + lockdep_assert_held_write(&kvm->mmu_lock); /* @@ -642,18 +644,19 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, * If this function should yield and flush is set, it will perform a remote * TLB flush before yielding. * - * If this function yields, it will also reset the tdp_iter's walk over the - * paging structure and the calling function should skip to the next - * iteration to allow the iterator to continue its traversal from the - * paging structure root. + * If this function yields, iter->yielded is set and the caller must skip to + * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk + * over the paging structures to allow the iterator to continue its traversal + * from the paging structure root. * - * Return true if this function yielded and the iterator's traversal was reset. - * Return false if a yield was not needed. + * Returns true if this function yielded. */ -static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, - struct tdp_iter *iter, bool flush, - bool shared) +static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, + struct tdp_iter *iter, + bool flush, bool shared) { + WARN_ON(iter->yielded); + /* Ensure forward progress has been made before yielding. */ if (iter->next_last_level_gfn == iter->yielded_gfn) return false; @@ -673,12 +676,10 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, WARN_ON(iter->gfn > iter->next_last_level_gfn); - tdp_iter_restart(iter); - - return true; + iter->yielded = true; } - return false; + return iter->yielded; } /* @@ -1033,9 +1034,9 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, { struct kvm_mmu_page *root; - for_each_tdp_mmu_root(kvm, root, range->slot->as_id) - flush |= zap_gfn_range(kvm, root, range->start, range->end, - range->may_block, flush, false); + for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false) + flush = zap_gfn_range(kvm, root, range->start, range->end, + range->may_block, flush, false); return flush; } @@ -1364,10 +1365,9 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, * Clear leaf entries which could be replaced by large mappings, for * GFNs within the slot. */ -static bool zap_collapsible_spte_range(struct kvm *kvm, +static void zap_collapsible_spte_range(struct kvm *kvm, struct kvm_mmu_page *root, - const struct kvm_memory_slot *slot, - bool flush) + const struct kvm_memory_slot *slot) { gfn_t start = slot->base_gfn; gfn_t end = start + slot->npages; @@ -1378,10 +1378,8 @@ static bool zap_collapsible_spte_range(struct kvm *kvm, tdp_root_for_each_pte(iter, root, start, end) { retry: - if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) { - flush = false; + if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; - } if (!is_shadow_present_pte(iter.old_spte) || !is_last_spte(iter.old_spte, iter.level)) @@ -1393,6 +1391,7 @@ retry: pfn, PG_LEVEL_NUM)) continue; + /* Note, a successful atomic zap also does a remote TLB flush. */ if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) { /* * The iter must explicitly re-read the SPTE because @@ -1401,30 +1400,24 @@ retry: iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); goto retry; } - flush = true; } rcu_read_unlock(); - - return flush; } /* * Clear non-leaf entries (and free associated page tables) which could * be replaced by large mappings, for GFNs within the slot. */ -bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot, - bool flush) +void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; lockdep_assert_held_read(&kvm->mmu_lock); for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) - flush = zap_collapsible_spte_range(kvm, root, slot, flush); - - return flush; + zap_collapsible_spte_range(kvm, root, slot); } /* @@ -1449,12 +1442,12 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, !is_last_spte(iter.old_spte, iter.level)) continue; - if (!is_writable_pte(iter.old_spte)) - break; - new_spte = iter.old_spte & ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); + if (new_spte == iter.old_spte) + break; + tdp_mmu_set_spte(kvm, &iter, new_spte); spte_set = true; } diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 476b133544dd..3899004a5d91 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -64,9 +64,8 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, unsigned long mask, bool wrprot); -bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot, - bool flush); +void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot); bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, |