diff options
Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r-- | arch/x86/kvm/mmu.c | 205 |
1 files changed, 120 insertions, 85 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1e9ba81accba..9a5814d8d194 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine driver for Linux * @@ -12,10 +13,6 @@ * Authors: * Yaniv Kamay <yaniv@qumranet.com> * Avi Kivity <avi@qumranet.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * */ #include "irq.h" @@ -143,9 +140,6 @@ module_param(dbg, bool, 0644); #include <trace/events/kvm.h> -#define CREATE_TRACE_POINTS -#include "mmutrace.h" - #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) #define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) @@ -262,11 +256,20 @@ static const u64 shadow_nonpresent_or_rsvd_mask_len = 5; */ static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; +/* + * The number of non-reserved physical address bits irrespective of features + * that repurpose legal bits, e.g. MKTME. + */ +static u8 __read_mostly shadow_phys_bits; static void mmu_spte_set(u64 *sptep, u64 spte); +static bool is_executable_pte(u64 spte); static union kvm_mmu_page_role kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); +#define CREATE_TRACE_POINTS +#include "mmutrace.h" + static inline bool kvm_available_flush_tlb_with_range(void) { @@ -471,6 +474,21 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); +static u8 kvm_get_shadow_phys_bits(void) +{ + /* + * boot_cpu_data.x86_phys_bits is reduced when MKTME is detected + * in CPU detection code, but MKTME treats those reduced bits as + * 'keyID' thus they are not reserved bits. Therefore for MKTME + * we should still return physical address bits reported by CPUID. + */ + if (!boot_cpu_has(X86_FEATURE_TME) || + WARN_ON_ONCE(boot_cpu_data.extended_cpuid_level < 0x80000008)) + return boot_cpu_data.x86_phys_bits; + + return cpuid_eax(0x80000008) & 0xff; +} + static void kvm_mmu_reset_all_pte_masks(void) { u8 low_phys_bits; @@ -484,6 +502,8 @@ static void kvm_mmu_reset_all_pte_masks(void) shadow_present_mask = 0; shadow_acc_track_mask = 0; + shadow_phys_bits = kvm_get_shadow_phys_bits(); + /* * If the CPU has 46 or less physical address bits, then set an * appropriate mask to guard against L1TF attacks. Otherwise, it is @@ -653,7 +673,7 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) /* * The idea using the light way get the spte on x86_32 guest is from - * gup_get_pte(arch/x86/mm/gup.c). + * gup_get_pte (mm/gup.c). * * An spte tlb flush may be pending, because kvm_set_pte_rmapp * coalesces them and we are running out of the MMU lock. Therefore @@ -1076,10 +1096,16 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) { - if (sp->role.direct) - BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index)); - else + if (!sp->role.direct) { sp->gfns[index] = gfn; + return; + } + + if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index))) + pr_err_ratelimited("gfn mismatch under direct page %llx " + "(expected %llx, got %llx)\n", + sp->gfn, + kvm_mmu_page_get_gfn(sp, index), gfn); } /* @@ -3058,10 +3084,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, ret = RET_PF_EMULATE; pgprintk("%s: setting spte %llx\n", __func__, *sptep); - pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", - is_large_pte(*sptep)? "2MB" : "4kB", - *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn, - *sptep, sptep); + trace_kvm_mmu_set_spte(level, gfn, sptep); if (!was_rmapped && is_large_pte(*sptep)) ++vcpu->kvm->stat.lpages; @@ -3073,8 +3096,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, } } - kvm_release_pfn_clean(pfn); - return ret; } @@ -3109,9 +3130,11 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, if (ret <= 0) return -1; - for (i = 0; i < ret; i++, gfn++, start++) + for (i = 0; i < ret; i++, gfn++, start++) { mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn, page_to_pfn(pages[i]), true, true); + put_page(pages[i]); + } return 0; } @@ -3159,40 +3182,40 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } -static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, - int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault) +static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, + int map_writable, int level, kvm_pfn_t pfn, + bool prefault) { - struct kvm_shadow_walk_iterator iterator; + struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; - int emulate = 0; - gfn_t pseudo_gfn; + int ret; + gfn_t gfn = gpa >> PAGE_SHIFT; + gfn_t base_gfn = gfn; if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) - return 0; + return RET_PF_RETRY; - for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { - if (iterator.level == level) { - emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, - write, level, gfn, pfn, prefault, - map_writable); - direct_pte_prefetch(vcpu, iterator.sptep); - ++vcpu->stat.pf_fixed; + trace_kvm_mmu_spte_requested(gpa, level, pfn); + for_each_shadow_entry(vcpu, gpa, it) { + base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + if (it.level == level) break; - } - - drop_large_spte(vcpu, iterator.sptep); - if (!is_shadow_present_pte(*iterator.sptep)) { - u64 base_addr = iterator.addr; - base_addr &= PT64_LVL_ADDR_MASK(iterator.level); - pseudo_gfn = base_addr >> PAGE_SHIFT; - sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, - iterator.level - 1, 1, ACC_ALL); + drop_large_spte(vcpu, it.sptep); + if (!is_shadow_present_pte(*it.sptep)) { + sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, + it.level - 1, true, ACC_ALL); - link_shadow_page(vcpu, iterator.sptep, sp); + link_shadow_page(vcpu, it.sptep, sp); } } - return emulate; + + ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL, + write, level, base_gfn, pfn, prefault, + map_writable); + direct_pte_prefetch(vcpu, it.sptep); + ++vcpu->stat.pf_fixed; + return ret; } static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) @@ -3219,11 +3242,10 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) } static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, - gfn_t *gfnp, kvm_pfn_t *pfnp, + gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) { kvm_pfn_t pfn = *pfnp; - gfn_t gfn = *gfnp; int level = *levelp; /* @@ -3250,8 +3272,6 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, mask = KVM_PAGES_PER_HPAGE(level) - 1; VM_BUG_ON((gfn & mask) != (pfn & mask)); if (pfn & mask) { - gfn &= ~mask; - *gfnp = gfn; kvm_release_pfn_clean(pfn); pfn &= ~mask; kvm_get_pfn(pfn); @@ -3508,22 +3528,19 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) return r; + r = RET_PF_RETRY; spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); - r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); - spin_unlock(&vcpu->kvm->mmu_lock); - - return r; - + transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); + r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return RET_PF_RETRY; + return r; } static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, @@ -4018,19 +4035,6 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } -bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) -{ - if (unlikely(!lapic_in_kernel(vcpu) || - kvm_event_needs_reinjection(vcpu) || - vcpu->arch.exception.pending)) - return false; - - if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu)) - return false; - - return kvm_x86_ops->interrupt_allowed(vcpu); -} - static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable) { @@ -4150,22 +4154,19 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) return r; + r = RET_PF_RETRY; spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); - r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); - spin_unlock(&vcpu->kvm->mmu_lock); - - return r; - + transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); + r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return RET_PF_RETRY; + return r; } static void nonpaging_init_context(struct kvm_vcpu *vcpu, @@ -4497,7 +4498,7 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) */ shadow_zero_check = &context->shadow_zero_check; __reset_rsvds_bits_mask(vcpu, shadow_zero_check, - boot_cpu_data.x86_phys_bits, + shadow_phys_bits, context->shadow_root_level, uses_nx, guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), is_pse(vcpu), true); @@ -4534,13 +4535,13 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, if (boot_cpu_is_amd()) __reset_rsvds_bits_mask(vcpu, shadow_zero_check, - boot_cpu_data.x86_phys_bits, + shadow_phys_bits, context->shadow_root_level, false, boot_cpu_has(X86_FEATURE_GBPAGES), true, true); else __reset_rsvds_bits_mask_ept(shadow_zero_check, - boot_cpu_data.x86_phys_bits, + shadow_phys_bits, false); if (!shadow_me_mask) @@ -4561,7 +4562,7 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly) { __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, - boot_cpu_data.x86_phys_bits, execonly); + shadow_phys_bits, execonly); } #define BYTE_MASK(access) \ @@ -5602,14 +5603,18 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) struct page *page; int i; - if (tdp_enabled) - return 0; - /* - * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. - * Therefore we need to allocate shadow page tables in the first - * 4GB of memory, which happens to fit the DMA32 zone. + * When using PAE paging, the four PDPTEs are treated as 'root' pages, + * while the PDP table is a per-vCPU construct that's allocated at MMU + * creation. When emulating 32-bit mode, cr3 is only 32 bits even on + * x86_64. Therefore we need to allocate the PDP table in the first + * 4GB of memory, which happens to fit the DMA32 zone. Except for + * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can + * skip allocating the PDP table. */ + if (tdp_enabled && kvm_x86_ops->get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) + return 0; + page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32); if (!page) return -ENOMEM; @@ -5934,7 +5939,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) int nr_to_scan = sc->nr_to_scan; unsigned long freed = 0; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { int idx; @@ -5976,7 +5981,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) break; } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); return freed; } @@ -5998,6 +6003,34 @@ static void mmu_destroy_caches(void) kmem_cache_destroy(mmu_page_header_cache); } +static void kvm_set_mmio_spte_mask(void) +{ + u64 mask; + + /* + * Set the reserved bits and the present bit of an paging-structure + * entry to generate page fault with PFER.RSV = 1. + */ + + /* + * Mask the uppermost physical address bit, which would be reserved as + * long as the supported physical address width is less than 52. + */ + mask = 1ull << 51; + + /* Set the present bit. */ + mask |= 1ull; + + /* + * If reserved bit is not supported, clear the present bit to disable + * mmio page fault. + */ + if (IS_ENABLED(CONFIG_X86_64) && shadow_phys_bits == 52) + mask &= ~1ull; + + kvm_mmu_set_mmio_spte_mask(mask, mask); +} + int kvm_mmu_module_init(void) { int ret = -ENOMEM; @@ -6014,6 +6047,8 @@ int kvm_mmu_module_init(void) kvm_mmu_reset_all_pte_masks(); + kvm_set_mmio_spte_mask(); + pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), 0, SLAB_ACCOUNT, NULL); |