Diffstat (limited to 'arch/x86/kvm/mmu/mmu.c')
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 461
1 file changed, 248 insertions(+), 213 deletions(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e2e2e5cc7dc6..4e03841f053d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -18,6 +18,7 @@ #include "irq.h" #include "ioapic.h" #include "mmu.h" +#include "mmu_internal.h" #include "x86.h" #include "kvm_cache_regs.h" #include "kvm_emulate.h" @@ -91,7 +92,8 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644); */ bool tdp_enabled = false; -static int max_page_level __read_mostly; +static int max_huge_page_level __read_mostly; +static int max_tdp_level __read_mostly; enum { AUDIT_PRE_PAGE_FAULT, @@ -515,6 +517,18 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) return likely(kvm_gen == spte_gen); } +static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, + struct x86_exception *exception) +{ + /* Check if guest physical address doesn't exceed guest maximum */ + if (kvm_mmu_is_illegal_gpa(vcpu, gpa)) { + exception->error_code |= PFERR_RSVD_MASK; + return UNMAPPED_GVA; + } + + return gpa; +} + /* * Sets the shadow PTE masks used by the MMU. * @@ -676,7 +690,7 @@ union split_spte { static void count_spte_clear(u64 *sptep, u64 spte) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); + struct kvm_mmu_page *sp = sptep_to_sp(sptep); if (is_shadow_present_pte(spte)) return; @@ -760,7 +774,7 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) */ static u64 __get_spte_lockless(u64 *sptep) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); + struct kvm_mmu_page *sp = sptep_to_sp(sptep); union split_spte spte, *orig = (union split_spte *)sptep; int count; @@ -1060,94 +1074,40 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) local_irq_enable(); } -static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, - struct kmem_cache *base_cache, int min) -{ - void *obj; - - if (cache->nobjs >= min) - return 0; - while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT); - if (!obj) - return cache->nobjs >= min ? 0 : -ENOMEM; - cache->objects[cache->nobjs++] = obj; - } - return 0; -} - -static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache) -{ - return cache->nobjs; -} - -static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, - struct kmem_cache *cache) -{ - while (mc->nobjs) - kmem_cache_free(cache, mc->objects[--mc->nobjs]); -} - -static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, - int min) -{ - void *page; - - if (cache->nobjs >= min) - return 0; - while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); - if (!page) - return cache->nobjs >= min ? 0 : -ENOMEM; - cache->objects[cache->nobjs++] = page; - } - return 0; -} - -static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) -{ - while (mc->nobjs) - free_page((unsigned long)mc->objects[--mc->nobjs]); -} - -static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) +static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) { int r; - r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, - pte_list_desc_cache, 8 + PTE_PREFETCH_NUM); + /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. 
*/ + r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, + 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM); if (r) - goto out; - r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); + return r; + r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache, + PT64_ROOT_MAX_LEVEL); if (r) - goto out; - r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, - mmu_page_header_cache, 4); -out: - return r; + return r; + if (maybe_indirect) { + r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache, + PT64_ROOT_MAX_LEVEL); + if (r) + return r; + } + return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, + PT64_ROOT_MAX_LEVEL); } static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { - mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, - pte_list_desc_cache); - mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, - mmu_page_header_cache); -} - -static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) -{ - void *p; - - BUG_ON(!mc->nobjs); - p = mc->objects[--mc->nobjs]; - return p; + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); } static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) { - return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); + return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); } static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) @@ -1415,10 +1375,10 @@ static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, static bool rmap_can_add(struct kvm_vcpu *vcpu) { - struct kvm_mmu_memory_cache *cache; + struct kvm_mmu_memory_cache *mc; - cache = &vcpu->arch.mmu_pte_list_desc_cache; - return mmu_memory_cache_free_objects(cache); + mc = &vcpu->arch.mmu_pte_list_desc_cache; + return kvm_mmu_memory_cache_nr_free_objects(mc); } static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) @@ -1426,7 +1386,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) struct kvm_mmu_page *sp; struct kvm_rmap_head *rmap_head; - sp = page_header(__pa(spte)); + sp = sptep_to_sp(spte); kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); return pte_list_add(vcpu, spte, rmap_head); @@ -1438,7 +1398,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) gfn_t gfn; struct kvm_rmap_head *rmap_head; - sp = page_header(__pa(spte)); + sp = sptep_to_sp(spte); gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); rmap_head = gfn_to_rmap(kvm, gfn, sp); __pte_list_remove(spte, rmap_head); @@ -1530,7 +1490,7 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) { if (is_large_pte(*sptep)) { - WARN_ON(page_header(__pa(sptep))->role.level == PG_LEVEL_4K); + WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K); drop_spte(kvm, sptep); --kvm->stat.lpages; return true; @@ -1542,7 +1502,7 @@ static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) { if (__drop_large_spte(vcpu->kvm, sptep)) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); + struct kvm_mmu_page *sp = sptep_to_sp(sptep); kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); @@ -1738,21 +1698,6 @@ void 
kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -/** - * kvm_arch_write_log_dirty - emulate dirty page logging - * @vcpu: Guest mode vcpu - * - * Emulate arch specific page modification logging for the - * nested hypervisor - */ -int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa) -{ - if (kvm_x86_ops.write_log_dirty) - return kvm_x86_ops.write_log_dirty(vcpu, l2_gpa); - - return 0; -} - bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn) { @@ -2016,7 +1961,7 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) struct kvm_rmap_head *rmap_head; struct kvm_mmu_page *sp; - sp = page_header(__pa(spte)); + sp = sptep_to_sp(spte); rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); @@ -2105,10 +2050,10 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct { struct kvm_mmu_page *sp; - sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); - sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); + sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); + sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); if (!direct) - sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); + sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); /* @@ -2138,7 +2083,7 @@ static void mark_unsync(u64 *spte) struct kvm_mmu_page *sp; unsigned int index; - sp = page_header(__pa(spte)); + sp = sptep_to_sp(spte); index = spte - sp->spt; if (__test_and_set_bit(index, sp->unsync_child_bitmap)) return; @@ -2207,7 +2152,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, continue; } - child = page_header(ent & PT64_BASE_ADDR_MASK); + child = to_shadow_page(ent & PT64_BASE_ADDR_MASK); if (child->unsync_children) { if (mmu_pages_add(pvec, child, i)) @@ -2258,15 +2203,14 @@ static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); - -#define for_each_valid_sp(_kvm, _sp, _gfn) \ - hlist_for_each_entry(_sp, \ - &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ +#define for_each_valid_sp(_kvm, _sp, _list) \ + hlist_for_each_entry(_sp, _list, hash_link) \ if (is_obsolete_sp((_kvm), (_sp))) { \ } else #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ - for_each_valid_sp(_kvm, _sp, _gfn) \ + for_each_valid_sp(_kvm, _sp, \ + &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \ if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else static inline bool is_ept_sp(struct kvm_mmu_page *sp) @@ -2464,9 +2408,7 @@ static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) static void clear_sp_write_flooding_count(u64 *spte) { - struct kvm_mmu_page *sp = page_header(__pa(spte)); - - __clear_sp_write_flooding_count(sp); + __clear_sp_write_flooding_count(sptep_to_sp(spte)); } static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, @@ -2476,7 +2418,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, int direct, unsigned int access) { + bool direct_mmu = vcpu->arch.mmu->direct_map; union kvm_mmu_page_role role; + struct hlist_head *sp_list; unsigned quadrant; struct kvm_mmu_page *sp; bool need_sync = false; @@ -2490,13 +2434,14 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (role.direct) role.gpte_is_8_bytes = true; role.access = 
access; - if (!vcpu->arch.mmu->direct_map - && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) { + if (!direct_mmu && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - for_each_valid_sp(vcpu->kvm, sp, gfn) { + + sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; + for_each_valid_sp(vcpu->kvm, sp, sp_list) { if (sp->gfn != gfn) { collisions++; continue; @@ -2508,6 +2453,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (sp->role.word != role.word) continue; + if (direct_mmu) + goto trace_get_page; + if (sp->unsync) { /* The page is good, but __kvm_sync_page might still end * up zapping it. If so, break in order to rebuild it. @@ -2523,6 +2471,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); __clear_sp_write_flooding_count(sp); + +trace_get_page: trace_kvm_mmu_get_page(sp, false); goto out; } @@ -2533,8 +2483,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, sp->gfn = gfn; sp->role = role; - hlist_add_head(&sp->hash_link, - &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); + hlist_add_head(&sp->hash_link, sp_list); if (!direct) { /* * we should do write protection before syncing pages @@ -2548,7 +2497,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (level > PG_LEVEL_4K && need_sync) flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); } - clear_page(sp->spt); trace_kvm_mmu_get_page(sp, true); kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); @@ -2657,7 +2605,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, * so we should update the spte at this point to get * a new sp with the correct access. */ - child = page_header(*sptep & PT64_BASE_ADDR_MASK); + child = to_shadow_page(*sptep & PT64_BASE_ADDR_MASK); if (child->role.access == direct_access) return; @@ -2679,7 +2627,7 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, if (is_large_pte(pte)) --kvm->stat.lpages; } else { - child = page_header(pte & PT64_BASE_ADDR_MASK); + child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, spte); } return true; @@ -2757,10 +2705,23 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, if (!sp->root_count) { /* Count self */ (*nr_zapped)++; - list_move(&sp->link, invalid_list); + + /* + * Already invalid pages (previously active roots) are not on + * the active page list. See list_del() in the "else" case of + * !sp->root_count. + */ + if (sp->role.invalid) + list_add(&sp->link, invalid_list); + else + list_move(&sp->link, invalid_list); kvm_mod_used_mmu_pages(kvm, -1); } else { - list_move(&sp->link, &kvm->arch.active_mmu_pages); + /* + * Remove the active root from the active page list, the root + * will be explicitly freed when the root_count hits zero. 
+ */ + list_del(&sp->link); /* * Obsolete pages cannot be used on any vCPUs, see the comment @@ -2812,33 +2773,60 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, } } -static bool prepare_zap_oldest_mmu_page(struct kvm *kvm, - struct list_head *invalid_list) +static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm, + unsigned long nr_to_zap) { - struct kvm_mmu_page *sp; + unsigned long total_zapped = 0; + struct kvm_mmu_page *sp, *tmp; + LIST_HEAD(invalid_list); + bool unstable; + int nr_zapped; if (list_empty(&kvm->arch.active_mmu_pages)) - return false; + return 0; + +restart: + list_for_each_entry_safe(sp, tmp, &kvm->arch.active_mmu_pages, link) { + /* + * Don't zap active root pages, the page itself can't be freed + * and zapping it will just force vCPUs to realloc and reload. + */ + if (sp->root_count) + continue; + + unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, + &nr_zapped); + total_zapped += nr_zapped; + if (total_zapped >= nr_to_zap) + break; + + if (unstable) + goto restart; + } - sp = list_last_entry(&kvm->arch.active_mmu_pages, - struct kvm_mmu_page, link); - return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); + kvm_mmu_commit_zap_page(kvm, &invalid_list); + + kvm->stat.mmu_recycled += total_zapped; + return total_zapped; +} + +static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm) +{ + if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages) + return kvm->arch.n_max_mmu_pages - + kvm->arch.n_used_mmu_pages; + + return 0; } static int make_mmu_pages_available(struct kvm_vcpu *vcpu) { - LIST_HEAD(invalid_list); + unsigned long avail = kvm_mmu_available_pages(vcpu->kvm); - if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES)) + if (likely(avail >= KVM_MIN_FREE_MMU_PAGES)) return 0; - while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) { - if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list)) - break; - - ++vcpu->kvm->stat.mmu_recycled; - } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail); if (!kvm_mmu_available_pages(vcpu->kvm)) return -ENOSPC; @@ -2851,17 +2839,12 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu) */ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) { - LIST_HEAD(invalid_list); - spin_lock(&kvm->mmu_lock); if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { - /* Need to free some mmu pages to achieve the goal. 
*/ - while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) - if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list)) - break; + kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages - + goal_nr_mmu_pages); - kvm_mmu_commit_zap_page(kvm, &invalid_list); goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; } @@ -2999,7 +2982,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) return 0; - sp = page_header(__pa(sptep)); + sp = sptep_to_sp(sptep); if (sp_ad_disabled(sp)) spte |= SPTE_AD_DISABLED_MASK; else if (kvm_vcpu_ad_need_write_protect(vcpu)) @@ -3102,7 +3085,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, struct kvm_mmu_page *child; u64 pte = *sptep; - child = page_header(pte & PT64_BASE_ADDR_MASK); + child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, sptep); flush = true; } else if (pfn != spte_to_pfn(*sptep)) { @@ -3212,7 +3195,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) { struct kvm_mmu_page *sp; - sp = page_header(__pa(sptep)); + sp = sptep_to_sp(sptep); /* * Without accessed bits, there's no way to distinguish between @@ -3274,7 +3257,7 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, if (!slot) return PG_LEVEL_4K; - max_level = min(max_level, max_page_level); + max_level = min(max_level, max_huge_page_level); for ( ; max_level > PG_LEVEL_4K; max_level--) { linfo = lpage_info_slot(gfn, slot, max_level); if (!linfo->disallow_lpage) @@ -3520,7 +3503,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (!is_shadow_present_pte(spte)) break; - sp = page_header(__pa(iterator.sptep)); + sp = sptep_to_sp(iterator.sptep); if (!is_last_spte(spte, sp->role.level)) break; @@ -3607,7 +3590,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, if (!VALID_PAGE(*root_hpa)) return; - sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK); + sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK); --sp->root_count; if (!sp->root_count && sp->role.invalid) kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); @@ -3668,7 +3651,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) { int ret = 0; - if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { + if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) { kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); ret = 1; } @@ -3837,7 +3820,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { hpa_t root = vcpu->arch.mmu->root_hpa; - sp = page_header(root); + sp = to_shadow_page(root); /* * Even if another CPU was marking the SP as unsync-ed @@ -3871,7 +3854,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) if (root && VALID_PAGE(root)) { root &= PT64_BASE_ADDR_MASK; - sp = page_header(root); + sp = to_shadow_page(root); mmu_sync_children(vcpu, sp); } } @@ -4045,8 +4028,8 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) walk_shadow_page_lockless_end(vcpu); } -static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - gfn_t gfn) +static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + gfn_t gfn) { struct kvm_arch_async_pf arch; @@ -4108,16 +4091,16 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; - r = mmu_topup_memory_caches(vcpu); + if (fast_page_fault(vcpu, gpa, error_code)) + return RET_PF_RETRY; + + r = mmu_topup_memory_caches(vcpu, false); if (r) 
return r; if (lpage_disallowed) max_level = PG_LEVEL_4K; - if (fast_page_fault(vcpu, gpa, error_code)) - return RET_PF_RETRY; - mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -4131,7 +4114,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; - if (make_mmu_pages_available(vcpu) < 0) + r = make_mmu_pages_available(vcpu); + if (r) goto out_unlock; r = __direct_map(vcpu, gpa, write, map_writable, max_level, pfn, prefault, is_tdp && lpage_disallowed); @@ -4156,6 +4140,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len) { int r = 1; + u32 flags = vcpu->arch.apf.host_apf_flags; #ifndef CONFIG_X86_64 /* A 64-bit CR2 should be impossible on 32-bit KVM. */ @@ -4164,28 +4149,22 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, #endif vcpu->arch.l1tf_flush_l1d = true; - switch (vcpu->arch.apf.host_apf_flags) { - default: + if (!flags) { trace_kvm_page_fault(fault_address, error_code); if (kvm_event_needs_reinjection(vcpu)) kvm_mmu_unprotect_page_virt(vcpu, fault_address); r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, insn_len); - break; - case KVM_PV_REASON_PAGE_NOT_PRESENT: + } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { vcpu->arch.apf.host_apf_flags = 0; local_irq_disable(); kvm_async_pf_task_wait_schedule(fault_address); local_irq_enable(); - break; - case KVM_PV_REASON_PAGE_READY: - vcpu->arch.apf.host_apf_flags = 0; - local_irq_disable(); - kvm_async_pf_task_wake(fault_address); - local_irq_enable(); - break; + } else { + WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags); } + return r; } EXPORT_SYMBOL_GPL(kvm_handle_page_fault); @@ -4227,8 +4206,8 @@ static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd, union kvm_mmu_page_role role) { return (role.direct || pgd == root->pgd) && - VALID_PAGE(root->hpa) && page_header(root->hpa) && - role.word == page_header(root->hpa)->role.word; + VALID_PAGE(root->hpa) && to_shadow_page(root->hpa) && + role.word == to_shadow_page(root->hpa)->role.word; } /* @@ -4277,8 +4256,7 @@ static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd, */ if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && mmu->root_level >= PT64_ROOT_4LEVEL) - return !mmu_check_root(vcpu, new_pgd >> PAGE_SHIFT) && - cached_root_available(vcpu, new_pgd, new_role); + return cached_root_available(vcpu, new_pgd, new_role); return false; } @@ -4313,7 +4291,7 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, */ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); - __clear_sp_write_flooding_count(page_header(vcpu->arch.mmu->root_hpa)); + __clear_sp_write_flooding_count(to_shadow_page(vcpu->arch.mmu->root_hpa)); } void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush, @@ -4869,13 +4847,22 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu, return role; } +static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) +{ + /* Use 5-level TDP if and only if it's useful/necessary. 
*/ + if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) + return 4; + + return max_tdp_level; +} + static union kvm_mmu_role kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) { union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only); role.base.ad_disabled = (shadow_accessed_mask == 0); - role.base.level = vcpu->arch.tdp_level; + role.base.level = kvm_mmu_get_tdp_level(vcpu); role.base.direct = true; role.base.gpte_is_8_bytes = true; @@ -4884,7 +4871,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { - struct kvm_mmu *context = vcpu->arch.mmu; + struct kvm_mmu *context = &vcpu->arch.root_mmu; union kvm_mmu_role new_role = kvm_calc_tdp_mmu_root_page_role(vcpu, false); @@ -4896,7 +4883,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->sync_page = nonpaging_sync_page; context->invlpg = NULL; context->update_pte = nonpaging_update_pte; - context->shadow_root_level = vcpu->arch.tdp_level; + context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu); context->direct_map = true; context->get_guest_pgd = get_cr3; context->get_pdptr = kvm_pdptr_read; @@ -4931,7 +4918,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) } static union kvm_mmu_role -kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) +kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, bool base_only) { union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only); @@ -4939,9 +4926,19 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) !is_write_protection(vcpu); role.base.smap_andnot_wp = role.ext.cr4_smap && !is_write_protection(vcpu); - role.base.direct = !is_paging(vcpu); role.base.gpte_is_8_bytes = !!is_pae(vcpu); + return role; +} + +static union kvm_mmu_role +kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) +{ + union kvm_mmu_role role = + kvm_calc_shadow_root_page_role_common(vcpu, base_only); + + role.base.direct = !is_paging(vcpu); + if (!is_long_mode(vcpu)) role.base.level = PT32E_ROOT_LEVEL; else if (is_la57_mode(vcpu)) @@ -4952,15 +4949,10 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) return role; } -void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer) +static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context, + u32 cr0, u32 cr4, u32 efer, + union kvm_mmu_role new_role) { - struct kvm_mmu *context = vcpu->arch.mmu; - union kvm_mmu_role new_role = - kvm_calc_shadow_mmu_root_page_role(vcpu, false); - - if (new_role.as_u64 == context->mmu_role.as_u64) - return; - if (!(cr0 & X86_CR0_PG)) nonpaging_init_context(vcpu, context); else if (efer & EFER_LMA) @@ -4973,7 +4965,43 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer) context->mmu_role.as_u64 = new_role.as_u64; reset_shadow_zero_bits_mask(vcpu, context); } -EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); + +static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer) +{ + struct kvm_mmu *context = &vcpu->arch.root_mmu; + union kvm_mmu_role new_role = + kvm_calc_shadow_mmu_root_page_role(vcpu, false); + + if (new_role.as_u64 != context->mmu_role.as_u64) + shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role); +} + +static union kvm_mmu_role +kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu) +{ + union kvm_mmu_role role = + kvm_calc_shadow_root_page_role_common(vcpu, false); + + role.base.direct = false; + 
role.base.level = kvm_mmu_get_tdp_level(vcpu); + + return role; +} + +void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer, + gpa_t nested_cr3) +{ + struct kvm_mmu *context = &vcpu->arch.guest_mmu; + union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu); + + context->shadow_root_level = new_role.base.level; + + __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false); + + if (new_role.as_u64 != context->mmu_role.as_u64) + shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role); +} +EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu); static union kvm_mmu_role kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, @@ -5007,7 +5035,7 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool accessed_dirty, gpa_t new_eptp) { - struct kvm_mmu *context = vcpu->arch.mmu; + struct kvm_mmu *context = &vcpu->arch.guest_mmu; u8 level = vmx_eptp_page_walk_level(new_eptp); union kvm_mmu_role new_role = kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty, @@ -5041,7 +5069,7 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); static void init_kvm_softmmu(struct kvm_vcpu *vcpu) { - struct kvm_mmu *context = vcpu->arch.mmu; + struct kvm_mmu *context = &vcpu->arch.root_mmu; kvm_init_shadow_mmu(vcpu, kvm_read_cr0_bits(vcpu, X86_CR0_PG), @@ -5151,7 +5179,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) { int r; - r = mmu_topup_memory_caches(vcpu); + r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map); if (r) goto out; r = mmu_alloc_roots(vcpu); @@ -5345,7 +5373,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, * or not since pte prefetch is skiped if it does not have * enough objects in the cache. */ - mmu_topup_memory_caches(vcpu); + mmu_topup_memory_caches(vcpu, true); spin_lock(&vcpu->kvm->mmu_lock); @@ -5553,23 +5581,25 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) } EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva); -void kvm_configure_mmu(bool enable_tdp, int tdp_page_level) +void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, + int tdp_huge_page_level) { tdp_enabled = enable_tdp; + max_tdp_level = tdp_max_root_level; /* - * max_page_level reflects the capabilities of KVM's MMU irrespective + * max_huge_page_level reflects KVM's MMU capabilities irrespective * of kernel support, e.g. KVM may be capable of using 1GB pages when * the kernel is not. But, KVM never creates a page size greater than * what is used by the kernel for any given HVA, i.e. the kernel's * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust(). */ if (tdp_enabled) - max_page_level = tdp_page_level; + max_huge_page_level = tdp_huge_page_level; else if (boot_cpu_has(X86_FEATURE_GBPAGES)) - max_page_level = PG_LEVEL_1G; + max_huge_page_level = PG_LEVEL_1G; else - max_page_level = PG_LEVEL_2M; + max_huge_page_level = PG_LEVEL_2M; } EXPORT_SYMBOL_GPL(kvm_configure_mmu); @@ -5665,7 +5695,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can * skip allocating the PDP table. 
*/ - if (tdp_enabled && vcpu->arch.tdp_level > PT32E_ROOT_LEVEL) + if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) return 0; page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32); @@ -5684,6 +5714,14 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) uint i; int ret; + vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache; + vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO; + + vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache; + vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO; + + vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO; + vcpu->arch.mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; @@ -5732,12 +5770,11 @@ restart: break; /* - * Skip invalid pages with a non-zero root count, zapping pages - * with a non-zero root count will never succeed, i.e. the page - * will get thrown back on active_mmu_pages and we'll get stuck - * in an infinite loop. + * Invalid pages should never land back on the list of active + * pages. Skip the bogus page, otherwise we'll get stuck in an + * infinite loop if the page gets put back on the list (again). */ - if (sp->role.invalid && sp->root_count) + if (WARN_ON(sp->role.invalid)) continue; /* @@ -5904,7 +5941,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, restart: for_each_rmap_spte(rmap_head, &iter, sptep) { - sp = page_header(__pa(sptep)); + sp = sptep_to_sp(sptep); pfn = spte_to_pfn(*sptep); /* @@ -6015,7 +6052,7 @@ void kvm_mmu_zap_all(struct kvm *kvm) spin_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { - if (sp->role.invalid && sp->root_count) + if (WARN_ON(sp->role.invalid)) continue; if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) goto restart; @@ -6092,9 +6129,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) goto unlock; } - if (prepare_zap_oldest_mmu_page(kvm, &invalid_list)) - freed++; - kvm_mmu_commit_zap_page(kvm, &invalid_list); + freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan); unlock: spin_unlock(&kvm->mmu_lock); |
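The translate_gpa() hook added near the top of this diff rejects guest physical addresses that exceed the guest's advertised MAXPHYADDR and reports them as reserved-bit faults. Below is a minimal standalone sketch of that check, assuming bit 3 of the page-fault error code is the architectural RSVD bit and modeling cpuid_maxphyaddr() as a plain parameter; the exception struct is a stand-in, not the kernel's.

/*
 * Sketch of the translate_gpa() check: a GPA above the guest's MAXPHYADDR is
 * rejected with the RSVD bit set in the error code.  Types and parameters
 * are simplified stand-ins for the kernel's versions.
 */
#include <stdint.h>
#include <stdio.h>

#define PFERR_RSVD_MASK	(1ULL << 3)	/* architectural RSVD error-code bit */
#define UNMAPPED_GVA	(~(uint64_t)0)

struct x86_exception {
	uint64_t error_code;
};

static int gpa_is_illegal(uint64_t gpa, int guest_maxphyaddr)
{
	return (gpa >> guest_maxphyaddr) != 0;
}

static uint64_t translate_gpa(uint64_t gpa, int guest_maxphyaddr,
			      struct x86_exception *exception)
{
	if (gpa_is_illegal(gpa, guest_maxphyaddr)) {
		exception->error_code |= PFERR_RSVD_MASK;
		return UNMAPPED_GVA;
	}
	return gpa;
}

int main(void)
{
	struct x86_exception e = { 0 };
	uint64_t gpa = translate_gpa(1ULL << 50, 48, &e);

	printf("gpa=%#llx error_code=%#llx\n",
	       (unsigned long long)gpa, (unsigned long long)e.error_code);
	return 0;
}

Returning UNMAPPED_GVA with PFERR_RSVD_MASK set lets the caller surface the out-of-range address to the guest as a reserved-bit violation rather than silently mapping it.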
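mmu_topup_memory_caches() now leans on the generic kvm_mmu_topup_memory_cache()/kvm_mmu_memory_cache_alloc() helpers and sizes each cache for the worst case of one allocation per level of the walk (plus the prefetched rmaps for the pte_list cache). A self-contained userspace model of the underlying pattern follows; the capacity, object size, and struct layout are made up for the sketch and do not match the kernel's definitions.

/*
 * Standalone model of the "top up outside the lock, pop inside the lock"
 * pattern behind mmu_topup_memory_caches() above.
 */
#include <assert.h>
#include <stdlib.h>
#include <string.h>

#define CACHE_CAPACITY 40	/* stand-in for the kernel's per-cache array size */

struct memory_cache {
	int nobjs;
	size_t obj_size;
	void *objects[CACHE_CAPACITY];
};

/* May sleep and may fail: called before taking the MMU lock. */
static int cache_topup(struct memory_cache *mc, int min)
{
	assert(min <= CACHE_CAPACITY);
	while (mc->nobjs < min) {
		void *obj = calloc(1, mc->obj_size);

		if (!obj)
			return -1;
		mc->objects[mc->nobjs++] = obj;
	}
	return 0;
}

/* Never sleeps, never fails: safe while walking page tables under the lock. */
static void *cache_alloc(struct memory_cache *mc)
{
	assert(mc->nobjs > 0);
	return mc->objects[--mc->nobjs];
}

int main(void)
{
	struct memory_cache shadow_pages = { .obj_size = 4096 };

	/* One object per possible paging level, mirroring the new sizing. */
	if (cache_topup(&shadow_pages, 5))	/* 5 ~ PT64_ROOT_MAX_LEVEL */
		return 1;

	void *spt = cache_alloc(&shadow_pages);
	memset(spt, 0, 64);
	free(spt);
	while (shadow_pages.nobjs)
		free(shadow_pages.objects[--shadow_pages.nobjs]);
	return 0;
}

Topping up before taking mmu_lock is what lets kvm_mmu_alloc_page() and rmap_add() allocate without sleeping or failing in the middle of a shadow-page-table walk.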
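Most page_header(__pa(sptep)) call sites in the diff become sptep_to_sp()/to_shadow_page(), which recover the owning struct kvm_mmu_page from a pointer into its page of SPTEs; the back-pointer is still stored with set_page_private(virt_to_page(sp->spt), ...). The userspace approximation below replaces struct page's private field with a small lookup table keyed by the page's base address; all names and sizes here are illustrative only.

/*
 * Sketch of the sptep_to_sp() idea: given a pointer to one 64-bit entry
 * inside a page-sized table, recover the metadata struct that owns the
 * table.  A tiny base-address map stands in for struct page ->private.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL

struct shadow_page {
	uint64_t *spt;		/* page-aligned table of 512 entries */
	int level;
	unsigned long gfn;
};

static struct {
	uintptr_t base;
	struct shadow_page *sp;
} private_map[16];
static int nr_pages;

static void set_page_private(void *page, struct shadow_page *sp)
{
	private_map[nr_pages].base = (uintptr_t)page;
	private_map[nr_pages++].sp = sp;
}

static struct shadow_page *sptep_to_sp(uint64_t *sptep)
{
	uintptr_t base = (uintptr_t)sptep & ~(PAGE_SIZE - 1);

	for (int i = 0; i < nr_pages; i++)
		if (private_map[i].base == base)
			return private_map[i].sp;
	return NULL;
}

int main(void)
{
	struct shadow_page *sp = calloc(1, sizeof(*sp));

	sp->spt = aligned_alloc(PAGE_SIZE, PAGE_SIZE);
	sp->level = 1;
	set_page_private(sp->spt, sp);

	/* Any entry pointer inside the table maps back to its shadow page. */
	uint64_t *sptep = &sp->spt[137];
	printf("level of owning page: %d\n", sptep_to_sp(sptep)->level);

	free(sp->spt);
	free(sp);
	return 0;
}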
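prepare_zap_oldest_mmu_page() is replaced by kvm_mmu_zap_oldest_mmu_pages(), which batches reclaim: make_mmu_pages_available() computes how far the VM sits below KVM_MIN_FREE_MMU_PAGES and zaps enough of the oldest pages to get back to KVM_REFILL_PAGES, skipping pages still in use as roots. The model below captures that arithmetic and loop with a plain oldest-first array; the kernel additionally restarts its list walk when a zap destabilizes the iterator, which is omitted here.

/*
 * Simplified model of kvm_mmu_zap_oldest_mmu_pages() feeding
 * make_mmu_pages_available().  Watermark names match the diff; the page
 * structure is a stand-in.
 */
#include <stdbool.h>
#include <stdio.h>

#define KVM_MIN_FREE_MMU_PAGES	5UL
#define KVM_REFILL_PAGES	25UL

struct mmu_page {
	bool root;	/* root_count != 0: can't be freed, skip it */
	bool zapped;
};

static unsigned long zap_oldest(struct mmu_page *pages, unsigned long n,
				unsigned long nr_to_zap)
{
	unsigned long zapped = 0;

	for (unsigned long i = 0; i < n && zapped < nr_to_zap; i++) {
		if (pages[i].root)
			continue;	/* zapping a live root only forces a reload */
		pages[i].zapped = true;
		zapped++;
	}
	return zapped;	/* the kernel commits the whole batch in one go here */
}

static void make_pages_available(struct mmu_page *pages, unsigned long n,
				 unsigned long *used, unsigned long max)
{
	unsigned long avail = max > *used ? max - *used : 0;

	if (avail >= KVM_MIN_FREE_MMU_PAGES)
		return;
	*used -= zap_oldest(pages, n, KVM_REFILL_PAGES - avail);
}

int main(void)
{
	struct mmu_page pages[64] = { [0].root = true };
	unsigned long used = 60;

	make_pages_available(pages, 60, &used, 64);
	printf("used after reclaim: %lu\n", used);	/* 60 - (25 - 4) = 39 */
	return 0;
}

Batching the zap and committing the invalid list once is also why the mmu_recycled stat is now bumped by the total rather than once per page.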
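max_page_level is split into max_huge_page_level and max_tdp_level, and the new kvm_mmu_get_tdp_level() falls back to a 4-level TDP tree whenever 5-level paging would be pure overhead because the guest cannot address more than 48 bits of physical memory. The decision is shown below as a pure function, with cpuid_maxphyaddr() modeled as a parameter.

/*
 * The 4-vs-5-level TDP choice from the diff, written as a standalone
 * function.  48 bits is the span a 4-level page table can map.
 */
#include <stdio.h>

static int kvm_mmu_get_tdp_level(int max_tdp_level, int guest_maxphyaddr)
{
	/* Use 5-level TDP only if the guest can address more than 48 bits. */
	if (max_tdp_level == 5 && guest_maxphyaddr <= 48)
		return 4;

	return max_tdp_level;
}

int main(void)
{
	printf("%d\n", kvm_mmu_get_tdp_level(5, 46));	/* 4 */
	printf("%d\n", kvm_mmu_get_tdp_level(5, 52));	/* 5 */
	printf("%d\n", kvm_mmu_get_tdp_level(4, 46));	/* 4 */
	return 0;
}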