diff options
Diffstat (limited to 'arch/x86/kvm/mmu')
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 461 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu_audit.c | 303 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu_internal.h | 63 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmutrace.h | 395 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/page_track.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/paging_tmpl.h | 19 |
6 files changed, 1020 insertions, 223 deletions
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e2e2e5cc7dc6..4e03841f053d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -18,6 +18,7 @@ #include "irq.h" #include "ioapic.h" #include "mmu.h" +#include "mmu_internal.h" #include "x86.h" #include "kvm_cache_regs.h" #include "kvm_emulate.h" @@ -91,7 +92,8 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644); */ bool tdp_enabled = false; -static int max_page_level __read_mostly; +static int max_huge_page_level __read_mostly; +static int max_tdp_level __read_mostly; enum { AUDIT_PRE_PAGE_FAULT, @@ -515,6 +517,18 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) return likely(kvm_gen == spte_gen); } +static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, + struct x86_exception *exception) +{ + /* Check if guest physical address doesn't exceed guest maximum */ + if (kvm_mmu_is_illegal_gpa(vcpu, gpa)) { + exception->error_code |= PFERR_RSVD_MASK; + return UNMAPPED_GVA; + } + + return gpa; +} + /* * Sets the shadow PTE masks used by the MMU. * @@ -676,7 +690,7 @@ union split_spte { static void count_spte_clear(u64 *sptep, u64 spte) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); + struct kvm_mmu_page *sp = sptep_to_sp(sptep); if (is_shadow_present_pte(spte)) return; @@ -760,7 +774,7 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) */ static u64 __get_spte_lockless(u64 *sptep) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); + struct kvm_mmu_page *sp = sptep_to_sp(sptep); union split_spte spte, *orig = (union split_spte *)sptep; int count; @@ -1060,94 +1074,40 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) local_irq_enable(); } -static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, - struct kmem_cache *base_cache, int min) -{ - void *obj; - - if (cache->nobjs >= min) - return 0; - while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT); - if (!obj) - return cache->nobjs >= min ? 0 : -ENOMEM; - cache->objects[cache->nobjs++] = obj; - } - return 0; -} - -static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache) -{ - return cache->nobjs; -} - -static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, - struct kmem_cache *cache) -{ - while (mc->nobjs) - kmem_cache_free(cache, mc->objects[--mc->nobjs]); -} - -static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, - int min) -{ - void *page; - - if (cache->nobjs >= min) - return 0; - while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); - if (!page) - return cache->nobjs >= min ? 0 : -ENOMEM; - cache->objects[cache->nobjs++] = page; - } - return 0; -} - -static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) -{ - while (mc->nobjs) - free_page((unsigned long)mc->objects[--mc->nobjs]); -} - -static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) +static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) { int r; - r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, - pte_list_desc_cache, 8 + PTE_PREFETCH_NUM); + /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */ + r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, + 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM); if (r) - goto out; - r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); + return r; + r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache, + PT64_ROOT_MAX_LEVEL); if (r) - goto out; - r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, - mmu_page_header_cache, 4); -out: - return r; + return r; + if (maybe_indirect) { + r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache, + PT64_ROOT_MAX_LEVEL); + if (r) + return r; + } + return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, + PT64_ROOT_MAX_LEVEL); } static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { - mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, - pte_list_desc_cache); - mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, - mmu_page_header_cache); -} - -static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) -{ - void *p; - - BUG_ON(!mc->nobjs); - p = mc->objects[--mc->nobjs]; - return p; + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); } static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) { - return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); + return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); } static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) @@ -1415,10 +1375,10 @@ static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, static bool rmap_can_add(struct kvm_vcpu *vcpu) { - struct kvm_mmu_memory_cache *cache; + struct kvm_mmu_memory_cache *mc; - cache = &vcpu->arch.mmu_pte_list_desc_cache; - return mmu_memory_cache_free_objects(cache); + mc = &vcpu->arch.mmu_pte_list_desc_cache; + return kvm_mmu_memory_cache_nr_free_objects(mc); } static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) @@ -1426,7 +1386,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) struct kvm_mmu_page *sp; struct kvm_rmap_head *rmap_head; - sp = page_header(__pa(spte)); + sp = sptep_to_sp(spte); kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); return pte_list_add(vcpu, spte, rmap_head); @@ -1438,7 +1398,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) gfn_t gfn; struct kvm_rmap_head *rmap_head; - sp = page_header(__pa(spte)); + sp = sptep_to_sp(spte); gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); rmap_head = gfn_to_rmap(kvm, gfn, sp); __pte_list_remove(spte, rmap_head); @@ -1530,7 +1490,7 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) { if (is_large_pte(*sptep)) { - WARN_ON(page_header(__pa(sptep))->role.level == PG_LEVEL_4K); + WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K); drop_spte(kvm, sptep); --kvm->stat.lpages; return true; @@ -1542,7 +1502,7 @@ static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) { if (__drop_large_spte(vcpu->kvm, sptep)) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); + struct kvm_mmu_page *sp = sptep_to_sp(sptep); kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); @@ -1738,21 +1698,6 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -/** - * kvm_arch_write_log_dirty - emulate dirty page logging - * @vcpu: Guest mode vcpu - * - * Emulate arch specific page modification logging for the - * nested hypervisor - */ -int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa) -{ - if (kvm_x86_ops.write_log_dirty) - return kvm_x86_ops.write_log_dirty(vcpu, l2_gpa); - - return 0; -} - bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn) { @@ -2016,7 +1961,7 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) struct kvm_rmap_head *rmap_head; struct kvm_mmu_page *sp; - sp = page_header(__pa(spte)); + sp = sptep_to_sp(spte); rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); @@ -2105,10 +2050,10 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct { struct kvm_mmu_page *sp; - sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); - sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); + sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); + sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); if (!direct) - sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); + sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); /* @@ -2138,7 +2083,7 @@ static void mark_unsync(u64 *spte) struct kvm_mmu_page *sp; unsigned int index; - sp = page_header(__pa(spte)); + sp = sptep_to_sp(spte); index = spte - sp->spt; if (__test_and_set_bit(index, sp->unsync_child_bitmap)) return; @@ -2207,7 +2152,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, continue; } - child = page_header(ent & PT64_BASE_ADDR_MASK); + child = to_shadow_page(ent & PT64_BASE_ADDR_MASK); if (child->unsync_children) { if (mmu_pages_add(pvec, child, i)) @@ -2258,15 +2203,14 @@ static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); - -#define for_each_valid_sp(_kvm, _sp, _gfn) \ - hlist_for_each_entry(_sp, \ - &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ +#define for_each_valid_sp(_kvm, _sp, _list) \ + hlist_for_each_entry(_sp, _list, hash_link) \ if (is_obsolete_sp((_kvm), (_sp))) { \ } else #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ - for_each_valid_sp(_kvm, _sp, _gfn) \ + for_each_valid_sp(_kvm, _sp, \ + &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \ if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else static inline bool is_ept_sp(struct kvm_mmu_page *sp) @@ -2464,9 +2408,7 @@ static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) static void clear_sp_write_flooding_count(u64 *spte) { - struct kvm_mmu_page *sp = page_header(__pa(spte)); - - __clear_sp_write_flooding_count(sp); + __clear_sp_write_flooding_count(sptep_to_sp(spte)); } static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, @@ -2476,7 +2418,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, int direct, unsigned int access) { + bool direct_mmu = vcpu->arch.mmu->direct_map; union kvm_mmu_page_role role; + struct hlist_head *sp_list; unsigned quadrant; struct kvm_mmu_page *sp; bool need_sync = false; @@ -2490,13 +2434,14 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (role.direct) role.gpte_is_8_bytes = true; role.access = access; - if (!vcpu->arch.mmu->direct_map - && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) { + if (!direct_mmu && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - for_each_valid_sp(vcpu->kvm, sp, gfn) { + + sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; + for_each_valid_sp(vcpu->kvm, sp, sp_list) { if (sp->gfn != gfn) { collisions++; continue; @@ -2508,6 +2453,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (sp->role.word != role.word) continue; + if (direct_mmu) + goto trace_get_page; + if (sp->unsync) { /* The page is good, but __kvm_sync_page might still end * up zapping it. If so, break in order to rebuild it. @@ -2523,6 +2471,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); __clear_sp_write_flooding_count(sp); + +trace_get_page: trace_kvm_mmu_get_page(sp, false); goto out; } @@ -2533,8 +2483,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, sp->gfn = gfn; sp->role = role; - hlist_add_head(&sp->hash_link, - &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); + hlist_add_head(&sp->hash_link, sp_list); if (!direct) { /* * we should do write protection before syncing pages @@ -2548,7 +2497,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (level > PG_LEVEL_4K && need_sync) flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); } - clear_page(sp->spt); trace_kvm_mmu_get_page(sp, true); kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); @@ -2657,7 +2605,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, * so we should update the spte at this point to get * a new sp with the correct access. */ - child = page_header(*sptep & PT64_BASE_ADDR_MASK); + child = to_shadow_page(*sptep & PT64_BASE_ADDR_MASK); if (child->role.access == direct_access) return; @@ -2679,7 +2627,7 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, if (is_large_pte(pte)) --kvm->stat.lpages; } else { - child = page_header(pte & PT64_BASE_ADDR_MASK); + child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, spte); } return true; @@ -2757,10 +2705,23 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, if (!sp->root_count) { /* Count self */ (*nr_zapped)++; - list_move(&sp->link, invalid_list); + + /* + * Already invalid pages (previously active roots) are not on + * the active page list. See list_del() in the "else" case of + * !sp->root_count. + */ + if (sp->role.invalid) + list_add(&sp->link, invalid_list); + else + list_move(&sp->link, invalid_list); kvm_mod_used_mmu_pages(kvm, -1); } else { - list_move(&sp->link, &kvm->arch.active_mmu_pages); + /* + * Remove the active root from the active page list, the root + * will be explicitly freed when the root_count hits zero. + */ + list_del(&sp->link); /* * Obsolete pages cannot be used on any vCPUs, see the comment @@ -2812,33 +2773,60 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, } } -static bool prepare_zap_oldest_mmu_page(struct kvm *kvm, - struct list_head *invalid_list) +static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm, + unsigned long nr_to_zap) { - struct kvm_mmu_page *sp; + unsigned long total_zapped = 0; + struct kvm_mmu_page *sp, *tmp; + LIST_HEAD(invalid_list); + bool unstable; + int nr_zapped; if (list_empty(&kvm->arch.active_mmu_pages)) - return false; + return 0; + +restart: + list_for_each_entry_safe(sp, tmp, &kvm->arch.active_mmu_pages, link) { + /* + * Don't zap active root pages, the page itself can't be freed + * and zapping it will just force vCPUs to realloc and reload. + */ + if (sp->root_count) + continue; + + unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, + &nr_zapped); + total_zapped += nr_zapped; + if (total_zapped >= nr_to_zap) + break; + + if (unstable) + goto restart; + } - sp = list_last_entry(&kvm->arch.active_mmu_pages, - struct kvm_mmu_page, link); - return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); + kvm_mmu_commit_zap_page(kvm, &invalid_list); + + kvm->stat.mmu_recycled += total_zapped; + return total_zapped; +} + +static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm) +{ + if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages) + return kvm->arch.n_max_mmu_pages - + kvm->arch.n_used_mmu_pages; + + return 0; } static int make_mmu_pages_available(struct kvm_vcpu *vcpu) { - LIST_HEAD(invalid_list); + unsigned long avail = kvm_mmu_available_pages(vcpu->kvm); - if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES)) + if (likely(avail >= KVM_MIN_FREE_MMU_PAGES)) return 0; - while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) { - if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list)) - break; - - ++vcpu->kvm->stat.mmu_recycled; - } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail); if (!kvm_mmu_available_pages(vcpu->kvm)) return -ENOSPC; @@ -2851,17 +2839,12 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu) */ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) { - LIST_HEAD(invalid_list); - spin_lock(&kvm->mmu_lock); if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { - /* Need to free some mmu pages to achieve the goal. */ - while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) - if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list)) - break; + kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages - + goal_nr_mmu_pages); - kvm_mmu_commit_zap_page(kvm, &invalid_list); goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; } @@ -2999,7 +2982,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) return 0; - sp = page_header(__pa(sptep)); + sp = sptep_to_sp(sptep); if (sp_ad_disabled(sp)) spte |= SPTE_AD_DISABLED_MASK; else if (kvm_vcpu_ad_need_write_protect(vcpu)) @@ -3102,7 +3085,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, struct kvm_mmu_page *child; u64 pte = *sptep; - child = page_header(pte & PT64_BASE_ADDR_MASK); + child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, sptep); flush = true; } else if (pfn != spte_to_pfn(*sptep)) { @@ -3212,7 +3195,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) { struct kvm_mmu_page *sp; - sp = page_header(__pa(sptep)); + sp = sptep_to_sp(sptep); /* * Without accessed bits, there's no way to distinguish between @@ -3274,7 +3257,7 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, if (!slot) return PG_LEVEL_4K; - max_level = min(max_level, max_page_level); + max_level = min(max_level, max_huge_page_level); for ( ; max_level > PG_LEVEL_4K; max_level--) { linfo = lpage_info_slot(gfn, slot, max_level); if (!linfo->disallow_lpage) @@ -3520,7 +3503,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (!is_shadow_present_pte(spte)) break; - sp = page_header(__pa(iterator.sptep)); + sp = sptep_to_sp(iterator.sptep); if (!is_last_spte(spte, sp->role.level)) break; @@ -3607,7 +3590,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, if (!VALID_PAGE(*root_hpa)) return; - sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK); + sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK); --sp->root_count; if (!sp->root_count && sp->role.invalid) kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); @@ -3668,7 +3651,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) { int ret = 0; - if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { + if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) { kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); ret = 1; } @@ -3837,7 +3820,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { hpa_t root = vcpu->arch.mmu->root_hpa; - sp = page_header(root); + sp = to_shadow_page(root); /* * Even if another CPU was marking the SP as unsync-ed @@ -3871,7 +3854,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) if (root && VALID_PAGE(root)) { root &= PT64_BASE_ADDR_MASK; - sp = page_header(root); + sp = to_shadow_page(root); mmu_sync_children(vcpu, sp); } } @@ -4045,8 +4028,8 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) walk_shadow_page_lockless_end(vcpu); } -static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - gfn_t gfn) +static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + gfn_t gfn) { struct kvm_arch_async_pf arch; @@ -4108,16 +4091,16 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; - r = mmu_topup_memory_caches(vcpu); + if (fast_page_fault(vcpu, gpa, error_code)) + return RET_PF_RETRY; + + r = mmu_topup_memory_caches(vcpu, false); if (r) return r; if (lpage_disallowed) max_level = PG_LEVEL_4K; - if (fast_page_fault(vcpu, gpa, error_code)) - return RET_PF_RETRY; - mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -4131,7 +4114,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; - if (make_mmu_pages_available(vcpu) < 0) + r = make_mmu_pages_available(vcpu); + if (r) goto out_unlock; r = __direct_map(vcpu, gpa, write, map_writable, max_level, pfn, prefault, is_tdp && lpage_disallowed); @@ -4156,6 +4140,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len) { int r = 1; + u32 flags = vcpu->arch.apf.host_apf_flags; #ifndef CONFIG_X86_64 /* A 64-bit CR2 should be impossible on 32-bit KVM. */ @@ -4164,28 +4149,22 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, #endif vcpu->arch.l1tf_flush_l1d = true; - switch (vcpu->arch.apf.host_apf_flags) { - default: + if (!flags) { trace_kvm_page_fault(fault_address, error_code); if (kvm_event_needs_reinjection(vcpu)) kvm_mmu_unprotect_page_virt(vcpu, fault_address); r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, insn_len); - break; - case KVM_PV_REASON_PAGE_NOT_PRESENT: + } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { vcpu->arch.apf.host_apf_flags = 0; local_irq_disable(); kvm_async_pf_task_wait_schedule(fault_address); local_irq_enable(); - break; - case KVM_PV_REASON_PAGE_READY: - vcpu->arch.apf.host_apf_flags = 0; - local_irq_disable(); - kvm_async_pf_task_wake(fault_address); - local_irq_enable(); - break; + } else { + WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags); } + return r; } EXPORT_SYMBOL_GPL(kvm_handle_page_fault); @@ -4227,8 +4206,8 @@ static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd, union kvm_mmu_page_role role) { return (role.direct || pgd == root->pgd) && - VALID_PAGE(root->hpa) && page_header(root->hpa) && - role.word == page_header(root->hpa)->role.word; + VALID_PAGE(root->hpa) && to_shadow_page(root->hpa) && + role.word == to_shadow_page(root->hpa)->role.word; } /* @@ -4277,8 +4256,7 @@ static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd, */ if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && mmu->root_level >= PT64_ROOT_4LEVEL) - return !mmu_check_root(vcpu, new_pgd >> PAGE_SHIFT) && - cached_root_available(vcpu, new_pgd, new_role); + return cached_root_available(vcpu, new_pgd, new_role); return false; } @@ -4313,7 +4291,7 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, */ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); - __clear_sp_write_flooding_count(page_header(vcpu->arch.mmu->root_hpa)); + __clear_sp_write_flooding_count(to_shadow_page(vcpu->arch.mmu->root_hpa)); } void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush, @@ -4869,13 +4847,22 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu, return role; } +static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) +{ + /* Use 5-level TDP if and only if it's useful/necessary. */ + if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) + return 4; + + return max_tdp_level; +} + static union kvm_mmu_role kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) { union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only); role.base.ad_disabled = (shadow_accessed_mask == 0); - role.base.level = vcpu->arch.tdp_level; + role.base.level = kvm_mmu_get_tdp_level(vcpu); role.base.direct = true; role.base.gpte_is_8_bytes = true; @@ -4884,7 +4871,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { - struct kvm_mmu *context = vcpu->arch.mmu; + struct kvm_mmu *context = &vcpu->arch.root_mmu; union kvm_mmu_role new_role = kvm_calc_tdp_mmu_root_page_role(vcpu, false); @@ -4896,7 +4883,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->sync_page = nonpaging_sync_page; context->invlpg = NULL; context->update_pte = nonpaging_update_pte; - context->shadow_root_level = vcpu->arch.tdp_level; + context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu); context->direct_map = true; context->get_guest_pgd = get_cr3; context->get_pdptr = kvm_pdptr_read; @@ -4931,7 +4918,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) } static union kvm_mmu_role -kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) +kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, bool base_only) { union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only); @@ -4939,9 +4926,19 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) !is_write_protection(vcpu); role.base.smap_andnot_wp = role.ext.cr4_smap && !is_write_protection(vcpu); - role.base.direct = !is_paging(vcpu); role.base.gpte_is_8_bytes = !!is_pae(vcpu); + return role; +} + +static union kvm_mmu_role +kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) +{ + union kvm_mmu_role role = + kvm_calc_shadow_root_page_role_common(vcpu, base_only); + + role.base.direct = !is_paging(vcpu); + if (!is_long_mode(vcpu)) role.base.level = PT32E_ROOT_LEVEL; else if (is_la57_mode(vcpu)) @@ -4952,15 +4949,10 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) return role; } -void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer) +static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context, + u32 cr0, u32 cr4, u32 efer, + union kvm_mmu_role new_role) { - struct kvm_mmu *context = vcpu->arch.mmu; - union kvm_mmu_role new_role = - kvm_calc_shadow_mmu_root_page_role(vcpu, false); - - if (new_role.as_u64 == context->mmu_role.as_u64) - return; - if (!(cr0 & X86_CR0_PG)) nonpaging_init_context(vcpu, context); else if (efer & EFER_LMA) @@ -4973,7 +4965,43 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer) context->mmu_role.as_u64 = new_role.as_u64; reset_shadow_zero_bits_mask(vcpu, context); } -EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); + +static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer) +{ + struct kvm_mmu *context = &vcpu->arch.root_mmu; + union kvm_mmu_role new_role = + kvm_calc_shadow_mmu_root_page_role(vcpu, false); + + if (new_role.as_u64 != context->mmu_role.as_u64) + shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role); +} + +static union kvm_mmu_role +kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu) +{ + union kvm_mmu_role role = + kvm_calc_shadow_root_page_role_common(vcpu, false); + + role.base.direct = false; + role.base.level = kvm_mmu_get_tdp_level(vcpu); + + return role; +} + +void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer, + gpa_t nested_cr3) +{ + struct kvm_mmu *context = &vcpu->arch.guest_mmu; + union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu); + + context->shadow_root_level = new_role.base.level; + + __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false); + + if (new_role.as_u64 != context->mmu_role.as_u64) + shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role); +} +EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu); static union kvm_mmu_role kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, @@ -5007,7 +5035,7 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool accessed_dirty, gpa_t new_eptp) { - struct kvm_mmu *context = vcpu->arch.mmu; + struct kvm_mmu *context = &vcpu->arch.guest_mmu; u8 level = vmx_eptp_page_walk_level(new_eptp); union kvm_mmu_role new_role = kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty, @@ -5041,7 +5069,7 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); static void init_kvm_softmmu(struct kvm_vcpu *vcpu) { - struct kvm_mmu *context = vcpu->arch.mmu; + struct kvm_mmu *context = &vcpu->arch.root_mmu; kvm_init_shadow_mmu(vcpu, kvm_read_cr0_bits(vcpu, X86_CR0_PG), @@ -5151,7 +5179,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) { int r; - r = mmu_topup_memory_caches(vcpu); + r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map); if (r) goto out; r = mmu_alloc_roots(vcpu); @@ -5345,7 +5373,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, * or not since pte prefetch is skiped if it does not have * enough objects in the cache. */ - mmu_topup_memory_caches(vcpu); + mmu_topup_memory_caches(vcpu, true); spin_lock(&vcpu->kvm->mmu_lock); @@ -5553,23 +5581,25 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) } EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva); -void kvm_configure_mmu(bool enable_tdp, int tdp_page_level) +void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, + int tdp_huge_page_level) { tdp_enabled = enable_tdp; + max_tdp_level = tdp_max_root_level; /* - * max_page_level reflects the capabilities of KVM's MMU irrespective + * max_huge_page_level reflects KVM's MMU capabilities irrespective * of kernel support, e.g. KVM may be capable of using 1GB pages when * the kernel is not. But, KVM never creates a page size greater than * what is used by the kernel for any given HVA, i.e. the kernel's * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust(). */ if (tdp_enabled) - max_page_level = tdp_page_level; + max_huge_page_level = tdp_huge_page_level; else if (boot_cpu_has(X86_FEATURE_GBPAGES)) - max_page_level = PG_LEVEL_1G; + max_huge_page_level = PG_LEVEL_1G; else - max_page_level = PG_LEVEL_2M; + max_huge_page_level = PG_LEVEL_2M; } EXPORT_SYMBOL_GPL(kvm_configure_mmu); @@ -5665,7 +5695,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can * skip allocating the PDP table. */ - if (tdp_enabled && vcpu->arch.tdp_level > PT32E_ROOT_LEVEL) + if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) return 0; page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32); @@ -5684,6 +5714,14 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) uint i; int ret; + vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache; + vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO; + + vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache; + vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO; + + vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO; + vcpu->arch.mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; @@ -5732,12 +5770,11 @@ restart: break; /* - * Skip invalid pages with a non-zero root count, zapping pages - * with a non-zero root count will never succeed, i.e. the page - * will get thrown back on active_mmu_pages and we'll get stuck - * in an infinite loop. + * Invalid pages should never land back on the list of active + * pages. Skip the bogus page, otherwise we'll get stuck in an + * infinite loop if the page gets put back on the list (again). */ - if (sp->role.invalid && sp->root_count) + if (WARN_ON(sp->role.invalid)) continue; /* @@ -5904,7 +5941,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, restart: for_each_rmap_spte(rmap_head, &iter, sptep) { - sp = page_header(__pa(sptep)); + sp = sptep_to_sp(sptep); pfn = spte_to_pfn(*sptep); /* @@ -6015,7 +6052,7 @@ void kvm_mmu_zap_all(struct kvm *kvm) spin_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { - if (sp->role.invalid && sp->root_count) + if (WARN_ON(sp->role.invalid)) continue; if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) goto restart; @@ -6092,9 +6129,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) goto unlock; } - if (prepare_zap_oldest_mmu_page(kvm, &invalid_list)) - freed++; - kvm_mmu_commit_zap_page(kvm, &invalid_list); + freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan); unlock: spin_unlock(&kvm->mmu_lock); diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c new file mode 100644 index 000000000000..c8d51a37e2ce --- /dev/null +++ b/arch/x86/kvm/mmu/mmu_audit.c @@ -0,0 +1,303 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * mmu_audit.c: + * + * Audit code for KVM MMU + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * Authors: + * Yaniv Kamay <yaniv@qumranet.com> + * Avi Kivity <avi@qumranet.com> + * Marcelo Tosatti <mtosatti@redhat.com> + * Xiao Guangrong <xiaoguangrong@cn.fujitsu.com> + */ + +#include <linux/ratelimit.h> + +static char const *audit_point_name[] = { + "pre page fault", + "post page fault", + "pre pte write", + "post pte write", + "pre sync", + "post sync" +}; + +#define audit_printk(kvm, fmt, args...) \ + printk(KERN_ERR "audit: (%s) error: " \ + fmt, audit_point_name[kvm->arch.audit_point], ##args) + +typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level); + +static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + inspect_spte_fn fn, int level) +{ + int i; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + u64 *ent = sp->spt; + + fn(vcpu, ent + i, level); + + if (is_shadow_present_pte(ent[i]) && + !is_last_spte(ent[i], level)) { + struct kvm_mmu_page *child; + + child = to_shadow_page(ent[i] & PT64_BASE_ADDR_MASK); + __mmu_spte_walk(vcpu, child, fn, level - 1); + } + } +} + +static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) +{ + int i; + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) + return; + + if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { + hpa_t root = vcpu->arch.mmu->root_hpa; + + sp = to_shadow_page(root); + __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu->root_level); + return; + } + + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu->pae_root[i]; + + if (root && VALID_PAGE(root)) { + root &= PT64_BASE_ADDR_MASK; + sp = to_shadow_page(root); + __mmu_spte_walk(vcpu, sp, fn, 2); + } + } + + return; +} + +typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp); + +static void walk_all_active_sps(struct kvm *kvm, sp_handler fn) +{ + struct kvm_mmu_page *sp; + + list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) + fn(kvm, sp); +} + +static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) +{ + struct kvm_mmu_page *sp; + gfn_t gfn; + kvm_pfn_t pfn; + hpa_t hpa; + + sp = sptep_to_sp(sptep); + + if (sp->unsync) { + if (level != PG_LEVEL_4K) { + audit_printk(vcpu->kvm, "unsync sp: %p " + "level = %d\n", sp, level); + return; + } + } + + if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level)) + return; + + gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + pfn = kvm_vcpu_gfn_to_pfn_atomic(vcpu, gfn); + + if (is_error_pfn(pfn)) + return; + + hpa = pfn << PAGE_SHIFT; + if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) + audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx " + "ent %llxn", vcpu->arch.mmu->root_level, pfn, + hpa, *sptep); +} + +static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) +{ + static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); + struct kvm_rmap_head *rmap_head; + struct kvm_mmu_page *rev_sp; + struct kvm_memslots *slots; + struct kvm_memory_slot *slot; + gfn_t gfn; + + rev_sp = sptep_to_sp(sptep); + gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); + + slots = kvm_memslots_for_spte_role(kvm, rev_sp->role); + slot = __gfn_to_memslot(slots, gfn); + if (!slot) { + if (!__ratelimit(&ratelimit_state)) + return; + audit_printk(kvm, "no memslot for gfn %llx\n", gfn); + audit_printk(kvm, "index %ld of sp (gfn=%llx)\n", + (long int)(sptep - rev_sp->spt), rev_sp->gfn); + dump_stack(); + return; + } + + rmap_head = __gfn_to_rmap(gfn, rev_sp->role.level, slot); + if (!rmap_head->val) { + if (!__ratelimit(&ratelimit_state)) + return; + audit_printk(kvm, "no rmap for writable spte %llx\n", + *sptep); + dump_stack(); + } +} + +static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level) +{ + if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level)) + inspect_spte_has_rmap(vcpu->kvm, sptep); +} + +static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level) +{ + struct kvm_mmu_page *sp = sptep_to_sp(sptep); + + if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync) + audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync " + "root.\n", sp); +} + +static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + int i; + + if (sp->role.level != PG_LEVEL_4K) + return; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + if (!is_shadow_present_pte(sp->spt[i])) + continue; + + inspect_spte_has_rmap(kvm, sp->spt + i); + } +} + +static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + struct kvm_rmap_head *rmap_head; + u64 *sptep; + struct rmap_iterator iter; + struct kvm_memslots *slots; + struct kvm_memory_slot *slot; + + if (sp->role.direct || sp->unsync || sp->role.invalid) + return; + + slots = kvm_memslots_for_spte_role(kvm, sp->role); + slot = __gfn_to_memslot(slots, sp->gfn); + rmap_head = __gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot); + + for_each_rmap_spte(rmap_head, &iter, sptep) { + if (is_writable_pte(*sptep)) + audit_printk(kvm, "shadow page has writable " + "mappings: gfn %llx role %x\n", + sp->gfn, sp->role.word); + } +} + +static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + check_mappings_rmap(kvm, sp); + audit_write_protection(kvm, sp); +} + +static void audit_all_active_sps(struct kvm *kvm) +{ + walk_all_active_sps(kvm, audit_sp); +} + +static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level) +{ + audit_sptes_have_rmaps(vcpu, sptep, level); + audit_mappings(vcpu, sptep, level); + audit_spte_after_sync(vcpu, sptep, level); +} + +static void audit_vcpu_spte(struct kvm_vcpu *vcpu) +{ + mmu_spte_walk(vcpu, audit_spte); +} + +static bool mmu_audit; +static struct static_key mmu_audit_key; + +static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) +{ + static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); + + if (!__ratelimit(&ratelimit_state)) + return; + + vcpu->kvm->arch.audit_point = point; + audit_all_active_sps(vcpu->kvm); + audit_vcpu_spte(vcpu); +} + +static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) +{ + if (static_key_false((&mmu_audit_key))) + __kvm_mmu_audit(vcpu, point); +} + +static void mmu_audit_enable(void) +{ + if (mmu_audit) + return; + + static_key_slow_inc(&mmu_audit_key); + mmu_audit = true; +} + +static void mmu_audit_disable(void) +{ + if (!mmu_audit) + return; + + static_key_slow_dec(&mmu_audit_key); + mmu_audit = false; +} + +static int mmu_audit_set(const char *val, const struct kernel_param *kp) +{ + int ret; + unsigned long enable; + + ret = kstrtoul(val, 10, &enable); + if (ret < 0) + return -EINVAL; + + switch (enable) { + case 0: + mmu_audit_disable(); + break; + case 1: + mmu_audit_enable(); + break; + default: + return -EINVAL; + } + + return 0; +} + +static const struct kernel_param_ops audit_param_ops = { + .set = mmu_audit_set, + .get = param_get_bool, +}; + +arch_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h new file mode 100644 index 000000000000..3acf3b8eb469 --- /dev/null +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_MMU_INTERNAL_H +#define __KVM_X86_MMU_INTERNAL_H + +#include <linux/types.h> + +#include <asm/kvm_host.h> + +struct kvm_mmu_page { + struct list_head link; + struct hlist_node hash_link; + struct list_head lpage_disallowed_link; + + bool unsync; + u8 mmu_valid_gen; + bool mmio_cached; + bool lpage_disallowed; /* Can't be replaced by an equiv large page */ + + /* + * The following two entries are used to key the shadow page in the + * hash table. + */ + union kvm_mmu_page_role role; + gfn_t gfn; + + u64 *spt; + /* hold the gfn of each spte inside spt */ + gfn_t *gfns; + int root_count; /* Currently serving as active root */ + unsigned int unsync_children; + struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ + DECLARE_BITMAP(unsync_child_bitmap, 512); + +#ifdef CONFIG_X86_32 + /* + * Used out of the mmu-lock to avoid reading spte values while an + * update is in progress; see the comments in __get_spte_lockless(). + */ + int clear_spte_count; +#endif + + /* Number of writes since the last time traversal visited this page. */ + atomic_t write_flooding_count; +}; + +static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page) +{ + struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); + + return (struct kvm_mmu_page *)page_private(page); +} + +static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) +{ + return to_shadow_page(__pa(sptep)); +} + +void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); +void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); +bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, + struct kvm_memory_slot *slot, u64 gfn); + +#endif /* __KVM_X86_MMU_INTERNAL_H */ diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h new file mode 100644 index 000000000000..9d15bc0c535b --- /dev/null +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -0,0 +1,395 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#if !defined(_TRACE_KVMMMU_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVMMMU_H + +#include <linux/tracepoint.h> +#include <linux/trace_events.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvmmmu + +#define KVM_MMU_PAGE_FIELDS \ + __field(__u8, mmu_valid_gen) \ + __field(__u64, gfn) \ + __field(__u32, role) \ + __field(__u32, root_count) \ + __field(bool, unsync) + +#define KVM_MMU_PAGE_ASSIGN(sp) \ + __entry->mmu_valid_gen = sp->mmu_valid_gen; \ + __entry->gfn = sp->gfn; \ + __entry->role = sp->role.word; \ + __entry->root_count = sp->root_count; \ + __entry->unsync = sp->unsync; + +#define KVM_MMU_PAGE_PRINTK() ({ \ + const char *saved_ptr = trace_seq_buffer_ptr(p); \ + static const char *access_str[] = { \ + "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \ + }; \ + union kvm_mmu_page_role role; \ + \ + role.word = __entry->role; \ + \ + trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s" \ + " %snxe %sad root %u %s%c", \ + __entry->mmu_valid_gen, \ + __entry->gfn, role.level, \ + role.gpte_is_8_bytes ? 8 : 4, \ + role.quadrant, \ + role.direct ? " direct" : "", \ + access_str[role.access], \ + role.invalid ? " invalid" : "", \ + role.nxe ? "" : "!", \ + role.ad_disabled ? "!" : "", \ + __entry->root_count, \ + __entry->unsync ? "unsync" : "sync", 0); \ + saved_ptr; \ + }) + +#define kvm_mmu_trace_pferr_flags \ + { PFERR_PRESENT_MASK, "P" }, \ + { PFERR_WRITE_MASK, "W" }, \ + { PFERR_USER_MASK, "U" }, \ + { PFERR_RSVD_MASK, "RSVD" }, \ + { PFERR_FETCH_MASK, "F" } + +/* + * A pagetable walk has started + */ +TRACE_EVENT( + kvm_mmu_pagetable_walk, + TP_PROTO(u64 addr, u32 pferr), + TP_ARGS(addr, pferr), + + TP_STRUCT__entry( + __field(__u64, addr) + __field(__u32, pferr) + ), + + TP_fast_assign( + __entry->addr = addr; + __entry->pferr = pferr; + ), + + TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr, + __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags)) +); + + +/* We just walked a paging element */ +TRACE_EVENT( + kvm_mmu_paging_element, + TP_PROTO(u64 pte, int level), + TP_ARGS(pte, level), + + TP_STRUCT__entry( + __field(__u64, pte) + __field(__u32, level) + ), + + TP_fast_assign( + __entry->pte = pte; + __entry->level = level; + ), + + TP_printk("pte %llx level %u", __entry->pte, __entry->level) +); + +DECLARE_EVENT_CLASS(kvm_mmu_set_bit_class, + + TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), + + TP_ARGS(table_gfn, index, size), + + TP_STRUCT__entry( + __field(__u64, gpa) + ), + + TP_fast_assign( + __entry->gpa = ((u64)table_gfn << PAGE_SHIFT) + + index * size; + ), + + TP_printk("gpa %llx", __entry->gpa) +); + +/* We set a pte accessed bit */ +DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_accessed_bit, + + TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), + + TP_ARGS(table_gfn, index, size) +); + +/* We set a pte dirty bit */ +DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_dirty_bit, + + TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), + + TP_ARGS(table_gfn, index, size) +); + +TRACE_EVENT( + kvm_mmu_walker_error, + TP_PROTO(u32 pferr), + TP_ARGS(pferr), + + TP_STRUCT__entry( + __field(__u32, pferr) + ), + + TP_fast_assign( + __entry->pferr = pferr; + ), + + TP_printk("pferr %x %s", __entry->pferr, + __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags)) +); + +TRACE_EVENT( + kvm_mmu_get_page, + TP_PROTO(struct kvm_mmu_page *sp, bool created), + TP_ARGS(sp, created), + + TP_STRUCT__entry( + KVM_MMU_PAGE_FIELDS + __field(bool, created) + ), + + TP_fast_assign( + KVM_MMU_PAGE_ASSIGN(sp) + __entry->created = created; + ), + + TP_printk("%s %s", KVM_MMU_PAGE_PRINTK(), + __entry->created ? "new" : "existing") +); + +DECLARE_EVENT_CLASS(kvm_mmu_page_class, + + TP_PROTO(struct kvm_mmu_page *sp), + TP_ARGS(sp), + + TP_STRUCT__entry( + KVM_MMU_PAGE_FIELDS + ), + + TP_fast_assign( + KVM_MMU_PAGE_ASSIGN(sp) + ), + + TP_printk("%s", KVM_MMU_PAGE_PRINTK()) +); + +DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_sync_page, + TP_PROTO(struct kvm_mmu_page *sp), + + TP_ARGS(sp) +); + +DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page, + TP_PROTO(struct kvm_mmu_page *sp), + + TP_ARGS(sp) +); + +DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, + TP_PROTO(struct kvm_mmu_page *sp), + + TP_ARGS(sp) +); + +TRACE_EVENT( + mark_mmio_spte, + TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access, unsigned int gen), + TP_ARGS(sptep, gfn, access, gen), + + TP_STRUCT__entry( + __field(void *, sptep) + __field(gfn_t, gfn) + __field(unsigned, access) + __field(unsigned int, gen) + ), + + TP_fast_assign( + __entry->sptep = sptep; + __entry->gfn = gfn; + __entry->access = access; + __entry->gen = gen; + ), + + TP_printk("sptep:%p gfn %llx access %x gen %x", __entry->sptep, + __entry->gfn, __entry->access, __entry->gen) +); + +TRACE_EVENT( + handle_mmio_page_fault, + TP_PROTO(u64 addr, gfn_t gfn, unsigned access), + TP_ARGS(addr, gfn, access), + + TP_STRUCT__entry( + __field(u64, addr) + __field(gfn_t, gfn) + __field(unsigned, access) + ), + + TP_fast_assign( + __entry->addr = addr; + __entry->gfn = gfn; + __entry->access = access; + ), + + TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn, + __entry->access) +); + +#define __spte_satisfied(__spte) \ + (__entry->retry && is_writable_pte(__entry->__spte)) + +TRACE_EVENT( + fast_page_fault, + TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code, + u64 *sptep, u64 old_spte, bool retry), + TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, retry), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(gpa_t, cr2_or_gpa) + __field(u32, error_code) + __field(u64 *, sptep) + __field(u64, old_spte) + __field(u64, new_spte) + __field(bool, retry) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu->vcpu_id; + __entry->cr2_or_gpa = cr2_or_gpa; + __entry->error_code = error_code; + __entry->sptep = sptep; + __entry->old_spte = old_spte; + __entry->new_spte = *sptep; + __entry->retry = retry; + ), + + TP_printk("vcpu %d gva %llx error_code %s sptep %p old %#llx" + " new %llx spurious %d fixed %d", __entry->vcpu_id, + __entry->cr2_or_gpa, __print_flags(__entry->error_code, "|", + kvm_mmu_trace_pferr_flags), __entry->sptep, + __entry->old_spte, __entry->new_spte, + __spte_satisfied(old_spte), __spte_satisfied(new_spte) + ) +); + +TRACE_EVENT( + kvm_mmu_zap_all_fast, + TP_PROTO(struct kvm *kvm), + TP_ARGS(kvm), + + TP_STRUCT__entry( + __field(__u8, mmu_valid_gen) + __field(unsigned int, mmu_used_pages) + ), + + TP_fast_assign( + __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen; + __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages; + ), + + TP_printk("kvm-mmu-valid-gen %u used_pages %x", + __entry->mmu_valid_gen, __entry->mmu_used_pages + ) +); + + +TRACE_EVENT( + check_mmio_spte, + TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen), + TP_ARGS(spte, kvm_gen, spte_gen), + + TP_STRUCT__entry( + __field(unsigned int, kvm_gen) + __field(unsigned int, spte_gen) + __field(u64, spte) + ), + + TP_fast_assign( + __entry->kvm_gen = kvm_gen; + __entry->spte_gen = spte_gen; + __entry->spte = spte; + ), + + TP_printk("spte %llx kvm_gen %x spte-gen %x valid %d", __entry->spte, + __entry->kvm_gen, __entry->spte_gen, + __entry->kvm_gen == __entry->spte_gen + ) +); + +TRACE_EVENT( + kvm_mmu_set_spte, + TP_PROTO(int level, gfn_t gfn, u64 *sptep), + TP_ARGS(level, gfn, sptep), + + TP_STRUCT__entry( + __field(u64, gfn) + __field(u64, spte) + __field(u64, sptep) + __field(u8, level) + /* These depend on page entry type, so compute them now. */ + __field(bool, r) + __field(bool, x) + __field(signed char, u) + ), + + TP_fast_assign( + __entry->gfn = gfn; + __entry->spte = *sptep; + __entry->sptep = virt_to_phys(sptep); + __entry->level = level; + __entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK); + __entry->x = is_executable_pte(__entry->spte); + __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1; + ), + + TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx", + __entry->gfn, __entry->spte, + __entry->r ? "r" : "-", + __entry->spte & PT_WRITABLE_MASK ? "w" : "-", + __entry->x ? "x" : "-", + __entry->u == -1 ? "" : (__entry->u ? "u" : "-"), + __entry->level, __entry->sptep + ) +); + +TRACE_EVENT( + kvm_mmu_spte_requested, + TP_PROTO(gpa_t addr, int level, kvm_pfn_t pfn), + TP_ARGS(addr, level, pfn), + + TP_STRUCT__entry( + __field(u64, gfn) + __field(u64, pfn) + __field(u8, level) + ), + + TP_fast_assign( + __entry->gfn = addr >> PAGE_SHIFT; + __entry->pfn = pfn | (__entry->gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); + __entry->level = level; + ), + + TP_printk("gfn %llx pfn %llx level %d", + __entry->gfn, __entry->pfn, __entry->level + ) +); + +#endif /* _TRACE_KVMMMU_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH mmu +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE mmutrace + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index a7bcde34d1f2..a84a141a2ad2 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -16,7 +16,7 @@ #include <asm/kvm_page_track.h> -#include "mmu.h" +#include "mmu_internal.h" void kvm_page_track_free_memslot(struct kvm_memory_slot *slot) { diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 275564a0ebdb..4dd6b1e5b8cf 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -260,7 +260,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, !(pte & PT_GUEST_DIRTY_MASK)) { trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); #if PTTYPE == PTTYPE_EPT - if (kvm_arch_write_log_dirty(vcpu, addr)) + if (kvm_x86_ops.nested_ops->write_log_dirty(vcpu, addr)) return -EINVAL; #endif pte |= PT_GUEST_DIRTY_MASK; @@ -596,7 +596,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, u64 *spte; int i; - sp = page_header(__pa(sptep)); + sp = sptep_to_sp(sptep); if (sp->role.level > PG_LEVEL_4K) return; @@ -789,10 +789,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); - r = mmu_topup_memory_caches(vcpu); - if (r) - return r; - /* * If PFEC.RSVD is set, this is a shadow page fault. * The bit needs to be cleared before walking guest page tables. @@ -820,6 +816,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, return RET_PF_EMULATE; } + r = mmu_topup_memory_caches(vcpu, true); + if (r) + return r; + vcpu->arch.write_fault_to_shadow_pgtable = false; is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, @@ -866,7 +866,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, goto out_unlock; kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); - if (make_mmu_pages_available(vcpu) < 0) + r = make_mmu_pages_available(vcpu); + if (r) goto out_unlock; r = FNAME(fetch)(vcpu, addr, &walker, write_fault, max_level, pfn, map_writable, prefault, lpage_disallowed); @@ -903,7 +904,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) * No need to check return value here, rmap_can_add() can * help us to skip pte prefetch later. */ - mmu_topup_memory_caches(vcpu); + mmu_topup_memory_caches(vcpu, true); if (!VALID_PAGE(root_hpa)) { WARN_ON(1); @@ -915,7 +916,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) level = iterator.level; sptep = iterator.sptep; - sp = page_header(__pa(sptep)); + sp = sptep_to_sp(sptep); if (is_last_spte(*sptep, level)) { pt_element_t gpte; gpa_t pte_gpa; |