diff options
Diffstat (limited to 'arch/x86/kvm/mmu/mmu.c')
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 176 |
1 files changed, 125 insertions, 51 deletions
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index b6f96d47e596..835426254e76 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -22,6 +22,7 @@ #include "tdp_mmu.h" #include "x86.h" #include "kvm_cache_regs.h" +#include "smm.h" #include "kvm_emulate.h" #include "cpuid.h" #include "spte.h" @@ -802,15 +803,31 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); } -void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - if (sp->lpage_disallowed) + /* + * If it's possible to replace the shadow page with an NX huge page, + * i.e. if the shadow page is the only thing currently preventing KVM + * from using a huge page, add the shadow page to the list of "to be + * zapped for NX recovery" pages. Note, the shadow page can already be + * on the list if KVM is reusing an existing shadow page, i.e. if KVM + * links a shadow page at multiple points. + */ + if (!list_empty(&sp->possible_nx_huge_page_link)) return; ++kvm->stat.nx_lpage_splits; - list_add_tail(&sp->lpage_disallowed_link, - &kvm->arch.lpage_disallowed_mmu_pages); - sp->lpage_disallowed = true; + list_add_tail(&sp->possible_nx_huge_page_link, + &kvm->arch.possible_nx_huge_pages); +} + +static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, + bool nx_huge_page_possible) +{ + sp->nx_huge_page_disallowed = true; + + if (nx_huge_page_possible) + track_possible_nx_huge_page(kvm, sp); } static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -830,11 +847,20 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_allow_lpage(slot, gfn); } -void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) { + if (list_empty(&sp->possible_nx_huge_page_link)) + return; + --kvm->stat.nx_lpage_splits; - sp->lpage_disallowed = false; - list_del(&sp->lpage_disallowed_link); + list_del_init(&sp->possible_nx_huge_page_link); +} + +static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + sp->nx_huge_page_disallowed = false; + + untrack_possible_nx_huge_page(kvm, sp); } static struct kvm_memory_slot * @@ -1645,7 +1671,7 @@ static int is_empty_shadow_page(u64 *spt) u64 *pos; u64 *end; - for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) + for (pos = spt, end = pos + SPTE_ENT_PER_PAGE; pos != end; pos++) if (is_shadow_present_pte(*pos)) { printk(KERN_ERR "%s: %p %llx\n", __func__, pos, *pos); @@ -1793,7 +1819,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, continue; } - child = to_shadow_page(ent & SPTE_BASE_ADDR_MASK); + child = spte_to_child_sp(ent); if (child->unsync_children) { if (mmu_pages_add(pvec, child, i)) @@ -1894,7 +1920,7 @@ static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) if (sp->role.invalid) return true; - /* TDP MMU pages due not use the MMU generation. */ + /* TDP MMU pages do not use the MMU generation. */ return !sp->tdp_mmu_page && unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); } @@ -2129,6 +2155,8 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm, set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + INIT_LIST_HEAD(&sp->possible_nx_huge_page_link); + /* * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() * depends on valid pages being added to the head of the list. See @@ -2350,7 +2378,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, * so we should update the spte at this point to get * a new sp with the correct access. */ - child = to_shadow_page(*sptep & SPTE_BASE_ADDR_MASK); + child = spte_to_child_sp(*sptep); if (child->role.access == direct_access) return; @@ -2371,7 +2399,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, if (is_last_spte(pte, sp->role.level)) { drop_spte(kvm, spte); } else { - child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK); + child = spte_to_child_sp(pte); drop_parent_pte(child, spte); /* @@ -2487,8 +2515,8 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, zapped_root = !is_obsolete_sp(kvm, sp); } - if (sp->lpage_disallowed) - unaccount_huge_nx_page(kvm, sp); + if (sp->nx_huge_page_disallowed) + unaccount_nx_huge_page(kvm, sp); sp->role.invalid = 1; @@ -2811,7 +2839,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, struct kvm_mmu_page *child; u64 pte = *sptep; - child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK); + child = spte_to_child_sp(pte); drop_parent_pte(child, sptep); flush = true; } else if (pfn != spte_to_pfn(*sptep)) { @@ -3085,7 +3113,8 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_ if (cur_level > PG_LEVEL_4K && cur_level == fault->goal_level && is_shadow_present_pte(spte) && - !is_large_pte(spte)) { + !is_large_pte(spte) && + spte_to_child_sp(spte)->nx_huge_page_disallowed) { /* * A small SPTE exists for this pfn, but FNAME(fetch) * and __direct_map would like to create a large PTE @@ -3127,9 +3156,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) continue; link_shadow_page(vcpu, it.sptep, sp); - if (fault->is_tdp && fault->huge_page_disallowed && - fault->req_level >= it.level) - account_huge_nx_page(vcpu->kvm, sp); + if (fault->huge_page_disallowed) + account_nx_huge_page(vcpu->kvm, sp, + fault->req_level >= it.level); } if (WARN_ON_ONCE(it.level != fault->goal_level)) @@ -3149,8 +3178,13 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct * send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk); } -static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) +static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) { + if (is_sigpending_pfn(pfn)) { + kvm_handle_signal_exit(vcpu); + return -EINTR; + } + /* * Do not cache the mmio info caused by writing the readonly gfn * into the spte otherwise read access on readonly gfn also can @@ -3172,7 +3206,7 @@ static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fau { /* The pfn is invalid, report the error! */ if (unlikely(is_error_pfn(fault->pfn))) - return kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn); + return kvm_handle_error_pfn(vcpu, fault->gfn, fault->pfn); if (unlikely(!fault->slot)) { gva_t gva = fault->is_tdp ? 0 : fault->addr; @@ -3423,7 +3457,11 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, if (!VALID_PAGE(*root_hpa)) return; - sp = to_shadow_page(*root_hpa & SPTE_BASE_ADDR_MASK); + /* + * The "root" may be a special root, e.g. a PAE entry, treat it as a + * SPTE to ensure any non-PA bits are dropped. + */ + sp = spte_to_child_sp(*root_hpa); if (WARN_ON(!sp)) return; @@ -3908,8 +3946,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) hpa_t root = vcpu->arch.mmu->pae_root[i]; if (IS_VALID_PAE_ROOT(root)) { - root &= SPTE_BASE_ADDR_MASK; - sp = to_shadow_page(root); + sp = spte_to_child_sp(root); mmu_sync_children(vcpu, sp, true); } } @@ -4170,7 +4207,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) } async = false; - fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async, + fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async, fault->write, &fault->map_writable, &fault->hva); if (!async) @@ -4187,7 +4224,12 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) } } - fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL, + /* + * Allow gup to bail on pending non-fatal signals when it's also allowed + * to wait for IO. Note, gup always bails if it is unable to quickly + * get a page and a fatal signal, i.e. SIGKILL, is pending. + */ + fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL, fault->write, &fault->map_writable, &fault->hva); return RET_PF_CONTINUE; @@ -5972,7 +6014,7 @@ int kvm_mmu_init_vm(struct kvm *kvm) INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); - INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages); spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); r = kvm_mmu_init_tdp_mmu(kvm); @@ -6657,7 +6699,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) kvm_mmu_zap_all_fast(kvm); mutex_unlock(&kvm->slots_lock); - wake_up_process(kvm->arch.nx_lpage_recovery_thread); + wake_up_process(kvm->arch.nx_huge_page_recovery_thread); } mutex_unlock(&kvm_lock); } @@ -6789,7 +6831,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) - wake_up_process(kvm->arch.nx_lpage_recovery_thread); + wake_up_process(kvm->arch.nx_huge_page_recovery_thread); mutex_unlock(&kvm_lock); } @@ -6797,9 +6839,10 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel return err; } -static void kvm_recover_nx_lpages(struct kvm *kvm) +static void kvm_recover_nx_huge_pages(struct kvm *kvm) { unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits; + struct kvm_memory_slot *slot; int rcu_idx; struct kvm_mmu_page *sp; unsigned int ratio; @@ -6820,24 +6863,55 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) ratio = READ_ONCE(nx_huge_pages_recovery_ratio); to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0; for ( ; to_zap; --to_zap) { - if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) + if (list_empty(&kvm->arch.possible_nx_huge_pages)) break; /* * We use a separate list instead of just using active_mmu_pages - * because the number of lpage_disallowed pages is expected to - * be relatively small compared to the total. + * because the number of shadow pages that be replaced with an + * NX huge page is expected to be relatively small compared to + * the total number of shadow pages. And because the TDP MMU + * doesn't use active_mmu_pages. */ - sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages, + sp = list_first_entry(&kvm->arch.possible_nx_huge_pages, struct kvm_mmu_page, - lpage_disallowed_link); - WARN_ON_ONCE(!sp->lpage_disallowed); - if (is_tdp_mmu_page(sp)) { + possible_nx_huge_page_link); + WARN_ON_ONCE(!sp->nx_huge_page_disallowed); + WARN_ON_ONCE(!sp->role.direct); + + /* + * Unaccount and do not attempt to recover any NX Huge Pages + * that are being dirty tracked, as they would just be faulted + * back in as 4KiB pages. The NX Huge Pages in this slot will be + * recovered, along with all the other huge pages in the slot, + * when dirty logging is disabled. + * + * Since gfn_to_memslot() is relatively expensive, it helps to + * skip it if it the test cannot possibly return true. On the + * other hand, if any memslot has logging enabled, chances are + * good that all of them do, in which case unaccount_nx_huge_page() + * is much cheaper than zapping the page. + * + * If a memslot update is in progress, reading an incorrect value + * of kvm->nr_memslots_dirty_logging is not a problem: if it is + * becoming zero, gfn_to_memslot() will be done unnecessarily; if + * it is becoming nonzero, the page will be zapped unnecessarily. + * Either way, this only affects efficiency in racy situations, + * and not correctness. + */ + slot = NULL; + if (atomic_read(&kvm->nr_memslots_dirty_logging)) { + slot = gfn_to_memslot(kvm, sp->gfn); + WARN_ON_ONCE(!slot); + } + + if (slot && kvm_slot_dirty_track_enabled(slot)) + unaccount_nx_huge_page(kvm, sp); + else if (is_tdp_mmu_page(sp)) flush |= kvm_tdp_mmu_zap_sp(kvm, sp); - } else { + else kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - WARN_ON_ONCE(sp->lpage_disallowed); - } + WARN_ON_ONCE(sp->nx_huge_page_disallowed); if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); @@ -6857,7 +6931,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) srcu_read_unlock(&kvm->srcu, rcu_idx); } -static long get_nx_lpage_recovery_timeout(u64 start_time) +static long get_nx_huge_page_recovery_timeout(u64 start_time) { bool enabled; uint period; @@ -6868,19 +6942,19 @@ static long get_nx_lpage_recovery_timeout(u64 start_time) : MAX_SCHEDULE_TIMEOUT; } -static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data) +static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data) { u64 start_time; long remaining_time; while (true) { start_time = get_jiffies_64(); - remaining_time = get_nx_lpage_recovery_timeout(start_time); + remaining_time = get_nx_huge_page_recovery_timeout(start_time); set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop() && remaining_time > 0) { schedule_timeout(remaining_time); - remaining_time = get_nx_lpage_recovery_timeout(start_time); + remaining_time = get_nx_huge_page_recovery_timeout(start_time); set_current_state(TASK_INTERRUPTIBLE); } @@ -6889,7 +6963,7 @@ static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data) if (kthread_should_stop()) return 0; - kvm_recover_nx_lpages(kvm); + kvm_recover_nx_huge_pages(kvm); } } @@ -6897,17 +6971,17 @@ int kvm_mmu_post_init_vm(struct kvm *kvm) { int err; - err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0, + err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0, "kvm-nx-lpage-recovery", - &kvm->arch.nx_lpage_recovery_thread); + &kvm->arch.nx_huge_page_recovery_thread); if (!err) - kthread_unpark(kvm->arch.nx_lpage_recovery_thread); + kthread_unpark(kvm->arch.nx_huge_page_recovery_thread); return err; } void kvm_mmu_pre_destroy_vm(struct kvm *kvm) { - if (kvm->arch.nx_lpage_recovery_thread) - kthread_stop(kvm->arch.nx_lpage_recovery_thread); + if (kvm->arch.nx_huge_page_recovery_thread) + kthread_stop(kvm->arch.nx_huge_page_recovery_thread); } |