diff options
Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r-- | arch/x86/kvm/mmu.c | 351 |
1 files changed, 287 insertions, 64 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f83fc6c5e0ba..cee759299a35 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -63,30 +63,16 @@ enum { #undef MMU_DEBUG #ifdef MMU_DEBUG +static bool dbg = 0; +module_param(dbg, bool, 0644); #define pgprintk(x...) do { if (dbg) printk(x); } while (0) #define rmap_printk(x...) do { if (dbg) printk(x); } while (0) - +#define MMU_WARN_ON(x) WARN_ON(x) #else - #define pgprintk(x...) do { } while (0) #define rmap_printk(x...) do { } while (0) - -#endif - -#ifdef MMU_DEBUG -static bool dbg = 0; -module_param(dbg, bool, 0644); -#endif - -#ifndef MMU_DEBUG -#define ASSERT(x) do { } while (0) -#else -#define ASSERT(x) \ - if (!(x)) { \ - printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ - __FILE__, __LINE__, #x); \ - } +#define MMU_WARN_ON(x) do { } while (0) #endif #define PTE_PREFETCH_NUM 8 @@ -546,6 +532,11 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) return (old_spte & bit_mask) && !(new_spte & bit_mask); } +static bool spte_is_bit_changed(u64 old_spte, u64 new_spte, u64 bit_mask) +{ + return (old_spte & bit_mask) != (new_spte & bit_mask); +} + /* Rules for using mmu_spte_set: * Set the sptep from nonpresent to present. * Note: the sptep being assigned *must* be either not present @@ -596,6 +587,14 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) if (!shadow_accessed_mask) return ret; + /* + * Flush TLB when accessed/dirty bits are changed in the page tables, + * to guarantee consistency between TLB and page tables. + */ + if (spte_is_bit_changed(old_spte, new_spte, + shadow_accessed_mask | shadow_dirty_mask)) + ret = true; + if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) kvm_set_pfn_accessed(spte_to_pfn(old_spte)); if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) @@ -1216,6 +1215,60 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, return flush; } +static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep) +{ + u64 spte = *sptep; + + rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep); + + spte &= ~shadow_dirty_mask; + + return mmu_spte_update(sptep, spte); +} + +static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp) +{ + u64 *sptep; + struct rmap_iterator iter; + bool flush = false; + + for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { + BUG_ON(!(*sptep & PT_PRESENT_MASK)); + + flush |= spte_clear_dirty(kvm, sptep); + sptep = rmap_get_next(&iter); + } + + return flush; +} + +static bool spte_set_dirty(struct kvm *kvm, u64 *sptep) +{ + u64 spte = *sptep; + + rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep); + + spte |= shadow_dirty_mask; + + return mmu_spte_update(sptep, spte); +} + +static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp) +{ + u64 *sptep; + struct rmap_iterator iter; + bool flush = false; + + for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { + BUG_ON(!(*sptep & PT_PRESENT_MASK)); + + flush |= spte_set_dirty(kvm, sptep); + sptep = rmap_get_next(&iter); + } + + return flush; +} + /** * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages * @kvm: kvm instance @@ -1226,7 +1279,7 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, * Used when we do not need to care about huge page mappings: e.g. during dirty * logging we do not have any such mappings. */ -void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, +static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { @@ -1242,6 +1295,53 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, } } +/** + * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages + * @kvm: kvm instance + * @slot: slot to clear D-bit + * @gfn_offset: start of the BITS_PER_LONG pages we care about + * @mask: indicates which pages we should clear D-bit + * + * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap. + */ +void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + unsigned long *rmapp; + + while (mask) { + rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), + PT_PAGE_TABLE_LEVEL, slot); + __rmap_clear_dirty(kvm, rmapp); + + /* clear the first set bit */ + mask &= mask - 1; + } +} +EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked); + +/** + * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected + * PT level pages. + * + * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to + * enable dirty logging for them. + * + * Used when we do not need to care about huge page mappings: e.g. during dirty + * logging we do not have any such mappings. + */ +void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + if (kvm_x86_ops->enable_log_dirty_pt_masked) + kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset, + mask); + else + kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); +} + static bool rmap_write_protect(struct kvm *kvm, u64 gfn) { struct kvm_memory_slot *slot; @@ -1536,7 +1636,7 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) static void kvm_mmu_free_page(struct kvm_mmu_page *sp) { - ASSERT(is_empty_shadow_page(sp->spt)); + MMU_WARN_ON(!is_empty_shadow_page(sp->spt)); hlist_del(&sp->hash_link); list_del(&sp->link); free_page((unsigned long)sp->spt); @@ -2501,8 +2601,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } } - if (pte_access & ACC_WRITE_MASK) + if (pte_access & ACC_WRITE_MASK) { mark_page_dirty(vcpu->kvm, gfn); + spte |= shadow_dirty_mask; + } set_pte: if (mmu_spte_update(sptep, spte)) @@ -2818,6 +2920,18 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, */ gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + /* + * Theoretically we could also set dirty bit (and flush TLB) here in + * order to eliminate unnecessary PML logging. See comments in + * set_spte. But fast_page_fault is very unlikely to happen with PML + * enabled, so we do not do this. This might result in the same GPA + * to be logged in PML buffer again when the write really happens, and + * eventually to be called by mark_page_dirty twice. But it's also no + * harm. This also avoids the TLB flush needed after setting dirty bit + * so non-PML cases won't be impacted. + * + * Compare with set_spte where instead shadow_dirty_mask is set. + */ if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) mark_page_dirty(vcpu->kvm, gfn); @@ -3041,7 +3155,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; - ASSERT(!VALID_PAGE(root)); + MMU_WARN_ON(VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); make_mmu_pages_available(vcpu); sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), @@ -3079,7 +3193,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; - ASSERT(!VALID_PAGE(root)); + MMU_WARN_ON(VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); make_mmu_pages_available(vcpu); @@ -3104,7 +3218,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; - ASSERT(!VALID_PAGE(root)); + MMU_WARN_ON(VALID_PAGE(root)); if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i); if (!is_present_gpte(pdptr)) { @@ -3329,8 +3443,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, if (r) return r; - ASSERT(vcpu); - ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); gfn = gva >> PAGE_SHIFT; @@ -3396,8 +3509,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, int write = error_code & PFERR_WRITE_MASK; bool map_writable; - ASSERT(vcpu); - ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (unlikely(error_code & PFERR_RSVD_MASK)) { r = handle_mmio_page_fault(vcpu, gpa, error_code, true); @@ -3718,7 +3830,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); - ASSERT(is_pae(vcpu)); + MMU_WARN_ON(!is_pae(vcpu)); context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; context->sync_page = paging64_sync_page; @@ -3763,7 +3875,7 @@ static void paging32E_init_context(struct kvm_vcpu *vcpu, static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { - struct kvm_mmu *context = vcpu->arch.walk_mmu; + struct kvm_mmu *context = &vcpu->arch.mmu; context->base_role.word = 0; context->page_fault = tdp_page_fault; @@ -3803,11 +3915,12 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) update_last_pte_bitmap(vcpu, context); } -void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) +void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) { bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + struct kvm_mmu *context = &vcpu->arch.mmu; + + MMU_WARN_ON(VALID_PAGE(context->root_hpa)); if (!is_paging(vcpu)) nonpaging_init_context(vcpu, context); @@ -3818,19 +3931,19 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) else paging32_init_context(vcpu, context); - vcpu->arch.mmu.base_role.nxe = is_nx(vcpu); - vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); - vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); - vcpu->arch.mmu.base_role.smep_andnot_wp + context->base_role.nxe = is_nx(vcpu); + context->base_role.cr4_pae = !!is_pae(vcpu); + context->base_role.cr0_wp = is_write_protection(vcpu); + context->base_role.smep_andnot_wp = smep && !is_write_protection(vcpu); } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); -void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, - bool execonly) +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) { - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + struct kvm_mmu *context = &vcpu->arch.mmu; + + MMU_WARN_ON(VALID_PAGE(context->root_hpa)); context->shadow_root_level = kvm_x86_ops->get_tdp_level(); @@ -3851,11 +3964,13 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); static void init_kvm_softmmu(struct kvm_vcpu *vcpu) { - kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); - vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; - vcpu->arch.walk_mmu->get_cr3 = get_cr3; - vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read; - vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; + struct kvm_mmu *context = &vcpu->arch.mmu; + + kvm_init_shadow_mmu(vcpu); + context->set_cr3 = kvm_x86_ops->set_cr3; + context->get_cr3 = get_cr3; + context->get_pdptr = kvm_pdptr_read; + context->inject_page_fault = kvm_inject_page_fault; } static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) @@ -3900,17 +4015,15 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) static void init_kvm_mmu(struct kvm_vcpu *vcpu) { if (mmu_is_nested(vcpu)) - return init_kvm_nested_mmu(vcpu); + init_kvm_nested_mmu(vcpu); else if (tdp_enabled) - return init_kvm_tdp_mmu(vcpu); + init_kvm_tdp_mmu(vcpu); else - return init_kvm_softmmu(vcpu); + init_kvm_softmmu(vcpu); } void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { - ASSERT(vcpu); - kvm_mmu_unload(vcpu); init_kvm_mmu(vcpu); } @@ -4266,8 +4379,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) struct page *page; int i; - ASSERT(vcpu); - /* * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. * Therefore we need to allocate shadow page tables in the first @@ -4286,8 +4397,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) int kvm_mmu_create(struct kvm_vcpu *vcpu) { - ASSERT(vcpu); - vcpu->arch.walk_mmu = &vcpu->arch.mmu; vcpu->arch.mmu.root_hpa = INVALID_PAGE; vcpu->arch.mmu.translate_gpa = translate_gpa; @@ -4298,19 +4407,18 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) void kvm_mmu_setup(struct kvm_vcpu *vcpu) { - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); init_kvm_mmu(vcpu); } -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, + struct kvm_memory_slot *memslot) { - struct kvm_memory_slot *memslot; gfn_t last_gfn; int i; + bool flush = false; - memslot = id_to_memslot(kvm->memslots, slot); last_gfn = memslot->base_gfn + memslot->npages - 1; spin_lock(&kvm->mmu_lock); @@ -4325,7 +4433,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) for (index = 0; index <= last_index; ++index, ++rmapp) { if (*rmapp) - __rmap_write_protect(kvm, rmapp, false); + flush |= __rmap_write_protect(kvm, rmapp, + false); if (need_resched() || spin_needbreak(&kvm->mmu_lock)) cond_resched_lock(&kvm->mmu_lock); @@ -4352,8 +4461,124 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) * instead of PT_WRITABLE_MASK, that means it does not depend * on PT_WRITABLE_MASK anymore. */ - kvm_flush_remote_tlbs(kvm); + if (flush) + kvm_flush_remote_tlbs(kvm); +} + +void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + gfn_t last_gfn; + unsigned long *rmapp; + unsigned long last_index, index; + bool flush = false; + + last_gfn = memslot->base_gfn + memslot->npages - 1; + + spin_lock(&kvm->mmu_lock); + + rmapp = memslot->arch.rmap[PT_PAGE_TABLE_LEVEL - 1]; + last_index = gfn_to_index(last_gfn, memslot->base_gfn, + PT_PAGE_TABLE_LEVEL); + + for (index = 0; index <= last_index; ++index, ++rmapp) { + if (*rmapp) + flush |= __rmap_clear_dirty(kvm, rmapp); + + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) + cond_resched_lock(&kvm->mmu_lock); + } + + spin_unlock(&kvm->mmu_lock); + + lockdep_assert_held(&kvm->slots_lock); + + /* + * It's also safe to flush TLBs out of mmu lock here as currently this + * function is only used for dirty logging, in which case flushing TLB + * out of mmu lock also guarantees no dirty pages will be lost in + * dirty_bitmap. + */ + if (flush) + kvm_flush_remote_tlbs(kvm); +} +EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty); + +void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + gfn_t last_gfn; + int i; + bool flush = false; + + last_gfn = memslot->base_gfn + memslot->npages - 1; + + spin_lock(&kvm->mmu_lock); + + for (i = PT_PAGE_TABLE_LEVEL + 1; /* skip rmap for 4K page */ + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + unsigned long *rmapp; + unsigned long last_index, index; + + rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; + last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); + + for (index = 0; index <= last_index; ++index, ++rmapp) { + if (*rmapp) + flush |= __rmap_write_protect(kvm, rmapp, + false); + + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) + cond_resched_lock(&kvm->mmu_lock); + } + } + spin_unlock(&kvm->mmu_lock); + + /* see kvm_mmu_slot_remove_write_access */ + lockdep_assert_held(&kvm->slots_lock); + + if (flush) + kvm_flush_remote_tlbs(kvm); +} +EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access); + +void kvm_mmu_slot_set_dirty(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + gfn_t last_gfn; + int i; + bool flush = false; + + last_gfn = memslot->base_gfn + memslot->npages - 1; + + spin_lock(&kvm->mmu_lock); + + for (i = PT_PAGE_TABLE_LEVEL; + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + unsigned long *rmapp; + unsigned long last_index, index; + + rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; + last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); + + for (index = 0; index <= last_index; ++index, ++rmapp) { + if (*rmapp) + flush |= __rmap_set_dirty(kvm, rmapp); + + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) + cond_resched_lock(&kvm->mmu_lock); + } + } + + spin_unlock(&kvm->mmu_lock); + + lockdep_assert_held(&kvm->slots_lock); + + /* see kvm_mmu_slot_leaf_clear_dirty */ + if (flush) + kvm_flush_remote_tlbs(kvm); } +EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); #define BATCH_ZAP_PAGES 10 static void kvm_zap_obsolete_pages(struct kvm *kvm) @@ -4606,8 +4831,6 @@ EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { - ASSERT(vcpu); - kvm_mmu_unload(vcpu); free_mmu_pages(vcpu); mmu_free_memory_caches(vcpu); |