diff options
Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r-- | arch/x86/kvm/mmu.c | 84 |
1 files changed, 55 insertions, 29 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 813d31038b93..931467881da7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -22,6 +22,7 @@ #include "mmu.h" #include "x86.h" #include "kvm_cache_regs.h" +#include "cpuid.h" #include <linux/kvm_host.h> #include <linux/types.h> @@ -595,7 +596,8 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) * we always atomicly update it, see the comments in * spte_has_volatile_bits(). */ - if (is_writable_pte(old_spte) && !is_writable_pte(new_spte)) + if (spte_is_locklessly_modifiable(old_spte) && + !is_writable_pte(new_spte)) ret = true; if (!shadow_accessed_mask) @@ -1176,8 +1178,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) /* * Write-protect on the specified @sptep, @pt_protect indicates whether - * spte writ-protection is caused by protecting shadow page table. - * @flush indicates whether tlb need be flushed. + * spte write-protection is caused by protecting shadow page table. * * Note: write protection is difference between drity logging and spte * protection: @@ -1186,10 +1187,9 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) * - for spte protection, the spte can be writable only after unsync-ing * shadow page. * - * Return true if the spte is dropped. + * Return true if tlb need be flushed. */ -static bool -spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect) +static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect) { u64 spte = *sptep; @@ -1199,17 +1199,11 @@ spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect) rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); - if (__drop_large_spte(kvm, sptep)) { - *flush |= true; - return true; - } - if (pt_protect) spte &= ~SPTE_MMU_WRITEABLE; spte = spte & ~PT_WRITABLE_MASK; - *flush |= mmu_spte_update(sptep, spte); - return false; + return mmu_spte_update(sptep, spte); } static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, @@ -1221,11 +1215,8 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { BUG_ON(!(*sptep & PT_PRESENT_MASK)); - if (spte_write_protect(kvm, sptep, &flush, pt_protect)) { - sptep = rmap_get_first(*rmapp, &iter); - continue; - } + flush |= spte_write_protect(kvm, sptep, pt_protect); sptep = rmap_get_next(&iter); } @@ -2802,9 +2793,9 @@ static bool page_fault_can_be_fast(u32 error_code) } static bool -fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte) +fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + u64 *sptep, u64 spte) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); gfn_t gfn; WARN_ON(!sp->role.direct); @@ -2830,6 +2821,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, u32 error_code) { struct kvm_shadow_walk_iterator iterator; + struct kvm_mmu_page *sp; bool ret = false; u64 spte = 0ull; @@ -2853,7 +2845,8 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, goto exit; } - if (!is_last_spte(spte, level)) + sp = page_header(__pa(iterator.sptep)); + if (!is_last_spte(spte, sp->role.level)) goto exit; /* @@ -2875,11 +2868,24 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, goto exit; /* + * Do not fix write-permission on the large spte since we only dirty + * the first page into the dirty-bitmap in fast_pf_fix_direct_spte() + * that means other pages are missed if its slot is dirty-logged. + * + * Instead, we let the slow page fault path create a normal spte to + * fix the access. + * + * See the comments in kvm_arch_commit_memory_region(). + */ + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + goto exit; + + /* * Currently, fast page fault only works for direct mapping since * the gfn is not stable for indirect shadow page. * See Documentation/virtual/kvm/locking.txt to get more detail. */ - ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte); + ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte); exit: trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, spte, ret); @@ -3511,11 +3517,14 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, { int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; + u64 gbpages_bit_rsvd = 0; context->bad_mt_xwr = 0; if (!context->nx) exb_bit_rsvd = rsvd_bits(63, 63); + if (!guest_cpuid_has_gbpages(vcpu)) + gbpages_bit_rsvd = rsvd_bits(7, 7); switch (context->root_level) { case PT32_ROOT_LEVEL: /* no rsvd bits for 2 level 4K page table entries */ @@ -3538,7 +3547,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, case PT32E_ROOT_LEVEL: context->rsvd_bits_mask[0][2] = rsvd_bits(maxphyaddr, 63) | - rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */ + rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */ context->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 62); /* PDE */ context->rsvd_bits_mask[0][0] = exb_bit_rsvd | @@ -3550,16 +3559,16 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, break; case PT64_ROOT_LEVEL: context->rsvd_bits_mask[0][3] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); + rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 7); context->rsvd_bits_mask[0][2] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); + gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[0][0] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; context->rsvd_bits_mask[1][2] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51) | + gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 29); context->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51) | @@ -4304,15 +4313,32 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) if (*rmapp) __rmap_write_protect(kvm, rmapp, false); - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { - kvm_flush_remote_tlbs(kvm); + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) cond_resched_lock(&kvm->mmu_lock); - } } } - kvm_flush_remote_tlbs(kvm); spin_unlock(&kvm->mmu_lock); + + /* + * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log() + * which do tlb flush out of mmu-lock should be serialized by + * kvm->slots_lock otherwise tlb flush would be missed. + */ + lockdep_assert_held(&kvm->slots_lock); + + /* + * We can flush all the TLBs out of the mmu lock without TLB + * corruption since we just change the spte from writable to + * readonly so that we only need to care the case of changing + * spte from present to present (changing the spte from present + * to nonpresent will flush all the TLBs immediately), in other + * words, the only case we care is mmu_spte_update() where we + * haved checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE + * instead of PT_WRITABLE_MASK, that means it does not depend + * on PT_WRITABLE_MASK anymore. + */ + kvm_flush_remote_tlbs(kvm); } #define BATCH_ZAP_PAGES 10 |